Импорт XML-файла, содержащего несколько таблиц в R dataframe

#r #xml #dataframe #tidyverse #xml2

#r #xml #dataframe #tidyverse #xml2

Вопрос:

У меня есть XML-файл, содержащий больше таблиц, и я хочу импортировать его в свой dataframe.

Я использую следующий код, у меня есть только первая таблица в my datafame, а все остальные таблицы имеют все значения в одном столбце. Я не могу поместить все узлы в отдельные столбцы. Узел Instrument_Ratings значение находится в одном столбце в моем dataframe

Есть способ?

 doc <- xmlParse("file.xml")
df <- xmlToDataFrame(doc, stringsAsFactors = FALSE)
 
 <Instrument_Roots xmlns="http://www.moodys.com/fdp" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" file_type="Delta" frequency="24Hour" generation_time="2020-02-18T12:00:00Z" batch_number="000018" batch_date="2020-02-18Z">
<Instrument_Root action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Deal_number indicator="x" xsi:nil="true"/>
<Class_Code indicator="x">834</Class_Code>
<Class_Text indicator="x">Regular Bond/Debenture</Class_Text>
<Class_Short_Description indicator="x">REG</Class_Short_Description>
<Dated_Date indicator="x" xsi:nil="true"/>
<ISO_Currency_Code indicator="x">USD</ISO_Currency_Code>
<Currency_Multiple_Indicator indicator="x">N</Currency_Multiple_Indicator>
<Maturity_Date indicator="x">2021-02-09T00:00:00</Maturity_Date>
<Maturity_Year indicator="x">2021</Maturity_Year>
<Sale_Date indicator="x">2020-02-04T00:00:00</Sale_Date>
<Face_Amount_USD indicator="x">1.0000000000</Face_Amount_USD>
<Credit_Linked_Indicator indicator="x">N</Credit_Linked_Indicator>
<Takedown_Indicator indicator="x">N</Takedown_Indicator>
<Security_Description indicator="x" xsi:nil="true"/>
<Instrument_Type_Code indicator="x">24021</Instrument_Type_Code>
<Instrument_Type_Text indicator="x">INDEX LINKED EURO MTNS</Instrument_Type_Text>
<Private_Placement_Code indicator="x">24922</Private_Placement_Code>
<Private_Placement_Text indicator="x">Not Applicable</Private_Placement_Text>
<Coupon_Type_Code indicator="x">20434</Coupon_Type_Code>
<Coupon_Type_Text indicator="x">Non Interest Bearing</Coupon_Type_Text>
<Coupon_Type_Short_Description indicator="x">NIB</Coupon_Type_Short_Description>
<Coupon_Frequency_Code indicator="x">17</Coupon_Frequency_Code>
<Coupon_Frequency_Text indicator="x">Not Applicable</Coupon_Frequency_Text>
<Coupon_Frequency_Short_Description indicator="x">NA</Coupon_Frequency_Short_Description>
<Coupon_Rate indicator="x" xsi:nil="true"/>
<Instrument_Description indicator="x" xsi:nil="true"/>
<Product_Line_Description indicator="x" xsi:nil="true"/>
<Series_Class_Text indicator="x">Ser. 53156</Series_Class_Text>
<Market_Description indicator="x">EUROMARKET</Market_Description>
<Face_Amount indicator="x">1.0000000000</Face_Amount>
<Structured_Finance_Indicator indicator="x">N</Structured_Finance_Indicator>
<Structured_Finance_Sequence_Number indicator="x" xsi:nil="true"/>
<Instrument_Ratings>
<Instrument_Rating action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Rating_Class_Number indicator="x">46612</Rating_Class_Number>
<Rating_Date indicator="x">2020-02-17T11:13:23</Rating_Date>
<Moodys_Rating_ID indicator="x">831263727</Moodys_Rating_ID>
<Rating_Level indicator="x">I</Rating_Level>
<Rating_Class_Text indicator="x">Senior Unsecured</Rating_Class_Text>
<Security_Class_Code indicator="x">834</Security_Class_Code>
<Security_Class_Text indicator="x">Regular Bond/Debenture</Security_Class_Text>
<Security_Class_Short_Description indicator="x">REG</Security_Class_Short_Description>
<Duration_Code indicator="x">25636</Duration_Code>
<Duration_Text indicator="x">Long-Term Debt Rating</Duration_Text>
<Duration_Short_Description indicator="x">LT</Duration_Short_Description>
<Seniority_Code indicator="x">18744</Seniority_Code>
<Seniority_Text indicator="x">Senior Unsecured</Seniority_Text>
<Seniority_Short_Description indicator="x">SU</Seniority_Short_Description>
<Evaluation_Type_Code indicator="x">25648</Evaluation_Type_Code>
<Evaluation_Type_Text indicator="x">Credit Risk</Evaluation_Type_Text>
<Shadow_Code indicator="x">19139</Shadow_Code>
<Shadow_Text indicator="x">Corporate Finance Regular</Shadow_Text>
<Shadow_Short_Description indicator="x">CFR</Shadow_Short_Description>
<Rating_Subclass_Code indicator="x" xsi:nil="true"/>
<Rating_Subclass_Text indicator="x" xsi:nil="true"/>
<Currency_Capd_Code indicator="x">19141</Currency_Capd_Code>
<Currency_Capd_Text indicator="x">Foreign Currency</Currency_Capd_Text>
<Rating_Text indicator="x">NR</Rating_Text>
<Credit_Grade indicator="x" xsi:nil="true"/>
<Rating_Rank indicator="x">0</Rating_Rank>
<Rating_Direction_Code indicator="x">19102</Rating_Direction_Code>
<Rating_Direction_Text indicator="x">DECISION NOT TO RATE</Rating_Direction_Text>
<Rating_Direction_Short_Description indicator="x">NR</Rating_Direction_Short_Description>
<Rating_Type_Code indicator="x">534</Rating_Type_Code>
<Rating_Type_Text indicator="x">Long-Term Debt Rating</Rating_Type_Text>
<Rating_Type_Short_Description indicator="x">LT</Rating_Type_Short_Description>
<Rating_Enhancement_Level indicator="x">UND</Rating_Enhancement_Level>
<Rating_Local_Date indicator="x">2020-02-17T11:13:23</Rating_Local_Date>
<Rating_Termination_Date indicator="x" xsi:nil="true"/>
<Rating_Termination_Local_Date indicator="x" xsi:nil="true"/>
<Rating_Reason_Code indicator="x">25530</Rating_Reason_Code>
<Rating_Reason_Text indicator="x">DECISION NOT TO RATE</Rating_Reason_Text>
<Rating_Currency_Code indicator="x">30</Rating_Currency_Code>
<Rating_Currency_Text indicator="x">U.S. Dollar</Rating_Currency_Text>
<Rating_Currency_ISO_Code indicator="x">USD</Rating_Currency_ISO_Code>
<Rating_Monitor_Indicator indicator="x">1</Rating_Monitor_Indicator>
<Initial_Rating_Indicator indicator="x">Y</Initial_Rating_Indicator>
<Instrument_Watchlist xsi:nil="true"/>
<Instrument_Rating_Attributes>
<Instrument_Rating_Attribute action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Rating_Attribute_Type_Code indicator="x">5159697</Rating_Attribute_Type_Code>
<Rating_Attribute_Type_Text indicator="x">Rating Office</Rating_Attribute_Type_Text>
<Rating_Class_Number indicator="x">46612</Rating_Class_Number>
<Moodys_Rating_ID indicator="x">831263727</Moodys_Rating_ID>
<Effective_Date indicator="x">2020-02-17T11:13:23</Effective_Date>
<Termination_Date indicator="x" xsi:nil="true"/>
<Rating_Attribute_Code indicator="x">5159717</Rating_Attribute_Code>
<Rating_Attribute_Text indicator="x">Toronto - Moody's Canada Inc.</Rating_Attribute_Text>
</Instrument_Rating_Attribute>
</Instrument_Rating_Attributes>
</Instrument_Rating>
</Instrument_Ratings>
<Instrument_Supports xsi:nil="true"/>
<Instrument_Organizations>
<Instrument_Organization action="New">
<Organization_ID indicator="x">600008042</Organization_ID>
<Moodys_Legal_Name indicator="x">ROYAL BANK OF CANADA (LONDON BRANCH)</Moodys_Legal_Name>
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Organization_Role_Code indicator="x">129</Organization_Role_Code>
<Organization_Role_Text indicator="x">Issuer</Organization_Role_Text>
<Effective_Date indicator="x">2020-02-03T00:00:00</Effective_Date>
<Termination_Date indicator="x" xsi:nil="true"/>
</Instrument_Organization>
<Instrument_Organization action="New">
<Organization_ID indicator="x">600022460</Organization_ID>
<Moodys_Legal_Name indicator="x">RBC EUROPE LIMITED</Moodys_Legal_Name>
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Organization_Role_Code indicator="x">25005</Organization_Role_Code>
<Organization_Role_Text indicator="x">Dealer</Organization_Role_Text>
<Effective_Date indicator="x">2020-02-03T00:00:00</Effective_Date>
<Termination_Date indicator="x" xsi:nil="true"/>
</Instrument_Organization>
</Instrument_Organizations>
<Instrument_Identifiers>
<Instrument_Identifier action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<ID_Type_Code indicator="x">109</ID_Type_Code>
<ID_Type_Text indicator="x">ISIN</ID_Type_Text>
<ID_Type_Short_Description indicator="x">ISI</ID_Type_Short_Description>
<Instrument_ID_Value indicator="x">XS2097291137</Instrument_ID_Value>
</Instrument_Identifier>
<Instrument_Identifier action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<ID_Type_Code indicator="x">2002</ID_Type_Code>
<ID_Type_Text indicator="x">Bloomberg Global Identifier</ID_Type_Text>
<ID_Type_Short_Description indicator="x">BBG ID</ID_Type_Short_Description>
<Instrument_ID_Value indicator="x">BBG00RHP5JH1</Instrument_ID_Value>
</Instrument_Identifier>
</Instrument_Identifiers>
<Instrument_Attributes xsi:nil="true"/>
<Instrument_Markets>
<Instrument_Market action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Domain_Number indicator="x">806356849</Domain_Number>
<Domain_Name indicator="x">EUROMARKET</Domain_Name>
<Effective_Date indicator="x">2020-02-03T00:00:00</Effective_Date>
<Termination_Date indicator="x" xsi:nil="true"/>
</Instrument_Market>
</Instrument_Markets>
</Instrument_Root>
 

Комментарии:

1. Ваш XML неправильно сформирован. Пространства xsi:nil имен должны быть определены (обычно в корневом теге). Кроме того, <Instrument_Ratings> узел не закрывается.

2. Ваш корневой каталог не закрывается. Возможно, потребуется последняя строка </Instruments_Roots> .

3. @parfait это выдержка из моего XML-файла. Последняя строка моего XML-файла — </instruments_Roots> . Но не работает ваш код

4. <Instrument_Roots xmlns="http://www.moodys.com/fdp" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" file_type="Delta" frequency="24Hour" generation_time="2020-02-18T12:00:00Z" batch_number="000018" batch_date="2020-02-18Z"> ... </Instrument_Roots>

Ответ №1:

Внутри xmlToDataFrame укажите уровень узла для извлечения getNodeSet . Однако, поскольку у вас есть необъявленный префикс пространства имен в XML xmlns="http://www.moodys.com/fdp" , вам необходимо определить префикс пространства имен для синтаксического анализа XPath в R с использованием именованного символьного вектора.

Оттуда вы можете извлечь все повторяющиеся разделы в разные фреймы данных. Ниже cbind приведена каждая коллекция узлов ( Instrument_Rating_Attribute , Instrument_Organization , Instrument_Identifiers , Instrument_Markets с родительской корневой информацией, каждая с различным количеством наблюдений.

 library(XML)

doc <- xmlParse("Input.xml")
nmsp <- c(doc="http://www.moodys.com/fdp")
 

Instrument_Ratings

 ### Instrument_Ratings
ratings_df <- cbind(xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Root", namespaces=nmsp), stringsAsFactors = FALSE),
                    xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Rating", namespaces=nmsp), stringsAsFactors = FALSE),
                    xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Rating_Attribute", namespaces=nmsp), stringsAsFactors = FALSE)
                    )

### CLEAN DUPLICATE NAMES
table(names(ratings_df))[table(names(ratings_df)) > 1]
# Instrument_ID    Moodys_Rating_ID Rating_Class_Number 
#             3                   2                   2 

names(ratings_df)[grep("Instrument_ID|Moodys_Rating_ID|Rating_Class_Number", names(ratings_df))]
# [1] "Instrument_ID"  "Instrument_ID" "Rating_Class_Number" "Moodys_Rating_ID" "Instrument_ID" "Rating_Class_Number" "Moodys_Rating_ID" 
names(ratings_df)[grep("Instrument_ID|Moodys_Rating_ID|Rating_Class_Number", names(ratings_df))] <-
    c("Instrument_ID_1", "Instrument_ID_2", "Rating_Class_Number_1", "Moodys_Rating_ID_1",
      "Instrument_ID_3", "Rating_Class_Number_2", "Moodys_Rating_ID_2")

### REMOVE OVERALPS
ratings_df <- within(ratings_df, {
    rm(Instrument_Ratings,
       Instrument_Organizations,
       Instrument_Identifiers,
       Instrument_Markets,
       Instrument_Rating_Attributes
    )
})

str(ratings_df)
# 'data.frame': 1 obs. of  89 variables:
# $ Instrument_ID_1                   : chr "831260145"
# $ Deal_number                       : chr ""
# $ Class_Code                        : chr "834"
# ...
 

Инструмент_организации

 orgs_df <- cbind(xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Root", namespaces=nmsp), stringsAsFactors = FALSE),
                 xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Organization", namespaces=nmsp), stringsAsFactors = FALSE)
)

### CLEAN DUPLICATE NAMES
table(names(orgs_df))[table(names(orgs_df)) > 1]
#Instrument_ID 
#            2

names(orgs_df)[grep("Instrument_ID$", names(orgs_df))] <- c("Instrument_ID_1", "Instrument_ID_2")

### REMOVE OVERALPS
orgs_df <- within(orgs_df, {
  rm(Instrument_Ratings, Instrument_Organizations, Instrument_Identifiers, Instrument_Markets)
})

str(orgs_df)
# 'data.frame': 2 obs. of  42 variables:
# $ Instrument_ID_1                   : chr  "831260145" "831260145"
# $ Deal_number                       : chr  "" ""
# $ Class_Code                        : chr  "834" "834"
# ...
 

Instrument_Identifiers

 ids_df <- cbind(xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Root", namespaces=nmsp), stringsAsFactors = FALSE),
                xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Identifier", namespaces=nmsp), stringsAsFactors = FALSE)
)

### CLEAN DUPLICATE NAMES
table(names(ids_df))[table(names(ids_df)) > 1]
#Instrument_ID 
#            2

names(ids_df)[grep("Instrument_ID$", names(ids_df))] <- c("Instrument_ID_1", "Instrument_ID_2")

### REMOVE OVERALPS
ids_df <- within(ids_df, {
  rm(Instrument_Ratings, Instrument_Identifiers, Instrument_Markets)
})

str(ids_df)
# 'data.frame': 2 obs. of  41 variables:
# $ Instrument_ID_1                   : chr  "831260145" "831260145"
# $ Deal_number                       : chr  "" ""
# $ Class_Code                        : chr  "834" "834"
# ...
 

Instrument_Markets

 mkts_df <- cbind(xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Root", namespaces=nmsp), stringsAsFactors = FALSE),
                 xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Market", namespaces=nmsp), stringsAsFactors = FALSE)
)

### CLEAN DUPLICATE NAMES
table(names(mkts_df))[table(names(mkts_df)) > 1]
#Instrument_ID 
#            2

names(mkts_df)[grep("Instrument_ID$", names(mkts_df))] <- c("Instrument_ID_1", "Instrument_ID_2")

### REMOVE OVERALPS
mkts_df <- within(mkts_df, {
  rm(Instrument_Ratings, Instrument_Markets)
})

str(mkts_df)
# 'data.frame': 1 obs. of  42 variables:
# $ Instrument_ID_1                   : chr "831260145"
# $ Deal_number                       : chr ""
# $ Class_Code                        : chr "834"
 

Комментарии:

1. не работайте с вашим кодом, вы привязываете одну и ту же таблицу. Узлы Instruments_Rating имеют все значения в одном столбце

2. Конечно, поскольку ваше обновление XML значительно изменилось! Теперь у вас есть необъявленное пространство имен для обработки. См. раздел редактирование создание четырех разных фреймов данных из структуры XML.

3. Спасибо, я изменил некоторые моменты вашего кода, и он работает. Спасибо

4. Отлично. Рад помочь. Счастливого кодирования!