#r #xml #dataframe #tidyverse #xml2
#r #xml #dataframe #tidyverse #xml2
Вопрос:
У меня есть XML-файл, содержащий больше таблиц, и я хочу импортировать его в свой dataframe.
Я использую следующий код, у меня есть только первая таблица в my datafame, а все остальные таблицы имеют все значения в одном столбце. Я не могу поместить все узлы в отдельные столбцы. Узел Instrument_Ratings значение находится в одном столбце в моем dataframe
Есть способ?
doc <- xmlParse("file.xml")
df <- xmlToDataFrame(doc, stringsAsFactors = FALSE)
<Instrument_Roots xmlns="http://www.moodys.com/fdp" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" file_type="Delta" frequency="24Hour" generation_time="2020-02-18T12:00:00Z" batch_number="000018" batch_date="2020-02-18Z">
<Instrument_Root action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Deal_number indicator="x" xsi:nil="true"/>
<Class_Code indicator="x">834</Class_Code>
<Class_Text indicator="x">Regular Bond/Debenture</Class_Text>
<Class_Short_Description indicator="x">REG</Class_Short_Description>
<Dated_Date indicator="x" xsi:nil="true"/>
<ISO_Currency_Code indicator="x">USD</ISO_Currency_Code>
<Currency_Multiple_Indicator indicator="x">N</Currency_Multiple_Indicator>
<Maturity_Date indicator="x">2021-02-09T00:00:00</Maturity_Date>
<Maturity_Year indicator="x">2021</Maturity_Year>
<Sale_Date indicator="x">2020-02-04T00:00:00</Sale_Date>
<Face_Amount_USD indicator="x">1.0000000000</Face_Amount_USD>
<Credit_Linked_Indicator indicator="x">N</Credit_Linked_Indicator>
<Takedown_Indicator indicator="x">N</Takedown_Indicator>
<Security_Description indicator="x" xsi:nil="true"/>
<Instrument_Type_Code indicator="x">24021</Instrument_Type_Code>
<Instrument_Type_Text indicator="x">INDEX LINKED EURO MTNS</Instrument_Type_Text>
<Private_Placement_Code indicator="x">24922</Private_Placement_Code>
<Private_Placement_Text indicator="x">Not Applicable</Private_Placement_Text>
<Coupon_Type_Code indicator="x">20434</Coupon_Type_Code>
<Coupon_Type_Text indicator="x">Non Interest Bearing</Coupon_Type_Text>
<Coupon_Type_Short_Description indicator="x">NIB</Coupon_Type_Short_Description>
<Coupon_Frequency_Code indicator="x">17</Coupon_Frequency_Code>
<Coupon_Frequency_Text indicator="x">Not Applicable</Coupon_Frequency_Text>
<Coupon_Frequency_Short_Description indicator="x">NA</Coupon_Frequency_Short_Description>
<Coupon_Rate indicator="x" xsi:nil="true"/>
<Instrument_Description indicator="x" xsi:nil="true"/>
<Product_Line_Description indicator="x" xsi:nil="true"/>
<Series_Class_Text indicator="x">Ser. 53156</Series_Class_Text>
<Market_Description indicator="x">EUROMARKET</Market_Description>
<Face_Amount indicator="x">1.0000000000</Face_Amount>
<Structured_Finance_Indicator indicator="x">N</Structured_Finance_Indicator>
<Structured_Finance_Sequence_Number indicator="x" xsi:nil="true"/>
<Instrument_Ratings>
<Instrument_Rating action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Rating_Class_Number indicator="x">46612</Rating_Class_Number>
<Rating_Date indicator="x">2020-02-17T11:13:23</Rating_Date>
<Moodys_Rating_ID indicator="x">831263727</Moodys_Rating_ID>
<Rating_Level indicator="x">I</Rating_Level>
<Rating_Class_Text indicator="x">Senior Unsecured</Rating_Class_Text>
<Security_Class_Code indicator="x">834</Security_Class_Code>
<Security_Class_Text indicator="x">Regular Bond/Debenture</Security_Class_Text>
<Security_Class_Short_Description indicator="x">REG</Security_Class_Short_Description>
<Duration_Code indicator="x">25636</Duration_Code>
<Duration_Text indicator="x">Long-Term Debt Rating</Duration_Text>
<Duration_Short_Description indicator="x">LT</Duration_Short_Description>
<Seniority_Code indicator="x">18744</Seniority_Code>
<Seniority_Text indicator="x">Senior Unsecured</Seniority_Text>
<Seniority_Short_Description indicator="x">SU</Seniority_Short_Description>
<Evaluation_Type_Code indicator="x">25648</Evaluation_Type_Code>
<Evaluation_Type_Text indicator="x">Credit Risk</Evaluation_Type_Text>
<Shadow_Code indicator="x">19139</Shadow_Code>
<Shadow_Text indicator="x">Corporate Finance Regular</Shadow_Text>
<Shadow_Short_Description indicator="x">CFR</Shadow_Short_Description>
<Rating_Subclass_Code indicator="x" xsi:nil="true"/>
<Rating_Subclass_Text indicator="x" xsi:nil="true"/>
<Currency_Capd_Code indicator="x">19141</Currency_Capd_Code>
<Currency_Capd_Text indicator="x">Foreign Currency</Currency_Capd_Text>
<Rating_Text indicator="x">NR</Rating_Text>
<Credit_Grade indicator="x" xsi:nil="true"/>
<Rating_Rank indicator="x">0</Rating_Rank>
<Rating_Direction_Code indicator="x">19102</Rating_Direction_Code>
<Rating_Direction_Text indicator="x">DECISION NOT TO RATE</Rating_Direction_Text>
<Rating_Direction_Short_Description indicator="x">NR</Rating_Direction_Short_Description>
<Rating_Type_Code indicator="x">534</Rating_Type_Code>
<Rating_Type_Text indicator="x">Long-Term Debt Rating</Rating_Type_Text>
<Rating_Type_Short_Description indicator="x">LT</Rating_Type_Short_Description>
<Rating_Enhancement_Level indicator="x">UND</Rating_Enhancement_Level>
<Rating_Local_Date indicator="x">2020-02-17T11:13:23</Rating_Local_Date>
<Rating_Termination_Date indicator="x" xsi:nil="true"/>
<Rating_Termination_Local_Date indicator="x" xsi:nil="true"/>
<Rating_Reason_Code indicator="x">25530</Rating_Reason_Code>
<Rating_Reason_Text indicator="x">DECISION NOT TO RATE</Rating_Reason_Text>
<Rating_Currency_Code indicator="x">30</Rating_Currency_Code>
<Rating_Currency_Text indicator="x">U.S. Dollar</Rating_Currency_Text>
<Rating_Currency_ISO_Code indicator="x">USD</Rating_Currency_ISO_Code>
<Rating_Monitor_Indicator indicator="x">1</Rating_Monitor_Indicator>
<Initial_Rating_Indicator indicator="x">Y</Initial_Rating_Indicator>
<Instrument_Watchlist xsi:nil="true"/>
<Instrument_Rating_Attributes>
<Instrument_Rating_Attribute action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Rating_Attribute_Type_Code indicator="x">5159697</Rating_Attribute_Type_Code>
<Rating_Attribute_Type_Text indicator="x">Rating Office</Rating_Attribute_Type_Text>
<Rating_Class_Number indicator="x">46612</Rating_Class_Number>
<Moodys_Rating_ID indicator="x">831263727</Moodys_Rating_ID>
<Effective_Date indicator="x">2020-02-17T11:13:23</Effective_Date>
<Termination_Date indicator="x" xsi:nil="true"/>
<Rating_Attribute_Code indicator="x">5159717</Rating_Attribute_Code>
<Rating_Attribute_Text indicator="x">Toronto - Moody's Canada Inc.</Rating_Attribute_Text>
</Instrument_Rating_Attribute>
</Instrument_Rating_Attributes>
</Instrument_Rating>
</Instrument_Ratings>
<Instrument_Supports xsi:nil="true"/>
<Instrument_Organizations>
<Instrument_Organization action="New">
<Organization_ID indicator="x">600008042</Organization_ID>
<Moodys_Legal_Name indicator="x">ROYAL BANK OF CANADA (LONDON BRANCH)</Moodys_Legal_Name>
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Organization_Role_Code indicator="x">129</Organization_Role_Code>
<Organization_Role_Text indicator="x">Issuer</Organization_Role_Text>
<Effective_Date indicator="x">2020-02-03T00:00:00</Effective_Date>
<Termination_Date indicator="x" xsi:nil="true"/>
</Instrument_Organization>
<Instrument_Organization action="New">
<Organization_ID indicator="x">600022460</Organization_ID>
<Moodys_Legal_Name indicator="x">RBC EUROPE LIMITED</Moodys_Legal_Name>
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Organization_Role_Code indicator="x">25005</Organization_Role_Code>
<Organization_Role_Text indicator="x">Dealer</Organization_Role_Text>
<Effective_Date indicator="x">2020-02-03T00:00:00</Effective_Date>
<Termination_Date indicator="x" xsi:nil="true"/>
</Instrument_Organization>
</Instrument_Organizations>
<Instrument_Identifiers>
<Instrument_Identifier action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<ID_Type_Code indicator="x">109</ID_Type_Code>
<ID_Type_Text indicator="x">ISIN</ID_Type_Text>
<ID_Type_Short_Description indicator="x">ISI</ID_Type_Short_Description>
<Instrument_ID_Value indicator="x">XS2097291137</Instrument_ID_Value>
</Instrument_Identifier>
<Instrument_Identifier action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<ID_Type_Code indicator="x">2002</ID_Type_Code>
<ID_Type_Text indicator="x">Bloomberg Global Identifier</ID_Type_Text>
<ID_Type_Short_Description indicator="x">BBG ID</ID_Type_Short_Description>
<Instrument_ID_Value indicator="x">BBG00RHP5JH1</Instrument_ID_Value>
</Instrument_Identifier>
</Instrument_Identifiers>
<Instrument_Attributes xsi:nil="true"/>
<Instrument_Markets>
<Instrument_Market action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Domain_Number indicator="x">806356849</Domain_Number>
<Domain_Name indicator="x">EUROMARKET</Domain_Name>
<Effective_Date indicator="x">2020-02-03T00:00:00</Effective_Date>
<Termination_Date indicator="x" xsi:nil="true"/>
</Instrument_Market>
</Instrument_Markets>
</Instrument_Root>
Комментарии:
1. Ваш XML неправильно сформирован. Пространства
xsi:nil
имен должны быть определены (обычно в корневом теге). Кроме того,<Instrument_Ratings>
узел не закрывается.2. Ваш корневой каталог не закрывается. Возможно, потребуется последняя строка
</Instruments_Roots>
.3. @parfait это выдержка из моего XML-файла. Последняя строка моего XML-файла — </instruments_Roots> . Но не работает ваш код
4.
<Instrument_Roots xmlns="http://www.moodys.com/fdp" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" file_type="Delta" frequency="24Hour" generation_time="2020-02-18T12:00:00Z" batch_number="000018" batch_date="2020-02-18Z"> ... </Instrument_Roots>
Ответ №1:
Внутри xmlToDataFrame
укажите уровень узла для извлечения getNodeSet
. Однако, поскольку у вас есть необъявленный префикс пространства имен в XML xmlns="http://www.moodys.com/fdp"
, вам необходимо определить префикс пространства имен для синтаксического анализа XPath в R с использованием именованного символьного вектора.
Оттуда вы можете извлечь все повторяющиеся разделы в разные фреймы данных. Ниже cbind
приведена каждая коллекция узлов ( Instrument_Rating_Attribute
, Instrument_Organization
, Instrument_Identifiers
, Instrument_Markets
с родительской корневой информацией, каждая с различным количеством наблюдений.
library(XML)
doc <- xmlParse("Input.xml")
nmsp <- c(doc="http://www.moodys.com/fdp")
Instrument_Ratings
### Instrument_Ratings
ratings_df <- cbind(xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Root", namespaces=nmsp), stringsAsFactors = FALSE),
xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Rating", namespaces=nmsp), stringsAsFactors = FALSE),
xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Rating_Attribute", namespaces=nmsp), stringsAsFactors = FALSE)
)
### CLEAN DUPLICATE NAMES
table(names(ratings_df))[table(names(ratings_df)) > 1]
# Instrument_ID Moodys_Rating_ID Rating_Class_Number
# 3 2 2
names(ratings_df)[grep("Instrument_ID|Moodys_Rating_ID|Rating_Class_Number", names(ratings_df))]
# [1] "Instrument_ID" "Instrument_ID" "Rating_Class_Number" "Moodys_Rating_ID" "Instrument_ID" "Rating_Class_Number" "Moodys_Rating_ID"
names(ratings_df)[grep("Instrument_ID|Moodys_Rating_ID|Rating_Class_Number", names(ratings_df))] <-
c("Instrument_ID_1", "Instrument_ID_2", "Rating_Class_Number_1", "Moodys_Rating_ID_1",
"Instrument_ID_3", "Rating_Class_Number_2", "Moodys_Rating_ID_2")
### REMOVE OVERALPS
ratings_df <- within(ratings_df, {
rm(Instrument_Ratings,
Instrument_Organizations,
Instrument_Identifiers,
Instrument_Markets,
Instrument_Rating_Attributes
)
})
str(ratings_df)
# 'data.frame': 1 obs. of 89 variables:
# $ Instrument_ID_1 : chr "831260145"
# $ Deal_number : chr ""
# $ Class_Code : chr "834"
# ...
Инструмент_организации
orgs_df <- cbind(xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Root", namespaces=nmsp), stringsAsFactors = FALSE),
xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Organization", namespaces=nmsp), stringsAsFactors = FALSE)
)
### CLEAN DUPLICATE NAMES
table(names(orgs_df))[table(names(orgs_df)) > 1]
#Instrument_ID
# 2
names(orgs_df)[grep("Instrument_ID$", names(orgs_df))] <- c("Instrument_ID_1", "Instrument_ID_2")
### REMOVE OVERALPS
orgs_df <- within(orgs_df, {
rm(Instrument_Ratings, Instrument_Organizations, Instrument_Identifiers, Instrument_Markets)
})
str(orgs_df)
# 'data.frame': 2 obs. of 42 variables:
# $ Instrument_ID_1 : chr "831260145" "831260145"
# $ Deal_number : chr "" ""
# $ Class_Code : chr "834" "834"
# ...
Instrument_Identifiers
ids_df <- cbind(xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Root", namespaces=nmsp), stringsAsFactors = FALSE),
xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Identifier", namespaces=nmsp), stringsAsFactors = FALSE)
)
### CLEAN DUPLICATE NAMES
table(names(ids_df))[table(names(ids_df)) > 1]
#Instrument_ID
# 2
names(ids_df)[grep("Instrument_ID$", names(ids_df))] <- c("Instrument_ID_1", "Instrument_ID_2")
### REMOVE OVERALPS
ids_df <- within(ids_df, {
rm(Instrument_Ratings, Instrument_Identifiers, Instrument_Markets)
})
str(ids_df)
# 'data.frame': 2 obs. of 41 variables:
# $ Instrument_ID_1 : chr "831260145" "831260145"
# $ Deal_number : chr "" ""
# $ Class_Code : chr "834" "834"
# ...
Instrument_Markets
mkts_df <- cbind(xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Root", namespaces=nmsp), stringsAsFactors = FALSE),
xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Market", namespaces=nmsp), stringsAsFactors = FALSE)
)
### CLEAN DUPLICATE NAMES
table(names(mkts_df))[table(names(mkts_df)) > 1]
#Instrument_ID
# 2
names(mkts_df)[grep("Instrument_ID$", names(mkts_df))] <- c("Instrument_ID_1", "Instrument_ID_2")
### REMOVE OVERALPS
mkts_df <- within(mkts_df, {
rm(Instrument_Ratings, Instrument_Markets)
})
str(mkts_df)
# 'data.frame': 1 obs. of 42 variables:
# $ Instrument_ID_1 : chr "831260145"
# $ Deal_number : chr ""
# $ Class_Code : chr "834"
Комментарии:
1. не работайте с вашим кодом, вы привязываете одну и ту же таблицу. Узлы Instruments_Rating имеют все значения в одном столбце
2. Конечно, поскольку ваше обновление XML значительно изменилось! Теперь у вас есть необъявленное пространство имен для обработки. См. раздел редактирование создание четырех разных фреймов данных из структуры XML.
3. Спасибо, я изменил некоторые моменты вашего кода, и он работает. Спасибо
4. Отлично. Рад помочь. Счастливого кодирования!