#python-3.x #xml
Вопрос:
Заранее приношу свои извинения, это большой вопрос
Я извлекаю новости из новостной RSS-ленты (https://timesofindia.indiatimes.com/toirssfeed/-2128936835.cms), формат которой приведен ниже.
<article ID="84968170">
<author/>
<headline>Normal rainfall expected in Aug-Sept: IMD</headline>
<cats>India</cats>
<subcats/>
<imagename>https://static.toiimg.com/photo/84968170.cms</imagename>
<caption/>
<summary/>
<content>
<![CDATA[ <div class="section1"><div class="Normal">NEW DELHI: Rainfall during the second half of the monsoon season, in the months of August-September, is likely to be normal in most parts of the country.<br/><br/><br/>It is expected at 95-105% of the long period average over peninsular and adjacent central India, the Indian Meteorological Department (IMD) has said.<br/><br/>However some regions in north, northwest and northeast could experience below normal rainfall during the same period.<br/><br/> <div data-type="image" data-title="IMD" data-msid="84968287" data-hostid="83" data-src="/img/84968287/Master.jpg"><msid>84968287</msid></div></content>
</article>
<article ID="84968310">
<author/>
<headline>Where is Masood Azhar?</headline>
<cats>India</cats>
<subcats/>
<imagename>https://static.toiimg.com/photo/84968310.cms</imagename>
<caption/>
<summary/>
<content>
...
</content>
</article>
Чтобы достичь этого, я должен сохранить данные во фрейме данных, а затем сохранить их в XML с моими тегами в соответствии с нашими требованиями. Имя XML-файла — это идентификатор статьи (article ID="84968310"). Ниже приведен формат требуемого XML:
<head><cats>India</cats><iim ver="3"><ds value="79" num="1:20"/><ds value="TOIOnline" num="1:30"/><ds value="TOIOnline" num="2:20"/><ds value="Wanted: 6,000 teachers in India’s central universities" num="2:25"/><ds value="Wanted: 6,000 teachers in India’s central universities" num="2:105"/></iim></head>
<body><body.content>A little over 33% teaching positions and 37% non-teaching positions are vacant in central universities across India. <br/><br/>As per official data, 15 of 44 central universities, including Delhi University, have over 40% of sanctioned teaching positions vacant. Sanctioned teaching positions are those for which approval has been granted but have not been filled. <br/><br/> </body.content></body>
Чтобы достичь этого, я пишу приведенный ниже код на python, потому что я силен в python.
# Python code to illustrate parsing of XML files
# importing the required modules
import re as re
import xml.etree.ElementTree as ET
import pandas as pd
from urllib.request import Request, urlopen
import configparser
import os
def loadRSS(url="https://timesofindia.indiatimes.com/toirssfeed/-2128936835.cms"):
    """Open the RSS feed and return the HTTP response object.

    Args:
        url: Address of the feed to download. Defaults to the Times of
            India feed this script was written for.

    Returns:
        The file-like response returned by ``urlopen`` (its body is the
        feed XML as bytes), or ``None`` when the connection fails.
    """
    # Send a browser-like User-Agent header with the request.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        return urlopen(req)
    except OSError as e:
        # URLError/HTTPError are OSError subclasses: report the failure, wait
        # for the user to acknowledge it, and signal it to the caller as None.
        print("Error in connecting TIL site :- ",e)
        input("Press andy to Close")
        return None
def parseXML(xmlfile):
    """Parse the feed XML into a list of per-article dictionaries.

    Args:
        xmlfile: File name or file-like object accepted by ``ET.parse``.

    Returns:
        A list with one dict per direct child of the root element. Each dict
        has the key ``ID`` (the child's ``ID`` attribute) and the keys
        ``headline``, ``imagename``, ``content``, ``summary``, ``caption``
        and ``cats`` (text of the sub-element with that name). A value is
        ``None`` when the sub-element is empty or absent.
    """
    fields = ("headline", "imagename", "content", "summary", "caption", "cats")
    root = ET.parse(xmlfile).getroot()
    news = []
    for node in root:
        item = {"ID": node.attrib.get("ID")}
        for tag in fields:
            child = node.find(tag)
            # find() returns None for a missing sub-element; map that to None
            # instead of failing on ``.text``.
            item[tag] = child.text if child is not None else None
        news.append(item)
    return news
def savetodf(newsitems):
    """Build a DataFrame from the parsed articles and strip markup from content.

    Args:
        newsitems: List of article dicts as produced by ``parseXML``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``ID``, ``headline``,
        ``imagename``, ``content``, ``caption``, ``summary`` and ``cats``.
        Every ``<...>`` tag is removed from string values in ``content``;
        missing (non-string) content values are left untouched.
    """
    df_cols = ['ID', 'headline', 'imagename', 'content', 'caption', 'summary', 'cats']
    out_df = pd.DataFrame(newsitems, columns=df_cols)

    def strip_tags(value):
        # Articles with an empty <content/> carry no string here; re.sub
        # would raise TypeError on them, so pass such values through.
        if not isinstance(value, str):
            return value
        return re.sub(r'<.*?>', '', value)

    out_df['content'] = out_df['content'].apply(strip_tags)
    return out_df
def define_filename(filename):
    """Return the output path ``<default_path>/<filename>.xml``.

    ``default_path`` is read from ``path.ini`` (looked up relative to the
    current working directory). If several sections define it, the value
    from the last such section is used.

    Args:
        filename: Base name of the file, without extension.

    Returns:
        The joined path as a string.

    Raises:
        KeyError: If no section of ``path.ini`` provides ``default_path``
            (including when the file is missing or has no sections).
    """
    config = configparser.ConfigParser()
    config.read('path.ini')

    default_path = None
    for section_name in config.sections():
        for name, value in config.items(section_name):
            if name == 'default_path':
                default_path = value

    if default_path is None:
        raise KeyError("'default_path' not found in any section of path.ini")

    file_format = "xml"
    return os.path.join(default_path, f"{filename}.{file_format}")
def build_item_xml(row):
    """Write one article as an XML file rooted at ``<nitf>`` and return the row.

    The document has a ``<head>`` (title, cats and an ``<iim ver="3">`` group
    of ``<ds num=... value=...>`` entries) and a ``<body>`` whose
    ``<body.content>`` holds the article text. The file path comes from
    ``define_filename(row['ID'])``.

    Args:
        row: Mapping-like record providing ``ID``, ``headline``, ``cats`` and
            ``content`` (e.g. a DataFrame row passed by ``DataFrame.apply``).

    Returns:
        ``row`` unchanged, so the function can be used with ``apply``.
    """
    headline = row["headline"]

    items = ET.Element('nitf')

    head = ET.SubElement(items, 'head')
    ET.SubElement(head, 'title').text = headline
    ET.SubElement(head, 'cats').text = row["cats"]

    iim = ET.SubElement(head, 'iim', ver='3')
    # (num, value) pairs emitted as <ds> entries, in document order.
    # Per the original author's note, 1:30 is important for the import into CCI.
    # Datasets 1:80 ("113052 0000") and 2:10 ("3") are currently disabled.
    datasets = (
        ("1:20", "79"),
        ("1:30", "TOIOnline"),
        ("2:20", "TOIOnline"),
        ("2:25", headline),
        ("2:105", headline),
    )
    for num, value in datasets:
        # Use a real 'ds' tag with proper attributes rather than embedding
        # the attribute text in the tag name; the serialized XML is the same.
        ET.SubElement(iim, 'ds', num=num, value=value)

    body = ET.SubElement(items, 'body')
    ET.SubElement(body, 'body.content').text = row["content"]

    ET.ElementTree(items).write(
        define_filename(row['ID']), encoding='utf-8', xml_declaration=True
    )
    return row
def main():
    """Download the feed, parse it, and write one XML file per article."""
    response = loadRSS()
    if response is None:
        # loadRSS returns None when the connection failed; there is nothing
        # to parse in that case.
        return

    # Close the HTTP response as soon as the feed has been parsed.
    with response:
        newsitems = parseXML(response)

    df = savetodf(newsitems)
    # build_item_xml writes one file per row.
    df.apply(build_item_xml, axis=1)
if __name__ == "__main__":
    # Run the pipeline only when the file is executed as a script.
    main()
This code works fine: it saves the XML as per the requirements, and my system reads it well.
Question:
The content section (the body part) contains "<br/>" tags.
I want to replace them with paragraph tags, because my target system only accepts <p>……</p>
for paragraphs.
How can I do this? Does anyone have an idea?