Python: замена тега br на абзацы в XML

#python-3.x #xml

Вопрос:

Заранее приношу свои извинения, это большой вопрос

Я извлекаю новости из новостной RSS-ленты (https://timesofindia.indiatimes.com/toirssfeed/-2128936835.cms), формат которой приведён ниже.

 <article ID="84968170">
<author/>
<headline>Normal rainfall expected in Aug-Sept: IMD</headline>
<cats>India</cats>
<subcats/>
<imagename>https://static.toiimg.com/photo/84968170.cms</imagename>
<caption/>
<summary/>
<content>
<![CDATA[ <div class="section1"><div class="Normal">NEW DELHI: Rainfall during the second half of the monsoon season, in the months of August-September, is likely to be normal in most parts of the country.<br/><br/><br/>It is expected at 95-105% of the long period average over peninsular and adjacent central India, the Indian Meteorological Department (IMD) has said.<br/><br/>However some regions in north, northwest and northeast could experience below normal rainfall during the same period.<br/><br/> <div data-type="image" data-title="IMD" data-msid="84968287" data-hostid="83" data-src="/img/84968287/Master.jpg"><msid>84968287</msid></div></content>
</article>
<article ID="84968310">
<author/>
<headline>Where is Masood Azhar?</headline>
<cats>India</cats>
<subcats/>
<imagename>https://static.toiimg.com/photo/84968310.cms</imagename>
<caption/>
<summary/>
<content>
...
</content>
</article>
 

Для этого я должен сохранить данные во фрейме данных (DataFrame), а затем сохранить их в XML с моими тегами в соответствии с нашими требованиями. Имя XML-файла — это идентификатор статьи (article ID="84968310"). Ниже приведён формат требуемого XML.

 <head><cats>India</cats><iim ver="3"><ds value="79" num="1:20"/><ds value="TOIOnline" num="1:30"/><ds value="TOIOnline" num="2:20"/><ds value="Wanted: 6,000 teachers in India’s central universities" num="2:25"/><ds value="Wanted: 6,000 teachers in India’s central universities" num="2:105"/></iim></head>
<body><body.content>A little over 33% teaching positions and 37% non-teaching positions are vacant in central universities across India. <br/><br/>As per official data, 15 of 44 central universities, including Delhi University, have over 40% of sanctioned teaching positions vacant. Sanctioned teaching positions are those for which approval has been granted but have not been filled. <br/><br/> </body.content></body>
 

Для этого я написал приведённый ниже код на Python, потому что хорошо знаю этот язык.

 # Python code to illustrate parsing of XML files
# importing the required modules
import re as re
import xml.etree.ElementTree as ET
import pandas as pd
from urllib.request import Request, urlopen
import configparser
import os

def loadRSS():
    """Open the news RSS feed and return the HTTP response object.

    The feed URL is fixed inside the function and is requested with a
    browser-like ``User-Agent`` header.

    Returns:
        The object returned by ``urlopen`` on success. When the request
        fails with ``OSError``, the error is printed, the function waits
        for the user to press a key, and ``None`` is returned.
    """
    feed_url = "https://timesofindia.indiatimes.com/toirssfeed/-2128936835.cms"
    request = Request(feed_url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        # The response object is handed back unread; the caller parses it.
        response = urlopen(request)
    except OSError as e:
        print("Error in connecting TIL site :- ",e)
        # Pause so the message stays visible before the program continues.
        input("Press andy to Close")
        return None
    return response

def _child_text(node, tag):
    """Return the text of the direct child *tag* of *node*, or ``None``.

    ``None`` is returned both when the child is absent and when it is
    present but empty (for example ``<summary/>``).
    """
    child = node.find(tag)
    return None if child is None else child.text


def parseXML(xmlfile):
    """Parse the feed and return one dict per child element of the root.

    Args:
        xmlfile: A file name or file object accepted by ``ET.parse``.

    Returns:
        A list of dicts with the keys ``ID``, ``headline``, ``imagename``,
        ``content``, ``summary``, ``caption`` and ``cats``. ``ID`` comes from
        the element's ``ID`` attribute; the other values are the text of the
        child element of the same name. A value is ``None`` when the
        attribute or child is missing or the child has no text.
    """
    root = ET.parse(xmlfile).getroot()
    # Order matters: it fixes the key order of every emitted dict.
    fields = ("headline", "imagename", "content", "summary", "caption", "cats")
    news = []
    for node in root:
        item = {"ID": node.attrib.get("ID")}
        # A missing child yields None instead of raising AttributeError.
        item.update((tag, _child_text(node, tag)) for tag in fields)
        news.append(item)
    return news

def savetodf(newsitems):
    """Build a DataFrame from the parsed news items and strip markup.

    Args:
        newsitems: A list of dicts as produced by the feed parser.

    Returns:
        A DataFrame with the columns ``ID``, ``headline``, ``imagename``,
        ``content``, ``caption``, ``summary`` and ``cats``. Every
        ``<...>`` tag is removed from string values in ``content``;
        missing values (e.g. ``None`` for an empty ``<content/>``) are left
        untouched instead of raising ``TypeError``.
    """
    df_cols = ['ID', 'headline', 'imagename', 'content', 'caption', 'summary', 'cats']
    out_df = pd.DataFrame(newsitems, columns=df_cols)
    # Non-greedy match removes each tag separately; '.' does not cross
    # newlines, so only tags that fit on one line are stripped.
    out_df['content'] = out_df['content'].apply(
        lambda cw: re.sub(r'<.*?>', '', cw) if isinstance(cw, str) else cw
    )
    return out_df

def define_filename(filename):
    """Return the output path of the XML file for one news item.

    The target directory is read from the ``default_path`` option of
    ``path.ini`` (looked up relative to the current working directory). The
    first section that defines the option wins.

    Args:
        filename: Base name of the file, without extension.

    Returns:
        ``default_path`` joined with ``<filename>.xml``.

    Raises:
        LookupError: If ``path.ini`` is missing or no section defines
            ``default_path``.
    """
    config = configparser.ConfigParser()
    config.read('path.ini')
    file_format = "xml"
    for section_name in config.sections():
        # Keep scanning when a section lacks the option instead of failing
        # on an unbound variable.
        if config.has_option(section_name, 'default_path'):
            default_path = config.get(section_name, 'default_path')
            return os.path.join(default_path, f"{filename}.{file_format}")
    raise LookupError("option 'default_path' not found in path.ini")

def build_item_xml(row):
    """Write one news item as an ``nitf`` XML file and return the row.

    The document has a ``head`` (``title``, ``cats`` and an ``iim`` element
    holding ``ds`` entries) and a ``body`` whose ``body.content`` carries
    the article text. The file path comes from
    ``define_filename(row['ID'])``.

    Args:
        row: Mapping-like record providing ``ID``, ``headline``, ``cats``
            and ``content``.

    Returns:
        The unchanged ``row``.
    """
    items = ET.Element('nitf')

    # Head section with the article metadata.
    head = ET.SubElement(items, 'head')
    title = ET.SubElement(head, 'title')
    title.text = row["headline"]
    country = ET.SubElement(head, 'cats')
    country.text = row["cats"]

    # 'num' is passed as a real attribute; embedding it in the tag name
    # creates an element whose name is not valid XML, even though the
    # serialized text looks the same.
    item1 = ET.SubElement(head, 'iim', ver='3')
    ET.SubElement(item1, 'ds', num="1:20", value="79")
    # This is an important attribute for the import into CCI.
    ET.SubElement(item1, 'ds', num="1:30", value="TOIOnline")
    ET.SubElement(item1, 'ds', num="2:20", value="TOIOnline")
    ET.SubElement(item1, 'ds', num="2:25", value=row["headline"])
    ET.SubElement(item1, 'ds', num="2:105", value=row["headline"])

    # Body section with the article content.
    body = ET.SubElement(items, 'body')
    content = ET.SubElement(body, 'body.content')
    content.text = row["content"]

    tree = ET.ElementTree(items)
    tree.write(define_filename(row['ID']), encoding='utf-8', xml_declaration=True)
    return row

def main():
    """Run the pipeline: fetch the feed, parse it, and write one XML per item."""
    # Fetch the RSS feed from the web.
    feed = loadRSS()
    # Turn the feed into a list of news item dicts.
    articles = parseXML(feed)
    # Collect the items in a DataFrame.
    frame = savetodf(articles)
    # build_item_xml is invoked once per row and writes that row's file.
    frame.apply(build_item_xml, axis=1)
         
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
#     pd.set_option('display.max_colwidth', -1)
    # Call the main function.
    main()
 

This code works fine: it saves the XML as per the requirements, and my system reads it well.
Question:
The content section (in the body part) contains "<br/>" tags. I want to replace them
with paragraph tags because my target system only accepts <p>……</p> for paragraphs.

How can I do this? Does anyone have an idea?