#python #html #parsing #beautifulsoup
Вопрос:
У меня есть множество HTML-отчётов инструмента GCOV о покрытии ветвей и строк; файлы выглядят так:
<tr>
<td align="right" class="lineno"><pre>224</pre></td>
<td align="right" class="linebranch"><span class="takenBranch" title="Branch 1 taken 329 times">&check;</span><span class="notTakenBranch" title="Branch 2 not taken">&cross;</span><span class="notTakenBranch" title="Branch 4 not taken">&cross;</span><span class="takenBranch" title="Branch 5 taken 329 times">&check;</span><br/><span class="notTakenBranch" title="Branch 6 not taken">&cross;</span><span class="takenBranch" title="Branch 7 taken 329 times">&check;</span></td>
<td align="right" class="linecount coveredLine"><pre>329</pre></td>
<td align="left" class="src coveredLine"><pre> line of C code</pre></td>
</tr>
<tr>
<td align="right" class="lineno"><pre>225</pre></td>
<td align="right" class="linebranch"></td>
<td align="right" class="linecount uncoveredLine"><pre></pre></td>
<td align="left" class="src uncoveredLine"><pre> another line of C code;</pre></td>
</tr>
Я хотел бы извлечь текст «(another) line of C code», а в идеале также номер строки, чтобы вывод выглядел так:
224 line of C code
225 another line of C code
Я пытался использовать BeautifulSoup, но он не выдаёт нужный результат; мой код выглядит так:
import codecs
import glob
import os
from itertools import islice
from ntpath import join

from bs4 import BeautifulSoup

# Extract "<line number>\t<source text>" for every source row of each GCOV
# HTML report and collect the result in Output.txt.
#
# Each report is parsed as a whole document instead of being scanned line by
# line. Matching raw markup strings is fragile (the reports use
# class="lineno", not "lineNo"), and a condition of the form
# `a or b in line` is always true for a non-empty `a`, so a fragment of
# every input line would end up in the output.

# Opening in "w" mode truncates Output.txt once; every report is then written
# through the same handle. The encoding is explicit so that non-ASCII source
# text does not depend on the platform default.
with open('Output.txt', 'w', encoding='utf-8') as outputFile:
    for filepath in glob.iglob('path/To/Reports/*.html'):
        # Read bytes and let BeautifulSoup detect the document encoding.
        with open(filepath, 'rb') as inputFile:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(inputFile, 'html.parser')

        for row in soup.find_all('tr'):
            lineno_cell = row.find('td', class_='lineno')
            # class_='src' matches both "src coveredLine" and
            # "src uncoveredLine" cells.
            src_cell = row.find('td', class_='src')
            if lineno_cell is None or src_cell is None:
                continue  # row without a line number or source cell
            lineno = lineno_cell.get_text(strip=True)
            src = src_cell.get_text().strip()
            outputFile.write(f"{lineno}\t{src}\n")

print("Done")
Но результат выглядит так
/* L a:li { colo text }
Что я делаю не так? Большое вам спасибо за любую помощь.
Ответ №1:
Ты можешь сделать вот так:
from bs4 import BeautifulSoup

# Sample rows from a GCOV HTML report: one covered and one uncovered line.
html = '''
<tr>
<td align="right" class="lineno"><pre>224</pre></td>
<td align="right" class="linebranch"><span class="takenBranch" title="Branch 1 taken 329 times">&check;</span><span class="notTakenBranch" title="Branch 2 not taken">&cross;</span><span class="notTakenBranch" title="Branch 4 not taken">&cross;</span><span class="takenBranch" title="Branch 5 taken 329 times">&check;</span><br/><span class="notTakenBranch" title="Branch 6 not taken">&cross;</span><span class="takenBranch" title="Branch 7 taken 329 times">&check;</span></td>
<td align="right" class="linecount coveredLine"><pre>329</pre></td>
<td align="left" class="src coveredLine"><pre> line of C code</pre></td>
</tr>
<tr>
<td align="right" class="lineno"><pre>225</pre></td>
<td align="right" class="linebranch"></td>
<td align="right" class="linecount uncoveredLine"><pre></pre></td>
<td align="left" class="src uncoveredLine"><pre> another line of C code;</pre></td>
</tr>
'''

# Print "<line number> <source text>" for each table row.
for tr in BeautifulSoup(html, 'html.parser').find_all('tr'):
    # The line number lives in the "lineno" cell; the source text lives in the
    # "src" cell (its class is "src coveredLine" or "src uncoveredLine", and a
    # class lookup matches any one of the cell's classes).
    lineno = tr.find('td', {'class': 'lineno'}).text.strip()
    src = tr.find('td', {'class': 'src'}).text.strip()
    print(lineno, src)