#python #html #web-scraping #html-tableextract
#python #HTML #веб-скрейпинг #html-tableextract
Вопрос:
Вот мой код, он работает нормально.
from io import StringIO

import pandas as pd

# Sample markup: one table with a header row and a single data row whose
# last cell holds an <audio> element with two <source> children.
html_data = """<table id="example" class="table table-hover dataTable no-footer" role="grid" aria-describedby="example_info">
<thead>
<tr role="row"><th class="sorting_desc" tabindex="0" aria-controls="example" rowspan="1" colspan="1" aria-sort="descending" aria-label="Start Date/Time: activate to sort column ascending">Start Date/Time</th><th class="sorting" tabindex="0" aria-controls="example" rowspan="1" colspan="1" aria-label="End Date/Time: activate to sort column ascending">End Date/Time</th><th class="sorting" tabindex="0" aria-controls="example" rowspan="1" colspan="1" aria-label="Caller Name: activate to sort column ascending">Caller Name</th><th class="sorting" tabindex="0" aria-controls="example" rowspan="1" colspan="1" aria-label="Caller Number: activate to sort column ascending">Caller Number</th><th class="sorting" tabindex="0" aria-controls="example" rowspan="1" colspan="1" aria-label="Callee: activate to sort column ascending">Callee</th><th class="sorting" tabindex="0" aria-controls="example" rowspan="1" colspan="1" aria-label="Used Mins.: activate to sort column ascending">Used Mins.</th><th class="text-center sorting_disabled" rowspan="1" colspan="1" aria-label="File">File</th></tr>
</thead>
<tbody>
<tr role="row" class="odd"><td class="sorting_1">2020-11-27 12:50:23</td><td>2020-11-27 12:51:04</td><td>ABC 3</td><td>7111</td><td>923333222</td><td>1</td><td class=" text-center"><audio controls="">
<source src="../record_files_out/3/2020/oc_1.wav.wav" type="audio/ogg">
<source src="../record_files_out/358/2020-11-27/oc_1934553_358.wav.wav" type="audio/mpeg">
Your browser does not support the audio element.
</audio></td></tr></tbody>
</table>
"""

# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases; a file-like object is
# accepted by old and new versions alike.
# read_html returns one DataFrame per <table>; only the first is printed.
# Only the text of each cell is kept, so the <audio> cell comes out as its
# fallback sentence rather than the <source src="..."> paths.
tables = pd.read_html(StringIO(html_data))
print(tables[0].to_csv(index=False, header=True))
Вот вывод
2020-11-27 12:50:23,2020-11-27 12:51:04,ABC 3,7111,923333222,1,Your browser does not support the audio element.
Но я хочу извлечь
../record_files_out/3/2020/oc_1.wav.wav
Вместо этого
Your browser does not support the audio element.
Ответ №1:
Рекомендую попробовать такой вариант:
# Convert the first <table> of an HTML file into a CSV file.
#
# The first <tr> of the table supplies the column names; every following
# <tr> becomes one data row. Only the text of each cell is extracted.
import os
import sys

import pandas as pd
from bs4 import BeautifulSoup

path = 'html.html'

# Parse inside a context manager so the file handle is closed promptly.
with open(path) as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')

# Look the table up once; raises IndexError when the document has no <table>.
table = soup.find_all("table")[0]

# Header cells come from the first row. Restricting the search to direct
# <th>/<td> children skips whitespace text nodes between cells, which would
# otherwise show up as spurious columns.
header = table.find("tr")
list_header = [
    cell.get_text() for cell in header.find_all(["th", "td"], recursive=False)
]

# Every row after the header row is data.
HTML_data = table.find_all("tr")[1:]
data = [
    [cell.get_text() for cell in row.find_all(["th", "td"], recursive=False)]
    for row in HTML_data
]

# Store the rows in a DataFrame and write it out as CSV.
dataFrame = pd.DataFrame(data=data, columns=list_header)
dataFrame.to_csv('Geeks.csv')
Комментарии:
1. Та же проблема: я тоже хочу извлечь «../record_files_out/3/2020/oc_1.wav.wav».