#python #web-scraping #xpath #valueerror
#python #веб-очистка #xpath #ошибка значения
Вопрос:
Недавно я написал 2-уровневый scraper для платформы выборов в Гонконге, и это сработало хорошо. Код позволяет мне извлекать информацию на уровне района. Код приведен ниже:
from typing import List
import requests
import csv
from lxml import etree
from urllib.parse import urljoin
class hongkongelection:
def __init__(self):
self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can/A.html'
def send_request(self, url):
r = requests.get(url)
if r.text:
html_result = r.text
print('get result la')
return html_result
else:
print('get result fail la')
return ''
def extract_info_urls(self, response):
raw_tree = etree.HTML(response)
platform_urls = raw_tree.xpath('//*[@id="table-district-member"]/tbody/tr/td[6]/div/a/@href|//*[@id="table-district-member"]/tbody/tr/td[4]/div/a/@href')
scraped_url = "https://www.elections.gov.hk/dc2019/eng/intro_to_can/H.html"
#self.pdf_url = "../../pdf/intro_to_can/A01_1_ENG.html"
platform_urls: List[str] = [urljoin(scraped_url, pdf_url) for pdf_url in platform_urls]
return platform_urls
def extract_info(self, platform_urls):
raw_tree = etree.HTML(platform_urls)
dict_result = {}
dict_result['namelist'] = raw_tree.xpath("//*[@id="main"]/p[2]/span[2]/text()")
dict_result['namelist'] = [x.replace("rn", "") for x in dict_result['namelist']]
dict_result['partylist'] = raw_tree.xpath("//*[@id="main"]/p[5]/span[2]/text()")
dict_result['partylist'] = [x.replace("rn", "") for x in dict_result['partylist']]
dict_result['message_list'] = raw_tree.xpath("//*[@id="main"]/p[8]/span/text()")
dict_result['message_list'] = [x.replace("rn", "") for x in dict_result['message_list']]
return dict_result
def save_information(self, raw_json):
with open('platform.csv', 'a ', encoding='UTF-8') as out_f:
csv_writer =csv.DictWriter(out_f, raw_json.keys())
if out_f.tell() == 0:
csv_writer.writeheader()
csv_writer.writerow(raw_json)
def run(self):
response = self.send_request(self.url)
platform_urls = self.extract_info_urls(response)
for url in platform_urls:
info_response = self.send_request(url)
raw_json =self.extract_info(info_response)
raw_json['platform_url'] = url
self.save_information(raw_json)
if __name__ == '__main__' :
runner = hongkongelection()
runner.run()
Тем не менее, поскольку я хотел улучшить свои навыки, я попытался вместо этого сделать 3-уровневый scraper. Я хотел очистить платформы всех политиков в 18 округах одновременно.
class hongkongelection:
def __init__(self):
self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can.html'
def send_request(self, url):
r = requests.get(url)
if r.text:
html_result = r.text
print('get result la')
return html_result
else:
print('get result fail la')
return ''
def extract_info_urls_district(self, response):
raw_tree = etree.HTML(response)
district_urls = raw_tree.xpath('//*[@id="content-area"]/table[2]/tr/td/div/ol/li/a/@href')
scraped_url_district = "https://www.elections.gov.hk/dc2019/eng/intro_to_can.html"
#pdf_url = "../eng/intro_to_can/A.html"
district_urls = [urljoin(scraped_url_district, pdf_url) for pdf_url in district_urls]
return district_urls
def extract_info_urls_platform(self, district_urls):
raw_tree = etree.HTML(district_urls)
platform_urls = raw_tree.xpath('//*[@id="table-district-member"]/tbody/tr/td[6]/div/a/@href|//*[@id="table-district-member"]/tbody/tr/td[4]/div/a/@href')
scraped_url = "https://www.elections.gov.hk/dc2019/eng/intro_to_can/H.html"
#pdf_url = "../../pdf/intro_to_can/A01_1_ENG.html"
platform_urls: list[str] = [urljoin(scraped_url, pdf_url) for pdf_url in platform_urls]
return platform_urls
def extract_info(self, platform_urls):
raw_tree = etree.HTML(platform_urls)
dict_result = {}
dict_result['namelist'] = raw_tree.xpath("//*[@id="main"]/p[2]/span[2]/text()")
dict_result['namelist'] = [x.replace("rn", "") for x in dict_result['namelist']]
dict_result['partylist'] = raw_tree.xpath("//*[@id="main"]/p[5]/span[2]/text()")
dict_result['partylist'] = [x.replace("rn", "") for x in dict_result['partylist']]
dict_result['message_list'] = raw_tree.xpath("//*[@id="main"]/p[8]/span/text()")
dict_result['message_list'] = [x.replace("rn", "") for x in dict_result['message_list']]
return dict_result
def save_information(self, raw_json):
with open('platform.csv', 'a ', encoding='UTF-8') as out_f:
csv_writer =csv.DictWriter(out_f, raw_json.keys())
if out_f.tell() == 0:
csv_writer.writeheader()
csv_writer.writerow(raw_json)
def run(self):
response = self.send_request(self.url)
district_urls = self.extract_info_urls_district(response)
platform_urls = self.extract_info_urls_platform(district_urls)
for url in platform_urls:
info_response = self.send_request(url)
raw_json =self.extract_info(info_response)
raw_json['platform_url'] = url
self.save_information(raw_json)
if __name__ == '__main__' :
runner = hongkongelection()
runner.run()
Но это не удалось. Интересно, что я сделал не так.
Полная трассировка:
Traceback (most recent call last):
File "C:Program FilesJetBrainsPyCharm Community Edition 2020.3.2pluginspython-cehelperspydevpydevd.py", line 1477, in _exec
pydev_imports.execfile(file, globals, locals) # execute the script
File "C:Program FilesJetBrainsPyCharm Community Edition 2020.3.2pluginspython-cehelperspydev_pydev_imps_pydev_execfile.py", line 18, in execfile
exec(compile(contents "n", file, 'exec'), glob, loc)
File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 71, in <module>
runner.run()
File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 61, in run
platform_urls = self.extract_info_urls_platform(district_urls)
File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 31, in extract_info_urls_platform
raw_tree = etree.HTML(district_urls)
File "src/lxml/etree.pyx", line 3185, in lxml.etree.HTML
File "src/lxml/parser.pxi", line 1895, in lxml.etree._parseMemoryDocument
ValueError: can only parse strings
Я ценю вашу помощь и время — с нетерпением жду возможности поучиться у этого удивительного сообщества!
Ответ №1:
Вы пытались очистить содержимое напрямую, используя анализатор lxml, даже не отправляя запросы. Я внес некоторые изменения в ваши xpath, в которых не было необходимости. Я также использовал generator, чтобы сделать его эффективным. Обязательно добавьте этот save_information
метод в скрипт, так как мне пришлось его удалить, чтобы посмотреть, что происходит:
import csv
import time
import random
import requests
from lxml import etree
from typing import List
from urllib.parse import urljoin
class hongkongelection(object):
def __init__(self):
self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can.html'
def send_request(self, url):
r = requests.get(url)
r.raise_for_status()
return r.text
def extract_info_urls_district(self, url):
res = self.send_request(url)
raw_tree = etree.HTML(res)
for pdf_url in raw_tree.xpath('//a[contains(@href,"/intro_to_can/")]/@href'):
yield urljoin(url,pdf_url)
def extract_info_urls_platform(self, url):
res = self.send_request(url)
raw_tree = etree.HTML(res)
for pdf_url in raw_tree.xpath('//*[@id="table-district-member"]//a[contains(@href,"/pdf/intro_to_can/") and contains(.,"Text")]/@href'):
yield urljoin(url,pdf_url)
def extract_info(self, url):
res = self.send_request(url)
raw_tree = etree.HTML(res)
dict_result = {}
dict_result['namelist'] = raw_tree.xpath("//*[@id="main"]/p[2]/span[2]/text()")
dict_result['namelist'] = [x.replace("rn", "") for x in dict_result['namelist']]
dict_result['partylist'] = raw_tree.xpath("//*[@id="main"]/p[5]/span[2]/text()")
dict_result['partylist'] = [x.replace("rn", "") for x in dict_result['partylist']]
dict_result['message_list'] = raw_tree.xpath("//*[@id="main"]/p[8]/span/text()")
dict_result['message_list'] = [x.replace("rn", "") for x in dict_result['message_list']]
return dict_result
def run(self):
for district_url in self.extract_info_urls_district(self.url):
for url in self.extract_info_urls_platform(district_url):
raw_json = self.extract_info(url)
raw_json['platform_url'] = url
print(raw_json)
time.sleep(random.randint(3,8))
if __name__ == '__main__' :
runner = hongkongelection()
runner.run()
Комментарии:
1. Большое вам спасибо!! это сработало отлично, и я многому научился у вас. Очень ценю вашу доброту за то, что вы изучили проблему для меня 🙂 один дополнительный вопрос: есть ли способ установить временную задержку в скрипте, чтобы это было сделано этническим способом? (он же не хочет слишком сильно нагружать сервер веб-сайта)