Problem with web scraper: can only parse strings

#python #web-scraping #xpath #valueerror

Question:

I recently wrote a two-level scraper for Hong Kong election platforms, and it worked well: it lets me extract candidate information at the district level. The code is below:

 from typing import List
import requests
import csv
from lxml import etree
from urllib.parse import urljoin


class hongkongelection:
    def __init__(self):
        self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can/A.html'

    def send_request(self, url):
        r = requests.get(url)
        if r.text:
            html_result = r.text
            print('get result la')
            return html_result
        else:
            print('get result fail la')
            return ''

    def extract_info_urls(self, response):
        raw_tree = etree.HTML(response)
        platform_urls = raw_tree.xpath('//*[@id="table-district-member"]/tbody/tr/td[6]/div/a/@href|//*[@id="table-district-member"]/tbody/tr/td[4]/div/a/@href')
        scraped_url = "https://www.elections.gov.hk/dc2019/eng/intro_to_can/H.html"
        #self.pdf_url = "../../pdf/intro_to_can/A01_1_ENG.html"
        platform_urls: List[str] = [urljoin(scraped_url, pdf_url) for pdf_url in platform_urls]
        return platform_urls

    def extract_info(self, platform_urls):
        raw_tree = etree.HTML(platform_urls)
        dict_result = {}
        dict_result['namelist'] = raw_tree.xpath('//*[@id="main"]/p[2]/span[2]/text()')
        dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
        dict_result['partylist'] = raw_tree.xpath('//*[@id="main"]/p[5]/span[2]/text()')
        dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
        dict_result['message_list'] = raw_tree.xpath('//*[@id="main"]/p[8]/span/text()')
        dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
        return dict_result


    def save_information(self, raw_json):
        with open('platform.csv', 'a', encoding='UTF-8') as out_f:
            csv_writer = csv.DictWriter(out_f, raw_json.keys())
            if out_f.tell() == 0:
                csv_writer.writeheader()

            csv_writer.writerow(raw_json)

    def run(self):
        response = self.send_request(self.url)
        platform_urls = self.extract_info_urls(response)
        for url in platform_urls:
            info_response = self.send_request(url)
            raw_json = self.extract_info(info_response)
            raw_json['platform_url'] = url
            self.save_information(raw_json)



if __name__ == '__main__':
    runner = hongkongelection()
    runner.run()
 

However, since I wanted to improve my skills, I tried to turn it into a three-level scraper instead, so that I could scrape the platforms of the politicians in all 18 districts in one go.

 class hongkongelection:
    def __init__(self):
        self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can.html'

    def send_request(self, url):
        r = requests.get(url)
        if r.text:
            html_result = r.text
            print('get result la')
            return html_result
        else:
            print('get result fail la')
            return ''

    def extract_info_urls_district(self, response):
        raw_tree = etree.HTML(response)
        district_urls = raw_tree.xpath('//*[@id="content-area"]/table[2]/tr/td/div/ol/li/a/@href')
        scraped_url_district = "https://www.elections.gov.hk/dc2019/eng/intro_to_can.html"
        #pdf_url = "../eng/intro_to_can/A.html"
        district_urls = [urljoin(scraped_url_district, pdf_url) for pdf_url in district_urls]
        return district_urls

    def extract_info_urls_platform(self, district_urls):
        raw_tree = etree.HTML(district_urls)
        platform_urls = raw_tree.xpath('//*[@id="table-district-member"]/tbody/tr/td[6]/div/a/@href|//*[@id="table-district-member"]/tbody/tr/td[4]/div/a/@href')
        scraped_url = "https://www.elections.gov.hk/dc2019/eng/intro_to_can/H.html"
        #pdf_url = "../../pdf/intro_to_can/A01_1_ENG.html"
        platform_urls: list[str] = [urljoin(scraped_url, pdf_url) for pdf_url in platform_urls]
        return platform_urls

    def extract_info(self, platform_urls):
        raw_tree = etree.HTML(platform_urls)
        dict_result = {}
        dict_result['namelist'] = raw_tree.xpath('//*[@id="main"]/p[2]/span[2]/text()')
        dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
        dict_result['partylist'] = raw_tree.xpath('//*[@id="main"]/p[5]/span[2]/text()')
        dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
        dict_result['message_list'] = raw_tree.xpath('//*[@id="main"]/p[8]/span/text()')
        dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
        return dict_result


    def save_information(self, raw_json):
        with open('platform.csv', 'a', encoding='UTF-8') as out_f:
            csv_writer = csv.DictWriter(out_f, raw_json.keys())
            if out_f.tell() == 0:
                csv_writer.writeheader()

            csv_writer.writerow(raw_json)

    def run(self):
        response = self.send_request(self.url)
        district_urls = self.extract_info_urls_district(response)
        platform_urls = self.extract_info_urls_platform(district_urls)
        for url in platform_urls:
            info_response = self.send_request(url)
            raw_json = self.extract_info(info_response)
            raw_json['platform_url'] = url
            self.save_information(raw_json)


if __name__ == '__main__':
    runner = hongkongelection()
    runner.run()
 

But it failed, and I wonder what I did wrong.

Full traceback:

 Traceback (most recent call last):
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.3.2\plugins\python-ce\helpers\pydev\pydevd.py", line 1477, in _exec
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.3.2\plugins\python-ce\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents + "\n", file, 'exec'), glob, loc)
  File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 71, in <module>
    runner.run()
  File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 61, in run
    platform_urls = self.extract_info_urls_platform(district_urls)
  File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 31, in extract_info_urls_platform
    raw_tree = etree.HTML(district_urls)
  File "src/lxml/etree.pyx", line 3185, in lxml.etree.HTML
  File "src/lxml/parser.pxi", line 1895, in lxml.etree._parseMemoryDocument
ValueError: can only parse strings
 

I appreciate your help and time, and I look forward to learning from this amazing community!

Answer #1:

You tried to parse the content directly with the lxml parser without ever sending the requests: extract_info_urls_platform receives the list of URLs built by extract_info_urls_district and passes that list straight to etree.HTML(), which accepts only markup strings, hence the ValueError. I made some changes to your xpaths, which were unnecessarily rigid, and used a generator to keep things efficient. Be sure to add your save_information method back into the script, as I had to remove it to see what was going on.
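
A minimal snippet that reproduces the error (the example URLs here are made up purely for illustration):

 from lxml import etree

urls = ['https://example.com/a.html', 'https://example.com/b.html']
etree.HTML(urls)     # ValueError: can only parse strings - a list is not markup
etree.HTML(urls[0])  # no crash, but this parses the URL text itself, not the page behind it
 

So every URL has to be fetched first, and only the response text handed to the parser. Here is the reworked script: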

 import csv
import time
import random
import requests
from lxml import etree
from typing import List
from urllib.parse import urljoin

class hongkongelection(object):

    def __init__(self):
        self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can.html'

    def send_request(self, url):
        r = requests.get(url)
        r.raise_for_status()
        return r.text

    def extract_info_urls_district(self, url):
        res = self.send_request(url)
        raw_tree = etree.HTML(res)
        for pdf_url in raw_tree.xpath('//a[contains(@href,"/intro_to_can/")]/@href'):
            yield urljoin(url,pdf_url)

    def extract_info_urls_platform(self, url):
        res = self.send_request(url)
        raw_tree = etree.HTML(res)
        for pdf_url in raw_tree.xpath('//*[@id="table-district-member"]//a[contains(@href,"/pdf/intro_to_can/") and contains(.,"Text")]/@href'):
            yield urljoin(url,pdf_url) 

    def extract_info(self, url):
        res = self.send_request(url)
        raw_tree = etree.HTML(res)
        dict_result = {}
        dict_result['namelist'] = raw_tree.xpath('//*[@id="main"]/p[2]/span[2]/text()')
        dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
        dict_result['partylist'] = raw_tree.xpath('//*[@id="main"]/p[5]/span[2]/text()')
        dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
        dict_result['message_list'] = raw_tree.xpath('//*[@id="main"]/p[8]/span/text()')
        dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
        return dict_result

    def run(self):
        for district_url in self.extract_info_urls_district(self.url):
            for url in self.extract_info_urls_platform(district_url):
                raw_json = self.extract_info(url)
                raw_json['platform_url'] = url
                print(raw_json)
            time.sleep(random.randint(3,8))


if __name__ == '__main__':
    runner = hongkongelection()
    runner.run()
 
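
If you want the results written to a CSV again, here is a minimal sketch of how the save_information method from the question could be plugged back in, with the same columns as before (adding newline='' so the csv module controls line endings is my suggestion, not part of the original):

 # Inside the hongkongelection class: persist each result instead of printing it.
    def save_information(self, raw_json):
        # newline='' is assumed here so the csv module can manage row endings itself
        with open('platform.csv', 'a', encoding='UTF-8', newline='') as out_f:
            csv_writer = csv.DictWriter(out_f, raw_json.keys())
            if out_f.tell() == 0:  # empty file: write the header row first
                csv_writer.writeheader()
            csv_writer.writerow(raw_json)

    def run(self):
        for district_url in self.extract_info_urls_district(self.url):
            for url in self.extract_info_urls_platform(district_url):
                raw_json = self.extract_info(url)
                raw_json['platform_url'] = url
                self.save_information(raw_json)  # was: print(raw_json)
            time.sleep(random.randint(3, 8))
 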

Comments:

1. Thank you so much!! It worked perfectly, and I learned a lot from you. I really appreciate your kindness in looking into the problem for me 🙂 One follow-up question: is there a way to set a time delay in the script so that it is done in an ethical way? (i.e. so it doesn't put too much load on the website's server)
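
Note that run() in the answer above already pauses for 3 to 8 seconds between districts via time.sleep(random.randint(3, 8)). If you also want a pause before every individual request, one option is to move the delay into send_request; this is just a sketch, and the 1-3 second range is an arbitrary choice, not from the original answer (time and random are already imported at the top of the script):

     def send_request(self, url):
        # Sleep 1-3 seconds before each request to be gentle on the server
        # (the exact range is illustrative).
        time.sleep(random.uniform(1, 3))
        r = requests.get(url)
        r.raise_for_status()
        return r.text
 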