Python BeautifulSoup и производительность многопоточности

#python #multithreading #performance #beautifulsoup

#python #многопоточность #Производительность #beautifulsoup

Вопрос:

Я разбираю веб-страницы с помощью BeautifulSoup4 и многопоточности. Каждый поток берёт задания (URL-адреса) из очереди и вызывает parse_results_page, который выполняет HTTP-запрос (вызов get_html_page_source()), а затем разбирает каждый элемент страницы в цикле for.

Что я заметил: при 5 потоках разбор одной страницы занимает от 0,7 до 8 секунд*. При 50 потоках разбор одной страницы занимает от 25 до 60 секунд*. Даже при 10 потоках время выполнения заметно возрастает.

* Время измеряется после завершения HTTP-запроса, поэтому разница не связана с пропускной способностью моего канала или временем отклика сервера.

Кроме того, наличие 50 потоков увеличивает использование памяти до 6 ГБ.

Может кто-нибудь объяснить, почему это происходит и как я могу оптимизировать свой код для большего количества потоков?

Ниже приведен мой код. Спасибо

     def parse_item(self, item, *args):
         """Parse one listing element into a flat record dict.

         @item - element exposing the BeautifulSoup ``Tag`` API
                 (``find``, ``get_text``, attribute/item access)
         @args - positional extras; ``args[0]`` is stored as "Category"
                 and ``args[1]`` as "Subcategory" (IndexError if absent)

         Returns a dict describing the listing, or None when the item
         must be skipped: it has no heading link, or its URL contains
         'redirect' or '/external/url'.

         Every field other than the URL is optional; a missing element
         yields None for that field ('' for "DatePosted").

         Raises ValueError when the date text is neither one of the
         relative Dutch words handled below nor in "%d %b '%y" form.
         NOTE(review): ``%b`` matches month abbreviations of the current
         locale, so absolute dates only parse if the locale agrees with
         the page language - confirm against real data.
         """
         heading = item.find('h2', class_="heading")
         link = heading.a if heading is not None else None
         if link is None:
             # Without a heading link there is no URL for the record.
             return None

         # Drop the query string that follows ".html" so the same listing
         # always yields the same URL.
         item_url = re.sub(r'\.html\?.*$', '.html', link['href'])
         if 'redirect' in item_url or '/external/url' in item_url:
             return None

         title_tag = item.find('span', class_="mp-listing-title")
         item_title = None
         if title_tag is not None:
             item_title = title_tag.get_text().strip()

         description = item.find('div', class_="listing-title-description")
         if description is not None:
             # The nested <h2> is removed so its text does not end up in
             # the description; it may legitimately be absent.
             if description.h2 is not None:
                 description.h2.extract()
             # split()/join collapses every whitespace run to one space.
             description = ' '.join(description.get_text().split())

         price, currency = None, None
         price_tag = item.find('span', class_="price")
         if price_tag is not None:
             # "1.250,00" -> "1250.00": remove '.' characters, then turn
             # the ',' into '.'.
             raw_price = price_tag.get_text().strip()
             raw_price = raw_price.replace('.', '').replace(',', '.')
             # Anything that is not "<known symbol> <amount>" is stored
             # verbatim with no currency.
             price = raw_price
             pieces = raw_price.split()
             if len(pieces) == 2:
                 try:
                     currency = self.currency_code[pieces[0]]
                 except KeyError:
                     currency = None
                 else:
                     price = pieces[1]

         seller_id = None
         seller_box = item.find('div', class_="seller-name")
         seller_link = None
         if seller_box is not None:
             seller_link = seller_box.find('a')
         if seller_link is not None:
             # The id is the last path segment of the seller link, minus
             # the ".html" suffix.
             seller_id = seller_link['href'].split('/')[-1]
             seller_id = seller_id.replace('.html', '')

         city, region = None, None
         location = item.find('div', class_='location-name')
         if location is not None:
             places = [part.strip()
                       for part in location.get_text().split(',')]
             city = places[0]
             region = places[1] if len(places) > 1 else None

         # One clock reading keeps "DatePosted" and "DateSaved" consistent
         # even when the call straddles midnight.
         now = datetime.datetime.now()
         today = now.date()

         date_posted = ''
         date_tag = item.find('div', class_="date")
         if date_tag is not None:
             date_string = date_tag.get_text().strip()
             if 'Vandaag' in date_string:
                 posted = today
             elif 'Gisteren' in date_string:
                 posted = today - datetime.timedelta(1)
             elif 'Eergisteren' in date_string:
                 posted = today - datetime.timedelta(2)
             else:
                 # Dots are removed before parsing (e.g. "12 jan. '15").
                 posted = datetime.datetime.strptime(
                     date_string.replace('.', ''), "%d %b '%y")
             date_posted = posted.strftime('%Y-%m-%d')

         return {"Url": item_url,
                 'Category': args[0],
                 "AdTitle": item_title,
                 "Subcategory": args[1],
                 "Description": description,
                 "Currency": currency,
                 "Price": price,
                 "SellerId": seller_id,
                 "SellerPhone": None,
                 "SellerWebsite": None,
                 "SellerEmail": None,
                 "SellerCity": city,
                 "SellerCountry": None,
                 "SellerRegion": region,
                 "DatePosted": date_posted,
                 "DateSaved": now.strftime('%Y-%m-%d %H:%M:%S')}

def parse_results_page(self, url, queue):
    """Fetch a results page and enqueue one record per listing on it.

    @url - string, passed to ``self.get_html_page_source``
    @queue - output queue; each parsed record is handed to ``queue.put``

    Returns None. Nothing is enqueued when the page source is empty.
    Items for which ``self.parse_item`` returns None (skipped listings)
    are not enqueued, so consumers only ever receive records.
    """
    page = self.get_html_page_source(url)
    if not page:
        return

    # NOTE(review): the BeautifulSoup documentation lists html5lib as its
    # slowest parser backend. Parsing is CPU-bound Python work, so adding
    # threads is unlikely to speed it up - consider a faster backend or
    # process-based workers if parse time dominates.
    # The name is rebound so the raw markup can be released once parsed.
    page = BeautifulSoup(page, "html5lib")

    # Category and subcategory come from the first two breadcrumb spans;
    # either is None when the page does not provide it.
    trail = page.find('ul', class_="breadcrumbs")
    crumbs = trail.find_all('span') if trail is not None else []
    category = crumbs[0].get_text().strip() if len(crumbs) > 0 else None
    subcategory = crumbs[1].get_text().strip() if len(crumbs) > 1 else None

    for item in page.find_all('article'):
        record = self.parse_item(item, category, subcategory)
        if record is not None:
            queue.put(record)

Вопрос:

Вам также может понравиться

Навигация Vaadin

Python: Есть ли какое-либо правило, которое может защитить некоторые слова, не подлежащие обработке на этапе предварительной обработки данных

Передача большого объема данных на aspx-сайт и получение обратно изображения