Web scraping with python BeautifulSoup. How do I scrape the text inside dd and dt elements?

#python #web-scraping #beautifulsoup

Question:

I am trying to scrape Yellow Pages for business information. So far everything has gone smoothly, but I just cannot get the text inside the dd and dt elements on an individual business page. Could you please help me with this? Every suggestion is greatly appreciated. Thank you.

Here is my code. (I first go to the site and get the search results. Then I follow the link to each individual business page and parse what is there. The problem is that I cannot get the information stored in the dd elements on the individual business page.)

from bs4 import BeautifulSoup as soup
import urllib.request
from urllib.request import urlopen as uReq
from urllib.request import Request, urlopen
from random import randint
import time
import socket

out_filename = "expeditors_in_NJ.csv"
headers = "business_name, business_type, business_website, business_phone, general_info, business_history, service_and_products, work_hours" + "\n"
f = open(out_filename, "w")
f.write(headers)

for i in range(0,50):
    page_url = "https://www.yellowpages.com/search?search_terms=expeditors&geo_location_terms=NJ&page=" + str(i+1) + "&sort=&source=real-user"
    req = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
    uClient = urlopen(req)
    page_soup = soup(uClient, "html.parser")
    uClient.close()
    containers = page_soup.findAll("div", {"class": "v-card"})
    for container in containers: 
        business_name = container.find("a", {"class":"business-name"}).span.text
        link = str(container.find("a",{"class":"business-name"}).get('href'))
        container_url = "https://www.yellowbook.com" + link
        req_ = Request(container_url, headers={'User-Agent': 'Mozilla/5.0'})
        uClient = urlopen(req_)
        container_soup = soup(uClient, "html.parser")
        uClient.close()
        info_list = container_soup.findAll("dd")
        
        try:
            business_type = container.find("div",{"class":"categories"}).text
        except:
            business_type = str(None)
        try: 
            years_in_business = str(container.find("div",{"class":"count"}).text)
        except: 
            years_in_business = str(None)
        try: 
            business_website = container.find("a",{"class":"track-visit-website"}).get('href')
        except: 
            business_website = str(None)
        try:
            business_address = container.find("div",{"class":"street-address"}).text + " " + container.find("div",{"class":"locality"}).text
        except:
            business_address = str(None)
        try: 
            business_phone = container.find("div",{"class":"phones phone primary"}).text
        except: 
            business_phone = str(None)
        try:
            general_info = info_list[0].text
        except:
            general_info = str(None)
        try:
            work_hours = info_list[1].text
        except:
            work_hours = str(None)
        
        print("business name: " + business_name + "\n")
        print("business type: " + business_type + "\n")
        print("years_in_business: " + years_in_business + "\n")
        print("business_website: " + business_website + "\n")
        print("business_address: " + business_address + "\n")
        print("business_phone: " + business_phone + "\n")
        print("general_info: " + general_info + "\n")
        print("work_hours: " + work_hours + "\n")

        
        f.write(business_name.replace(",", "|") + ", " +
                business_type.replace(",", "|").replace("/", "|") + ", " +
                years_in_business.replace(",", "|").replace("/", "|") + ", " +
                business_website.replace(",", "|").replace("/", "|") + ", " +
                business_address.replace(",", "|").replace("/", "|") + ", " +
                business_phone.replace(",", "|").replace("/", "|") + ", " +
                general_info.replace(",", "|").replace("/", "|") +
                work_hours.replace(",", "|").replace("/", "|") +
                "\n")

f.close()

 

If you would like to change the code substantially or do it in a completely different way, please add some explanation so that I can follow along. I am new to programming. Thank you very much.
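On the dd/dt question specifically: a common BeautifulSoup approach is to iterate over the dt elements of the detail page and pair each one with the dd that follows it via find_next_sibling. The sketch below uses a made-up HTML snippet (the real Yellow Pages markup may differ), so treat it as an illustration of the pairing rather than working scraper code:

from bs4 import BeautifulSoup

# Hypothetical snippet standing in for a business detail page.
html_text = """
<dl>
  <dt>General Info</dt><dd>Freight expediting and customs brokerage.</dd>
  <dt>Hours</dt><dd>Mon - Fri 9:00 am - 5:00 pm</dd>
</dl>
"""

detail_soup = BeautifulSoup(html_text, "html.parser")
details = {}
for dt in detail_soup.find_all("dt"):
    dd = dt.find_next_sibling("dd")          # the value that belongs to this label
    if dd is not None:
        details[dt.get_text(strip=True)] = dd.get_text(strip=True)

print(details)
# {'General Info': 'Freight expediting and customs brokerage.', 'Hours': 'Mon - Fri 9:00 am - 5:00 pm'}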

Answer #1:

import httpx
import trio
from bs4 import BeautifulSoup
import csv

# Allow at most 6 requests in flight at the same time.
limit = trio.CapacityLimiter(6)


async def scrape(client, item, sender):
    # Fetch one business detail page and extract the fields we need.
    async with limit, sender:
        r = await client.get(f'https://www.yellowpages.com{item[1]}')
        soup = BeautifulSoup(r.text, 'lxml')

        try:
            bw = soup.select_one('.primary-btn')['href']
        except (TypeError, AttributeError):
            bw = None
        try:
            phone = soup.select_one('p.phone').text
        except (TypeError, AttributeError):
            phone = None
        try:
            biy = soup.select_one('.number').text
        except AttributeError:
            biy = None

        result = [item[0], bw, biy, phone]
        print(result)
        await sender.send(result)


async def worker(client, num, sender, nurse):
    # Fetch one search-results page and start a scrape task for every business found.
    async with limit, sender:
        params = {
            "search_terms": "expeditors",
            "geo_location_terms": "NJ",
            "page": num,
            "sort": "",
            "source": "real-user"
        }
        r = await client.get('https://www.yellowpages.com/search', params=params)
        soup = BeautifulSoup(r.text, 'lxml')
        goal = [(i.span.text, i['href'])
                for i in soup.select('.business-name')]
        for what in goal:
            nurse.start_soon(scrape, client, what, sender.clone())


async def main():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
    }
    async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
        client.headers.update(headers)

        # Workers send result rows into this channel; rec() drains it and writes the CSV.
        sender, receiver = trio.open_memory_channel(0)
        nurse.start_soon(rec, receiver)

        async with sender:
            for item in range(1, 2):
                nurse.start_soon(worker, client, item, sender.clone(), nurse)


async def rec(receiver):
    # Receive rows as they arrive; the async-for loop ends once every sender clone is closed.
    with open('result.csv', 'w', buffering=1, newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Business Name', 'Website',
                        'Years In Business', 'Phone'])
        async with receiver:
            async for value in receiver:
                writer.writerow(value)

if __name__ == "__main__":
    trio.run(main)
 
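A short note on how the pieces above fit together: worker fetches one search-results page and starts a scrape task for each business link it finds, scrape fetches the detail page, and every result row travels through a trio memory channel to rec, which writes the CSV as rows arrive. The stripped-down sketch below (with hypothetical producer/consumer names) shows that channel pattern in isolation:

import trio

async def producer(n, sender):
    async with sender:                  # close this clone when the task finishes
        await sender.send(f"row {n}")

async def consumer(receiver):
    async with receiver:
        async for row in receiver:      # stops once every sender clone is closed
            print(row)

async def main():
    async with trio.open_nursery() as nursery:
        sender, receiver = trio.open_memory_channel(0)
        nursery.start_soon(consumer, receiver)
        async with sender:              # hand out clones, then close the original
            for n in range(3):
                nursery.start_soon(producer, n, sender.clone())

trio.run(main)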

Comments:

1. Interesting. It would probably be worth adding a couple of short explanatory notes.