#python #web-scraping #beautifulsoup
Question:
I'm trying to scrape Yellow Pages for business information. So far everything has gone smoothly, but I just can't get the text inside the dd and dt elements on an individual business page. Could you please help me with this? Any suggestion is greatly appreciated. Thank you.
Here is my code. (I first go to the site and fetch the search results. Then I follow the link to each individual business page and parse what is there. The problem is that I can't get the information stored in the dd elements on the individual business page.)
from bs4 import BeautifulSoup as soup
import urllib.request
from urllib.request import urlopen as uReq
from urllib.request import Request, urlopen
from random import randint
import time
import socket

out_filename = "expeditors_in_NJ.csv"
headers = "business_name, business_type, business_website, business_phone, general_info, business_history, service_and_products, work_hours\n"

f = open(out_filename, "w")
f.write(headers)

for i in range(0, 50):
    # fetch one page of search results
    page_url = "https://www.yellowpages.com/search?search_terms=expeditors&geo_location_terms=NJ&page=" + str(i + 1) + "&sort=&source=real-user"
    req = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
    uClient = urlopen(req)
    page_soup = soup(uClient, "html.parser")
    uClient.close()

    containers = page_soup.findAll("div", {"class": "v-card"})
    for container in containers:
        business_name = container.find("a", {"class": "business-name"}).span.text
        link = str(container.find("a", {"class": "business-name"}).get('href'))
        container_url = "https://www.yellowbook.com" + link

        # fetch and parse the individual business page
        req_ = Request(container_url, headers={'User-Agent': 'Mozilla/5.0'})
        uClient = urlopen(req_)
        container_soup = soup(uClient, "html.parser")
        uClient.close()

        # dd elements on the detail page (this is the part that comes back empty)
        info_list = container_soup.findAll("dd")

        try:
            business_type = container.find("div", {"class": "categories"}).text
        except:
            business_type = str(None)
        try:
            years_in_business = str(container.find("div", {"class": "count"}).text)
        except:
            years_in_business = str(None)
        try:
            business_website = container.find("a", {"class": "track-visit-website"}).get('href')
        except:
            business_website = str(None)
        try:
            business_address = container.find("div", {"class": "street-address"}).text + " " + container.find("div", {"class": "locality"}).text
        except:
            business_address = str(None)
        try:
            business_phone = container.find("div", {"class": "phones phone primary"}).text
        except:
            business_phone = str(None)
        try:
            general_info = info_list[0].text
        except:
            general_info = str(None)
        try:
            work_hours = info_list[1].text
        except:
            work_hours = str(None)

        print("business name: " + business_name + "\n")
        print("business type: " + business_type + "\n")
        print("years_in_business: " + years_in_business + "\n")
        print("business_website: " + business_website + "\n")
        print("business_address: " + business_address + "\n")
        print("business_phone: " + business_phone + "\n")
        print("general_info: " + general_info + "\n")
        print("work_hours: " + work_hours + "\n")

        f.write(business_name.replace(",", "|") + ", " +
                business_type.replace(",", "|").replace("/", "|") + ", " +
                years_in_business.replace(",", "|").replace("/", "|") + ", " +
                business_website.replace(",", "|").replace("/", "|") + ", " +
                business_address.replace(",", "|").replace("/", "|") + ", " +
                business_phone.replace(",", "|").replace("/", "|") + ", " +
                general_info.replace(",", "|").replace("/", "|") +
                work_hours.replace(",", "|").replace("/", "|") +
                "\n")

f.close()
If you want to change the code heavily or do it in a completely different way, please add some explanation so that I can follow it. I'm new to programming. Thank you very much.
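
For reference, here is a minimal sketch of how dt/dd pairs in a definition list can be read with BeautifulSoup by walking each dt label and taking the dd that follows it; the HTML snippet below is a made-up stand-in, not the actual Yellow Pages markup:

from bs4 import BeautifulSoup

# Made-up detail-page snippet; the real page's markup will differ.
html = """
<dl>
  <dt>General Info</dt>
  <dd>Freight forwarding and customs brokerage.</dd>
  <dt>Hours</dt>
  <dd>Mon - Fri 9:00 am - 5:00 pm</dd>
</dl>
"""

detail_soup = BeautifulSoup(html, "html.parser")

# Walk the <dt> labels and grab the <dd> that immediately follows each one,
# so every value stays tied to its label instead of relying on a fixed index.
details = {}
for dt in detail_soup.find_all("dt"):
    dd = dt.find_next_sibling("dd")
    if dd is not None:
        details[dt.get_text(strip=True)] = dd.get_text(strip=True)

print(details)
# {'General Info': 'Freight forwarding and customs brokerage.', 'Hours': 'Mon - Fri 9:00 am - 5:00 pm'}

Pairing each dd with its dt label this way tends to be more robust than indexing info_list[0] and info_list[1], since the number and order of dd elements can vary from page to page.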
Answer #1:
import httpx
import trio
from bs4 import BeautifulSoup
import csv

# allow at most 6 requests to be in flight at the same time
limit = trio.CapacityLimiter(6)


async def scrape(client, item, sender):
    # fetch one business detail page and pull out website, phone and years in business
    async with limit, sender:
        r = await client.get(f'https://www.yellowpages.com{item[1]}')
        soup = BeautifulSoup(r.text, 'lxml')
        try:
            bw = soup.select_one('.primary-btn')['href']
        except (TypeError, AttributeError):
            bw = None
        try:
            phone = soup.select_one('p.phone').text
        except (TypeError, AttributeError):
            phone = None
        try:
            biy = soup.select_one('.number').text
        except AttributeError:
            biy = None
        result = [item[0], bw, biy, phone]
        print(result)
        # hand the finished row over to the CSV-writing task
        await sender.send(result)


async def worker(client, num, sender, nurse):
    # fetch one search-results page and start a scrape task per listing
    async with limit, sender:
        params = {
            "search_terms": "expeditors",
            "geo_location_terms": "NJ",
            "page": num,
            "sort": "",
            "source": "real-user"
        }
        r = await client.get('https://www.yellowpages.com/search', params=params)
        soup = BeautifulSoup(r.text, 'lxml')
        goal = [(i.span.text, i['href'])
                for i in soup.select('.business-name')]
        for what in goal:
            nurse.start_soon(scrape, client, what, sender.clone())


async def main():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
    }
    async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
        client.headers.update(headers)
        sender, receiver = trio.open_memory_channel(0)
        nurse.start_soon(rec, receiver)
        async with sender:
            # range(1, 2) scrapes only page 1; widen the range for more pages
            for item in range(1, 2):
                nurse.start_soon(worker, client, item, sender.clone(), nurse)


async def rec(receiver):
    # single consumer: receives rows from all scrape tasks and writes them to CSV
    with open('result.csv', 'w', buffering=1, newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Business Name', 'Website',
                         'Years In Business', 'Phone'])
        async with receiver:
            async for value in receiver:
                writer.writerow(value)


if __name__ == "__main__":
    trio.run(main)
Comments:
1. Interesting. It would probably be worth adding a couple of short explanatory notes.
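
As a short explanatory note on the answer: httpx fetches the pages asynchronously, trio.CapacityLimiter(6) caps how many requests run at once, and a zero-size memory channel passes each scraped row from the producer tasks to a single rec() task that writes the CSV; the loop in rec() ends once every cloned sender has been closed. Below is a stripped-down sketch of that producer/consumer pattern with toy data in place of the HTTP requests (the names and numbers here are illustrative):

import trio

limit = trio.CapacityLimiter(3)  # at most 3 producers running at once

async def produce(n, sender):
    # each producer holds a limiter slot and its own clone of the send channel
    async with limit, sender:
        await trio.sleep(0.1)          # stand-in for an HTTP request
        await sender.send(f"row {n}")  # hand the result to the consumer

async def consume(receiver):
    # the single consumer; the loop ends once every sender clone is closed
    async with receiver:
        async for row in receiver:
            print(row)                 # stand-in for csv.writer.writerow(row)

async def main():
    async with trio.open_nursery() as nurse:
        sender, receiver = trio.open_memory_channel(0)
        nurse.start_soon(consume, receiver)
        async with sender:
            for n in range(10):
                nurse.start_soon(produce, n, sender.clone())

trio.run(main)

The real scraper has the same shape: worker() and scrape() play the role of produce(), and rec() plays the role of consume().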