#python #beautifulsoup
#python #beautifulsoup
Вопрос:
Файлы csv повторяют одну и ту же информацию. Мне нужна обновленная информация, отправленная на лист Excel из трех встроенных источников. Есть код, который связывает ссылку href с базовой страницы со страницами обзора продукта. Кроме того, при проверке выходных данных из пакета spyder / anaconda python все было закодировано правильно. Таким образом, проблема должна быть в функции записи. Кто-нибудь может, пожалуйста, помочь?
Я пытался переформатировать код, используя режимы открытия файла 'a' и 'w'/'w+', но, похоже, ничего не работает.
"""Scrape product pages and their user reviews, then write two CSV reports.

This is the asker's original script with syntax repaired: every string
concatenation was missing the '+' operator, and indentation was lost.
The logical structure (including its flaw) is preserved — see NOTE below.
"""
import requests as r
from bs4 import BeautifulSoup
import csv

# Get URL
main_url = 'http://drd.ba.ttu.edu/isqs6339/imbadproducts/'
response = r.get(main_url)

# Set filepaths for the two CSV outputs
filepath = 'dataout1.csv'
filepath2 = 'dataout2.csv'

# Check for good link and get headers
print(response.status_code)
print(response.headers)

soup = BeautifulSoup(response.text, 'lxml')
print(soup.prettify())

# Find all product anchors inside the search-results div
search_results = soup.find('div', attrs={'id': 'searchresults'})
product_results = search_results.find_all('a')

# For every product link: collect id/title/price/description, fetch the
# product page, and print each review found on it.
for link in product_results:
    link_url = main_url + link.get('href')  # was 'main_url link.get(...)' — missing '+'
    productId = link.find('span', attrs={'class': 'productid'}).text
    product_title = link.find('span', attrs={'class': 'producttitle'}).text
    product_price = link.find('span', attrs={'class': 'productprice'}).text
    product_description = link.find('span', attrs={'class': 'productdesc'}).text

    # Get the product's detail page and parse its reviews
    response2 = r.get(link_url)
    soup2 = BeautifulSoup(response2.text, 'lxml')
    user_review = soup2.find('div', attrs={'id': 'userreviews'})
    review_results = user_review.find_all('div')

    # Print author, stars, and review text for each review of this product
    for rev in review_results:
        print('ProductID: ' + productId)
        print('Product Title: ' + product_title)
        print('Product Price: ' + product_price)
        print('Product Description: ' + product_description)
        print('User Review: ')
        author = rev.find('span', attrs={'class': 'rauthor'}).text
        print('Author: ' + author)
        stars = rev.find('span', attrs={'class': 'rstars'}).text
        print('Stars: ' + stars)
        review_of_product = rev.find('span', attrs={'class': 'rtext'}).text
        print('Review: ' + review_of_product)
        review_length = len(review_of_product)
        print('Length: ')
        print(review_length)
        print('------------')

# NOTE(review): at this point 'review_results' only holds the reviews of the
# LAST product iterated above, so both CSV files below repeat that one
# product's reviews for every link — this is the bug the question is about.

# File 1: summary rows (id, title, price, author, stars, review length)
with open(filepath, 'w') as dataout:
    datawriter = csv.writer(dataout, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_NONNUMERIC)
    headers = ['ProductId', 'Product Title', 'Product Price', 'Author',
               'Stars', 'Length of Review']
    datawriter.writerow(headers)
    for link in product_results:
        productId = link.find('span', attrs={'class': 'productid'}).text
        product_title = link.find('span', attrs={'class': 'producttitle'}).text
        product_price = link.find('span', attrs={'class': 'productprice'}).text
        for rev in review_results:
            author = rev.find('span', attrs={'class': 'rauthor'}).text
            stars = rev.find('span', attrs={'class': 'rstars'}).text
            review_of_product = rev.find('span', attrs={'class': 'rtext'}).text
            datawriter.writerow([productId, product_title, product_price,
                                 author, stars, len(review_of_product)])

# File 2: full review text rows (id, author, stars, text)
with open(filepath2, 'w') as dataout2:
    datawriter = csv.writer(dataout2, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_NONNUMERIC)
    headers = ['ProductId', 'Author', 'Stars', 'Review Text']
    datawriter.writerow(headers)
    for link in product_results:
        productId = link.find('span', attrs={'class': 'productid'}).text
        for rev in review_results:
            author = rev.find('span', attrs={'class': 'rauthor'}).text
            stars = rev.find('span', attrs={'class': 'rstars'}).text
            review_of_product = rev.find('span', attrs={'class': 'rtext'}).text
            datawriter.writerow([productId, author, stars, review_of_product])
Ответ №1:
Вы просматриваете обзоры каждой ссылки, но ЗА пределами вашего цикла по ссылкам. Таким образом, по сути, вы придерживаетесь только последней итерации этих обзоров. Вам нужно просмотреть эти обзоры по каждой ссылке. Итак, по сути, вам нужно, чтобы это был вложенный цикл.
Я также исправил проблему с пропущенными строками в вашем csv, добавив параметр newline=''
"""Corrected scraper: reviews are fetched INSIDE the per-product loop, so each
CSV row pairs a product with its own reviews.  Files are opened with
newline='' as the csv module requires, preventing blank lines on Windows.

Syntax repaired relative to the posted answer: 'main_url link.get(...)' was
missing the '+' operator, and indentation was lost in transit.
"""
import requests as r
from bs4 import BeautifulSoup
import csv

# Get URL
main_url = 'http://drd.ba.ttu.edu/isqs6339/imbadproducts/'
response = r.get(main_url)

# Set filepaths for the two CSV outputs
filepath = 'dataout1.csv'
filepath2 = 'dataout2.csv'

# Check for good link and get headers
print(response.status_code)
print(response.headers)

soup = BeautifulSoup(response.text, 'lxml')
print(soup.prettify())

# Find all product anchors inside the search-results div
search_results = soup.find('div', attrs={'id': 'searchresults'})
product_results = search_results.find_all('a')

# File 1: summary rows — nested loop keeps each product with its own reviews
with open(filepath, 'w', newline='') as dataout:
    datawriter = csv.writer(dataout, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_NONNUMERIC)
    headers = ['ProductId', 'Product Title', 'Product Price', 'Author',
               'Stars', 'Length of Review']
    datawriter.writerow(headers)
    # Define product link, id, title, price, and description for all products
    for link in product_results:
        link_url = main_url + link.get('href')  # was missing the '+' operator
        productId = link.find('span', attrs={'class': 'productid'}).text
        product_title = link.find('span', attrs={'class': 'producttitle'}).text
        product_price = link.find('span', attrs={'class': 'productprice'}).text
        product_description = link.find('span', attrs={'class': 'productdesc'}).text
        response2 = r.get(link_url)
        soup2 = BeautifulSoup(response2.text, 'lxml')
        # Each user review for this product lives in its own div
        user_review = soup2.find('div', attrs={'id': 'userreviews'})
        review_results = user_review.find_all('div')
        for rev in review_results:
            author = rev.find('span', attrs={'class': 'rauthor'}).text
            stars = rev.find('span', attrs={'class': 'rstars'}).text
            review_of_product = rev.find('span', attrs={'class': 'rtext'}).text
            datawriter.writerow([productId, product_title, product_price,
                                 author, stars, len(review_of_product)])

# File 2: full review text rows — same nested-loop structure
with open(filepath2, 'w', newline='') as dataout2:
    datawriter = csv.writer(dataout2, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_NONNUMERIC)
    headers = ['ProductId', 'Author', 'Stars', 'Review Text']
    datawriter.writerow(headers)
    for link in product_results:
        link_url = main_url + link.get('href')  # was missing the '+' operator
        productId = link.find('span', attrs={'class': 'productid'}).text
        response2 = r.get(link_url)
        soup2 = BeautifulSoup(response2.text, 'lxml')
        # Each user review for this product lives in its own div
        user_review = soup2.find('div', attrs={'id': 'userreviews'})
        review_results = user_review.find_all('div')
        for rev in review_results:
            author = rev.find('span', attrs={'class': 'rauthor'}).text
            stars = rev.find('span', attrs={'class': 'rstars'}).text
            review_of_product = rev.find('span', attrs={'class': 'rtext'}).text
            datawriter.writerow([productId, author, stars, review_of_product])