#python #python-3.x #web-scraping #beautifulsoup
#python #python-3.x #веб-скрейпинг #beautifulsoup
Вопрос:
Я пытаюсь заняться веб-скрейпингом и застрял на, как я полагаю, элементарной проблеме.
Вот мой скрипт на данный момент:
from requests import get
from bs4 import BeautifulSoup
# IMDb search results: titles released in 2017, sorted by vote count
# (descending), first page. The query parameters are separated by '&'.
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'
response = get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Every <div class="lister-item mode-advanced"> found on the page.
movies_containers = soup.find_all('div', class_='lister-item mode-advanced')

# Result lists; only the first four are filled by the loop below.
names = []
years = []
imdb_ratings = []
metascores = []
votes = []
# "gross" is not collected: many movies have no record of it.
movie_description = []
movie_duration = []
movie_genre = []

for container in movies_containers:
    # NOTE: find_all() returns a (possibly empty) list and never None, so
    # this condition is always true and does not filter any container out.
    if container.find_all('div', class_='ratings-metascore') is not None:
        name = container.find('h3', class_='lister-item-header').a.text
        names.append(name)

        # Replace the parentheses around the year text with spaces.
        year = container.h3.find('span', class_='lister-item-year text-muted unbold').text
        year = year.replace('(', ' ')
        year = year.replace(')', ' ')
        years.append(year)

        imdb_rating = float(container.find('div', class_='inline-block ratings-imdb-rating').text)
        imdb_ratings.append(imdb_rating)

        # find() returns None when the container has no
        # <span class="metascore">; calling .text on that None raises
        # AttributeError: 'NoneType' object has no attribute 'text'.
        score = container.find('span', class_='metascore').text
        metascores.append(score)
И я получаю такую ошибку:
AttributeError: 'NoneType' object has no attribute 'text'
Я не понимаю, почему эта строка кода не работает.
Когда я убираю .text:
score = container.find('span', class_ = 'metascore')
я получаю следующее:
<span class="metascore favorable">77 </span>
Есть идеи?
Спасибо
Ответ №1:
Некоторые теги score на самом деле равны None — это и является причиной ошибки. Попробуйте так:
import requests
from bs4 import BeautifulSoup
# IMDb search results: titles released in 2017, sorted by vote count
# (descending), first page. The query parameters are separated by '&'.
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

# Every <div class="lister-item mode-advanced"> found on the page.
movies_containers = soup.find_all('div', class_='lister-item mode-advanced')

# Result lists; only the first four are filled by the loop below.
names = []
years = []
imdb_ratings = []
metascores = []
votes = []
movie_description = []
movie_duration = []
movie_genre = []

for container in movies_containers:
    # find() returns None when the container has no metascore element.
    # Skip such containers before appending anything, so that names, years,
    # imdb_ratings and metascores stay aligned index by index.
    # (find_all() cannot be used for this test: it returns a list, which
    # is never None.)
    score = container.find('span', class_='metascore')
    if score is None:
        continue

    name = container.find('h3', class_='lister-item-header').a.text
    names.append(name)

    # Replace the parentheses around the year text with spaces.
    year = container.h3.find('span', class_='lister-item-year text-muted unbold').text
    years.append(year.replace('(', ' ').replace(')', ' '))

    imdb_rating = float(container.find('div', class_='inline-block ratings-imdb-rating').text)
    imdb_ratings.append(imdb_rating)

    # strip=True removes the surrounding whitespace, e.g. '77 ' -> '77'.
    metascores.append(score.get_text(strip=True))

print(metascores)
Вывод:
['77', '74', '67', '84', '94', '76', '73', '85', '69', '81', '86', '88', '45', '81', '87', '75', '58', '65', '44', '62', '39', '65', '94', '48', '82', '52', '54', '93', '56', '73', '52', '41', '75', '47', '77', '63', '34', '75', '29', '51', '37', '65']