Проблемы со скрапингом с помощью BeautifulSoup

#python #python-3.x #web-scraping #beautifulsoup

#python #python-3.x #веб-скрапинг #beautifulsoup

Вопрос:

Я пытаюсь выполнить веб-скрапинг и застрял на, как мне кажется, элементарной проблеме.

Вот мой скрипт на данный момент:

 # Fetch an IMDb search-results page and collect per-movie fields into lists.
 from requests import get
from bs4 import BeautifulSoup

# NOTE(review): 'amp;' inside the query string looks like a mangled '&'
# separator — confirm against the intended URL.
url = 'http://www.imdb.com/search/title?release_date=2017amp;sort=num_votes,descamp;page=1'

response = get(url)

soup = BeautifulSoup(response.text, 'html.parser')


# One <div class="lister-item mode-advanced"> is selected per movie entry.
movies_containers = soup.find_all('div', class_ = 'lister-item mode-advanced')

# Accumulators filled by the loop below; votes and the movie_* lists are
# declared here but not filled anywhere in this snippet.
names = []
years = []
imdb_ratings = []
metascores = []
votes = []
#gross=[] #many movies have no record
movie_description=[]
movie_duration=[]
movie_genre=[]


for container in movies_containers:
    # NOTE(review): find_all() returns a list-like result rather than None, so
    # this condition appears to be always true — confirm whether find() was meant.
    if container.find_all('div', class_ = 'ratings-metascore') is not None:

        # Title text is read from the link inside the item header.
        name = container.find('h3', class_ = 'lister-item-header').a.text
        names.append(name)

        # The year text has its parentheses replaced by spaces.
        year = container.h3.find('span', class_ = 'lister-item-year text-muted unbold').text
        year = year.replace('(', ' ')
        year = year.replace(')', ' ')
        years.append(year)

        imdb_rating = float(container.find('div', class_ = 'inline-block ratings-imdb-rating').text)
        imdb_ratings.append(imdb_rating)

        # find() returns None when the container has no span.metascore, in
        # which case .text raises AttributeError.
        score = container.find('span', class_ = 'metascore').text
        metascores.append(score)
  

И я получил такую ошибку:

 AttributeError: 'NoneType' object has no attribute 'text'
  

Я не понимаю, почему эта строка кода не работает.

Когда я удаляю .text :

 score = container.find('span', class_ = 'metascore')
  

В результате я получаю следующее:

 <span class="metascore favorable">77        </span>
  

Есть идеи?

Спасибо

Ответ №1:

Для некоторых фильмов тег score отсутствует, то есть find() возвращает None, — это и является причиной ошибки. Попробуйте так:

 # Scrape an IMDb search-results page (release_date=2017, sorted by votes) and
# collect name, year, IMDb rating and Metascore for every listed movie that
# has a Metascore. Prints the collected Metascores at the end.
import requests
from bs4 import BeautifulSoup

# Query parameters must be separated by a plain '&'; a mangled 'amp;' would
# make the server see a single release_date value instead of three parameters.
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'

soup = BeautifulSoup(requests.get(url).text, 'html.parser')
movies_containers = soup.find_all('div', class_='lister-item mode-advanced')

# Parallel lists: index i in each filled list refers to the same movie.
names = []
years = []
imdb_ratings = []
metascores = []
# Placeholders for fields that are not extracted yet.
votes = []
movie_description = []
movie_duration = []
movie_genre = []

for container in movies_containers:
    # find() returns None when the tag is absent, whereas find_all() returns a
    # (possibly empty) list that is never None and therefore cannot be used as
    # an "is not None" guard. Skip movies without a Metascore up front so the
    # parallel lists stay aligned.
    score = container.find('span', class_='metascore')
    if score is None:
        continue

    name = container.find('h3', class_='lister-item-header').a.text
    names.append(name)

    # The year is rendered as "(2017)"; replace the parentheses with spaces.
    year = container.h3.find('span', class_='lister-item-year text-muted unbold').text
    years.append(year.replace('(', ' ').replace(')', ' '))

    imdb_rating = float(container.find('div', class_='inline-block ratings-imdb-rating').text)
    imdb_ratings.append(imdb_rating)

    # strip=True removes the padding whitespace around the score text.
    metascores.append(score.getText(strip=True))
print(metascores)
  

Вывод:

 ['77', '74', '67', '84', '94', '76', '73', '85', '69', '81', '86', '88', '45', '81', '87', '75', '58', '65', '44', '62', '39', '65', '94', '48', '82', '52', '54', '93', '56', '73', '52', '41', '75', '47', '77', '63', '34', '75', '29', '51', '37', '65']