#python #pandas #selenium #selenium-webdriver
Question:
In my project I am trying to build a few web scrapers to get all the data about football players of a given nationality. The problem is that, although I do collect all the links for the countries and the players, my code stops after reaching the 10th country and throws an error, even though the link has already been retrieved. Here is my code:
from time import sleep
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
options = Options()
# Creating our players DataFrame
sofa_score_players = pd.DataFrame(columns=['name', 'nationality', 'age', 'height', 'preferred_foot',
                                           'team', 'position', 'number', 'highest_transfert_fee'])
# We specify our Chrome driver path
path = "C:/Users/Al4D1N/Documents/ChromeDriver_webscraping/chromedriver.exe"
driver = webdriver.Chrome(options=options, executable_path=path)
url = "https://www.sofascore.com/football/rankings/fifa"
driver.get(url)
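# NOTE: find_elements returns live references tied to the current DOM;
# they can go stale if the page re-renders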
footballCountries = driver.find_elements_by_css_selector("div.styles__RankingsItemContainer-coiowh-0.hvNSrf")
ignored_exceptions = (NoSuchElementException, StaleElementReferenceException,)
# waiting_issue = WebDriverWait(driver, 3, ignored_exceptions=ignored_exceptions).until(expected_conditions.presence_of_element_located((By.CSS_SELECTOR, footballCountries)))
for footballCountry in footballCountries:
    sleep(2)
    # waiting_issue = WebDriverWait(driver, 3, ignored_exceptions=ignored_exceptions).until(
    #     expected_conditions.presence_of_element_located((By.CSS_SELECTOR, footballCountries)))
    # In each element, we select the <a> tags
    atags = footballCountry.find_elements_by_css_selector('a')
    for atag in atags:
        # In each <a> tag, select the href
        href = atag.get_attribute('href')
        print(href)
        # Open a new window
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[1])
        driver.get(href)
        # sleep(2)
        # Get the <a> tags for the country's players
        boxplayers = driver.find_elements_by_css_selector('a.componentStyles__CardWrapper-sc-8o3g86-0.cAhval')
        for player in boxplayers:
            playerlink = player.get_attribute('href')
            print(playerlink)
            # Open a new window
            driver.execute_script("window.open('');")
            driver.switch_to.window(driver.window_handles[2])
            driver.get(playerlink)
            # sleep(2)
            # Get the player details
            player_details = driver.find_elements_by_css_selector('h2.styles__DetailBoxTitle-sc-1ss54tr-11.gMYPyy')
            player_source_details = driver.find_elements_by_css_selector('div.Content-sc-1o55eay-0.gYsVZh')
            player_nationality = player_details[0].text
            player_age = player_details[1].text
            player_height = player_details[2].text
            player_preferred_foot = player_details[3].text
            player_position = player_details[4].text
            player_number = player_details[5].text
            player_name = player_source_details[0].text
            player_team = player_source_details[2].text
            player_highest_transfert_fee = driver.find_element_by_css_selector('div.Content-sc-1o55eay-0.gYsVZh').text
            # Now we store them in our DataFrame
            sofa_score_players = sofa_score_players.append({'name': player_name, 'nationality': player_nationality,
                                                            'age': player_age, 'height': player_height,
                                                            'preferred_foot': player_preferred_foot,
                                                            'team': player_team,
                                                            'position': player_position, 'number': player_number,
                                                            'highest_transfert_fee': player_highest_transfert_fee},
                                                           ignore_index=True)
            # Close the player tab
            driver.close()
            # Switch back to the country tab
            driver.switch_to.window(driver.window_handles[1])
        # Close the country tab
        driver.close()
        # Switch back to the main rankings tab
        driver.switch_to.window(driver.window_handles[0])
driver.close()
# We store our dataframe in an excel file to be easily readable
sofa_score_players.to_excel('Sofa_Score_Players.xlsx', index=False)
Since this code takes a long time to finish, is there any way I can make sure it goes through all the links I collected at the beginning?
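One idea I had is to collect every href as a plain string up front, before navigating anywhere, so the loops never touch a WebElement after the page has changed. A rough, untested sketch of what I mean (reusing the same selectors and the same driver as above):

# Rough sketch, untested: gather plain string URLs first, then navigate.
# Assumes `driver` is already open on the rankings page, as above.
country_links = []
for country in driver.find_elements_by_css_selector("div.styles__RankingsItemContainer-coiowh-0.hvNSrf"):
    for atag in country.find_elements_by_css_selector('a'):
        country_links.append(atag.get_attribute('href'))

player_links = []
for href in country_links:
    driver.get(href)  # plain navigation, no extra windows needed
    sleep(2)
    for player in driver.find_elements_by_css_selector('a.componentStyles__CardWrapper-sc-8o3g86-0.cAhval'):
        player_links.append(player.get_attribute('href'))

for playerlink in player_links:
    driver.get(playerlink)
    sleep(2)
    # ... scrape player_details / player_source_details exactly as above ...

That way, even if the rankings page re-renders, the rest of the run only depends on the strings collected once at the start. Would that be a reliable way to make sure every link gets visited?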