Не удается извлечь URL-адреса изображений с помощью Beautiful Soup и Python

#python #web-scraping #beautifulsoup #python-requests #scrapinghub

#python #веб-скрейпинг #beautifulsoup #python-requests #scrapinghub

Вопрос:

Итак, я использую приведенный ниже код, чтобы извлечь URL-адреса изображений кредитных карт со страниц, ссылки на которые перечислены в переменной explore_more_url.

 from urllib.request import urlopen
from bs4 import BeautifulSoup
import json, requests, re
from selenium import webdriver

# Raw string: in a regular literal "\U" starts a unicode escape, so the
# Windows path would not even parse.
driver = webdriver.Chrome(executable_path=r"C:\Users\Hari\Downloads\chromedriver.exe")

img_url = []

# Product pages whose card image URL should be collected.
explore_more_url = [
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-ace-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-aura-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/privilege-easy-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/reserve-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-freecharge-plus-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/indianoil-axis-bank-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-magnus-card/feature-benefits',
    'https://www.axisbank.com/retail/cards/credit-card/flipkart-axisbank-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-freecharge-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/my-zone-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/neo-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-signature-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-infinite-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/privilege-credit-card-with-unlimited-travel-benefits-account',
    'https://www.axisbank.com/retail/cards/credit-card/miles-more-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-select-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/pride-platinum-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/pride-signature-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/my-zone-easy-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/insta-easy-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/signature-credit-card-with-lifestyle-benefits',
    'https://www.axisbank.com/retail/cards/credit-card/platinum-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/titanium-smart-traveler-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/my-wings-credit-card/features-benefits',
]

for x in explore_more_url:
    # Load the page in the browser and parse the rendered HTML.
    driver.get(x)
    soup_1 = BeautifulSoup(driver.page_source, 'lxml')
    # NOTE: find() returns None when no <img> carries exactly this alt text;
    # calling .get() on that None raises the AttributeError being asked about.
    img_url.append("https://www.axisbank.com" + soup_1.find('img', alt="Fast Forward Banner").get('src'))

print(img_url)
 

Вывод :

 Traceback (most recent call last):
  File "C:\Users\Hari\PycharmProjects\Card_Prj\axis.py", line 82, in <module>
    img_url.append("https://www.axisbank.com" + soup_1.find('img', alt="Fast Forward Banner").get('src'))
AttributeError: 'NoneType' object has no attribute 'get'
 

Изображения выглядят примерно так в каждой ссылке:
введите описание изображения здесь

Какой подходящий код я мог бы использовать, чтобы получить именно то, что я ожидаю?

Ответ №1:

Одним из способов получения изображения может быть следующий:

 import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with the page request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}
page_url = "https://www.axisbank.com/retail/cards/credit-card/axis-bank-ace-credit-card"

# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops early instead of parsing an error page.
response = requests.get(page_url, headers=headers, timeout=30)
response.raise_for_status()
page = response.text

# select_one() returns None when nothing matches, so check before indexing
# rather than failing with an opaque TypeError.
banner = BeautifulSoup(page, "html.parser").select_one('.bannerWrapper img')
if banner is None or not banner.get("src"):
    raise SystemExit(f"No banner image found on {page_url}")
img_src_ = banner["src"]

# Download first and only then create the file, so a failed download does
# not leave an empty image file behind.
image_response = requests.get(f"https://www.axisbank.com{img_src_}", timeout=30)
image_response.raise_for_status()

# The file name is the last path segment of the image source.
with open(img_src_.rsplit("/", 1)[-1], "wb") as image:
    image.write(image_response.content)
 

Вывод: .jpg файл в локальном каталоге скрипта.

 ace-product-landing-web-version-1920x360.jpg
 

ПРАВКА: чтобы получить только URL-адреса изображений (значения атрибута src), попробуйте следующее:

 import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every page request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

# Product pages to scan for a banner image.
explore_more_url = [
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-ace-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-aura-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/privilege-easy-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/reserve-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-freecharge-plus-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/indianoil-axis-bank-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-magnus-card/feature-benefits',
    'https://www.axisbank.com/retail/cards/credit-card/flipkart-axisbank-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-freecharge-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/my-zone-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/neo-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-signature-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-vistara-infinite-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/privilege-credit-card-with-unlimited-travel-benefits-account',
    'https://www.axisbank.com/retail/cards/credit-card/miles-more-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/axis-bank-select-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/pride-platinum-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/pride-signature-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/my-zone-easy-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/insta-easy-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/signature-credit-card-with-lifestyle-benefits',
    'https://www.axisbank.com/retail/cards/credit-card/platinum-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/titanium-smart-traveler-credit-card',
    'https://www.axisbank.com/retail/cards/credit-card/my-wings-credit-card/features-benefits',
]

img_urls = []
# One Session reuses the connection across all requests to the same host
# and carries the headers for every call.
with requests.Session() as session:
    session.headers.update(headers)
    for url in explore_more_url:
        # A timeout keeps a single stalled page from hanging the whole run.
        page = session.get(url, timeout=30).text
        # select_one() returns None when the page has no banner; test for
        # that explicitly instead of catching TypeError, which could also
        # hide unrelated bugs.
        banner = BeautifulSoup(page, "html.parser").select_one('.bannerWrapper img')
        img_src_ = banner.get("src") if banner is not None else None
        if not img_src_:
            # Pages without a banner image are skipped, but visibly so.
            print(f"No banner image found for {url}")
            continue
        print(f"Finding image source url for {url}")
        img_urls.append(f"https://www.axisbank.com{img_src_}")

print(img_urls)
 

Вывод:

 ['https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/ace-product-landing-web-version-1920x360.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/health-and-wellness-product-page-1920x360_v1.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/freecharge-product-landing-page-desktop-banner-revised.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/indian-oil-banner-desktop.jpg', 'https://www.axisbank.com/img/magnuscard/apply-now.png', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/flipkart-abcc-desk.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/product-landing-page-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/myzone-easy-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/neo-credit-card-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/vistara-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/vistara-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/vistara-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/privilege-credit-card.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/miles---more-credit-card.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/desktop-select-credit-card.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/pride-platinum-1920-360-desktop-banner.jpg', 
'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/pride-platinum-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/myzone-easy-1920-360-desktop-banner.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/insta-easy-credit-card.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/signature-credit-card-with.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/platinum-credit-card.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/signature-credit-card-with.jpg', 'https://www.axisbank.com/images/default-source/revamp_new/cards/credit-cards/desktop/my-wings-credit-card.jpg']
 

Комментарии:

1. @baduker Я высоко ценю код, который вы написали, спасибо за это, но на данный момент мне просто нужен список URL-адресов изображений всех карт.

2. @baduker Слов недостаточно, чтобы выразить мою благодарность за вашу правку, но есть небольшая проблема: я просмотрел ссылки в выходных данных — некоторые из них ведут на изображение карты, а некоторые — на какое-то постороннее изображение. Я был бы рад, если бы вы смогли решить эту проблему.

3. @baduker Да, но есть некоторые ссылки, на страницах которых нет баннеров, зато изображения карт есть где-то внизу страницы.