BS4 pagination results limited: 10 instead of 200

#python #beautifulsoup

Question:

Instead of returning 10 links for each page, it returns only the ten links from the last page. In other words, if this worked, the total number of links would be 200.

from goose3 import Goose
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import requests
import re

for x in range(1, 20):
    numb = str(x)
    req = Request("https://search.crossref.org/?q=north korea&page=" + numb)
    html_page = urlopen(req)    


soup = BeautifulSoup(html_page, 'lxml')
print(soup)
links = []
 
for link in soup.findAll('a', attrs={'href': re.compile("^https://doi")}):
    links.append(link.get('href'))
 
print(links)

for ft in links:
    try:
        url = ft
        g = Goose()
        article = g.extract(url=url)
        m = article.cleaned_text
        print(m)
        print("-" * 40)
    except requests.exceptions.ConnectionError as e:
        pass
  

Output: ['https://doi.org/10.1057/9781137347633.0021', 'https://doi.org/10.4135/9781412939607.n388', 'https://doi.org/10.4135/9781412953924.n601', 'https://doi.org/10.4324/9780203164365', 'https://doi.org/10.1787/eco_surveys-kor-2018-4-en', 'https://doi.org/10.21236/ada523754', 'https://doi.org/10.21236/ada441640', 'https://doi.org/10.21236/ada441540', 'https://doi.org/10.21236/ada560116', 'https://doi.org/10.1787/888932592489']

Answer #1:

In the original code, BeautifulSoup parses html_page only once, after the loop has finished, so only the last page's response is ever seen. Parse each response inside the loop instead and accumulate the links across all pages:

import requests
from bs4 import BeautifulSoup

params = {
    'q': 'north korea'
}


def main(url):
    with requests.Session() as req:
        allin = []
        for page in range(1, 21):  # pages 1-20, 10 links each = 200 links
            print(f"Extracting Page# {page}")
            params['page'] = page
            r = req.get(url, params=params)
            # Parse every response inside the loop, not after it
            soup = BeautifulSoup(r.content, 'html.parser')
            target = [x.a['href'] for x in soup.select("div.item-links")]
            allin.extend(target)
        print(allin)


main("https://search.crossref.org/")
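
For completeness, the same fix can be grafted onto the urllib/Goose version from the question: build the soup and collect the DOI links inside the loop, then run the Goose extraction over the accumulated list. A minimal sketch, assuming the Crossref result markup still matches the ^https://doi pattern used in the question:

import re
import requests
from bs4 import BeautifulSoup
from goose3 import Goose

links = []
for page in range(1, 21):
    # Fetch and parse each page inside the loop so no page is lost
    r = requests.get("https://search.crossref.org/",
                     params={'q': 'north korea', 'page': page})
    soup = BeautifulSoup(r.content, 'html.parser')
    for link in soup.findAll('a', attrs={'href': re.compile("^https://doi")}):
        links.append(link.get('href'))

print(len(links))  # should now count links from all 20 pages

g = Goose()  # reuse one extractor instead of creating one per URL
for url in links:
    try:
        article = g.extract(url=url)
        print(article.cleaned_text)
        print("-" * 40)
    except requests.exceptions.ConnectionError:
        pass

Letting requests build the query string also takes care of encoding the space in 'north korea', which the hand-built URL in the question never did.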