#python #beautifulsoup
Question:
Instead of printing 10 links for each page, it only returns the ten links from the last page. In other words, if it worked correctly, the total number of links would be 200.
from goose3 import Goose
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import requests

for x in range(1, 20):
    numb = str(x)
    req = Request("https://search.crossref.org/?q=north korea&page=" + numb)
    html_page = urlopen(req)
    soup = BeautifulSoup(html_page, 'lxml')
    print(soup)

links = []
for link in soup.findAll('a', attrs={'href': re.compile("^https://doi")}):
    links.append(link.get('href'))
print(links)

for ft in links:
    try:
        url = ft
        g = Goose()
        article = g.extract(url=url)
        m = article.cleaned_text
        print(m)
        print("⸻⸻⸻⸻⸻⸻⸻⸻")
    except requests.exceptions.ConnectionError as e:
        pass
Output: ['https://doi.org/10.1057/9781137347633.0021 ', 'https://doi.org/10.4135/9781412939607.n388 ', 'https://doi.org/10.4135/9781412953924.n601 ', 'https://doi.org/10.4324/9780203164365 ', 'https://doi.org/10.1787/eco_surveys-kor-2018-4-en ', 'https://doi.org/10.21236/ada523754 ', 'https://doi.org/10.21236/ada441640 ', 'https://doi.org/10.21236/ada441540 ', 'https://doi.org/10.21236/ada560116 ', 'https://doi.org/10.1787/888932592489 ']
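A note on the likely cause, judging from the code as posted: the loop that pulls the https://doi links runs after the page loop, so by the time links is filled, soup only holds the final page. A minimal sketch of a fix along those lines (keeping the original urllib/BeautifulSoup approach, and assuming 20 pages are wanted to reach 200 links) could look like this:

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import re

links = []
for x in range(1, 21):  # 20 pages x 10 links = 200 links
    # '+' stands in for the space in the query so urlopen gets a valid URL
    req = Request("https://search.crossref.org/?q=north+korea&page=" + str(x))
    soup = BeautifulSoup(urlopen(req), 'lxml')
    # collect this page's DOI links before moving on to the next page
    for link in soup.findAll('a', attrs={'href': re.compile("^https://doi")}):
        links.append(link.get('href'))

print(len(links), "links collected")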
Answer #1:
import requests
from bs4 import BeautifulSoup

params = {
    'q': 'north korea'
}

def main(url):
    with requests.Session() as req:
        allin = []
        for page in range(1, 21):
            print(f"Extracting Page# {page}")
            params['page'] = page  # requests builds and URL-encodes the query string
            r = req.get(url, params=params)
            soup = BeautifulSoup(r.content, 'html.parser')
            # each search result keeps its DOI link inside div.item-links
            target = [x.a['href'] for x in soup.select("div.item-links")]
            allin.extend(target)
        print(allin)

main("https://search.crossref.org/")