#python #web-scraping #scrapy #web-crawler #google-crawlers
Question:
After some time, the broad crawl engine hangs without showing any errors or logs, as you can see here. I have changed many of the timing-related settings, plugged in 15 dedicated IP proxies, and tuned download_delay, concurrent_items and concurrent_requests, concurrent_requests_per_ip, etc., but I still run into this problem.
myspider.py
import datetime
import urllib3
from six.moves.urllib.parse import urlsplit
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import BroadCrawlerItem
import collections
from ..extractors import DateExtractor, FaviconExtractor
import extraction
from scrapy.utils.project import get_project_settings
import os
import tldextract
from urllib.parse import urlparse
class FollowAllSpider(CrawlSpider):
    name = 'follow_all'
    start_urls = [
        'https://news.google.com/topstories',
        "https://en.wikipedia.org/wiki/Bill_Gates",
        "https://detailed.com/50/",
        "https://www.techlearning.com/news/15-awesome-article-sites-for-students",
        "https://jamesclear.com/articles",
        'https://en.wikipedia.org/wiki/Donald_Trump',
        'https://en.wikipedia.org/wiki/Elon_Musk'
    ]
    denylist = ['icann.org', 'blogspot.com', 'ganji.com', 'dihe.cn', 'google.com', 'twitter.com',
                'glassdoor.com', 'glassdoor.ie', 'youtube.com', 'wordcamp.com', 'wordcamp.org',
                'ganchang.cn', 'aa.com.tr', 'xinhuanet.com', 'nasdaq.com', 'aa.com.tr', 'wikipedia.org',
                'wikinews.org', 'wikimedia.org', 'indianexpress.com', 'whatsapp.com', 'edweek.org', 'apple.com',
                'facebook.com', 'reddit.com', 'linkedin.com', 'stackoverflow.com', 't.co', 'fzcom.cn', 'github.com',
                'amazon.com']
    rules = [Rule(LinkExtractor(deny_domains=denylist), process_links='filter_links', follow=True, callback='parse_item')]
    count = 1
    def filter_links(self, links):
        for link in links:
            url = link.url.lower()
            # drop links whose URL contains any of these markers
            if ('privacy' in url or 'forgot' in url or 'password' in url or 'developer' in url
                    or 'login' in url or 'twitter.com' in url or 'linkedin.com' in url):
                continue
            yield link
    def parse_item(self, response):
        items = BroadCrawlerItem()
        absolute_url = response.request.url
        domain = urlsplit(absolute_url)[0] + "://" + urlsplit(absolute_url)[1]
        title = self.get_title(response)
        meta_keywords = self.get_meta_keywords(response)
        extracted = extraction.Extractor().extract(response.body, source_url=absolute_url)
        meta_description = self.get_meta_description(extracted)
        if not meta_description:
            print('description not found..!')
            return
        print(f'{self.count} | {domain} | {absolute_url}')
        self.count += 1
        yield items
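The helper methods get_title, get_meta_keywords and get_meta_description are not shown in the question; the following is only a sketch of plausible implementations (assuming standard meta tags and the description attribute of the extraction package's result object), not the author's actual code:
    # Hypothetical sketch: these would sit inside FollowAllSpider; the real implementations may differ.
    def get_title(self, response):
        # text of the <title> element, if present
        return response.xpath('//title/text()').get()

    def get_meta_keywords(self, response):
        # content of <meta name="keywords">, if present
        return response.xpath('//meta[@name="keywords"]/@content').get()

    def get_meta_description(self, extracted):
        # extraction.Extractor().extract(...) exposes a best-guess page description
        return extracted.description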
settings.py
BOT_NAME = 'broad_crawler'
SPIDER_MODULES = ['broad_crawler.spiders']
NEWSPIDER_MODULE = 'broad_crawler.spiders'
ITEM_PIPELINES = {
    'broad_crawler.pipelines.BroadCrawlerPipeline': 300,
}
DOWNLOAD_DELAY = 0
CONCURRENT_ITEMS = 15
CONCURRENT_REQUESTS = 20
CONCURRENT_REQUESTS_PER_IP = 8
CONCURRENT_REQUESTS_PER_DOMAIN = 2
REACTOR_THREADPOOL_MAXSIZE = 20
ROBOTSTXT_OBEY = False
DOWNLOAD_TIMEOUT = 8
USER_AGENT = 'my-bot'
COOKIES_ENABLED = False
RETRY_ENABLED = False
DOWNLOAD_MAXSIZE = 5592405
REDIRECT_ENABLED = False
AJAXCRAWL_ENABLED = True
LOG_LEVEL = 'WARN'
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
# SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True
# REDIS_URL = "redis://127.0.0.1:6379"
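Note that the settings above point the queue and dupefilter at scrapy_redis while the SCHEDULER line itself is commented out. A minimal sketch of a fully wired scrapy-redis configuration, assuming the Redis-backed scheduler is actually intended, would be:
# These settings normally go together when scrapy_redis is used.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True
REDIS_URL = "redis://127.0.0.1:6379"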
Comments:
1. Is there any activity with LOG_LEVEL = 'DEBUG'?
2. No, there is no activity either.
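For reference, a minimal sketch of settings that make this kind of silent hang easier to observe; LOGSTATS_INTERVAL and the telnet console are built-in Scrapy features, and the values shown are only examples:
# Diagnostic settings (sketch): surface engine activity instead of suppressing it.
LOG_LEVEL = 'DEBUG'             # temporarily, instead of 'WARN'
LOGSTATS_INTERVAL = 60.0        # periodic "Crawled N pages (at N pages/min)" log lines
TELNETCONSOLE_ENABLED = True    # then `telnet localhost 6023` and call est()
                                # to dump the engine status of a stuck crawl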