#python #web-scraping #scrapy #bloom-filter
#python #очистка веб-страниц #scrapy #bloom-filter
Вопрос:
Я использую Scrapy с BloomFilter, и через 10 минут у меня возникает эта ошибка в цикле :
2016-10-03 18:03:34 [twisted] CRITICAL:
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/task.py", line 517, in _oneWorkUnit
result = next(self._iterator)
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 63, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
File "/usr/local/lib/python2.7/dist-packages/scrapy/core/scraper.py", line 183, in _process_spidermw_output
self.crawler.engine.crawl(request=output, spider=spider)
File "/usr/local/lib/python2.7/dist-packages/scrapy/core/engine.py", line 209, in crawl
self.schedule(request, spider)
File "/usr/local/lib/python2.7/dist-packages/scrapy/core/engine.py", line 215, in schedule
if not self.slot.scheduler.enqueue_request(request):
File "/usr/local/lib/python2.7/dist-packages/scrapy/core/scheduler.py", line 54, in enqueue_request
if not request.dont_filter and self.df.request_seen(request):
File "dirbot/custom_filters.py", line 20, in request_seen
self.fingerprints.add(fp)
File "/usr/local/lib/python2.7/dist-packages/pybloom/pybloom.py", line 182, in add
raise IndexError("BloomFilter is at capacity")
IndexError: BloomFilter is at capacity
filter.py :
from pybloom import BloomFilter
from scrapy.utils.job import job_dir
from scrapy.dupefilters import BaseDupeFilter
class BLOOMDupeFilter(BaseDupeFilter):
"""Request Fingerprint duplicates filter"""
def __init__(self, path=None):
self.file = None
self.fingerprints = BloomFilter(2000000, 0.00001)
@classmethod
def from_settings(cls, settings):
return cls(job_dir(settings))
def request_seen(self, request):
fp = request.url
if fp in self.fingerprints:
return True
self.fingerprints.add(fp)
def close(self, reason):
self.fingerprints = None
Я ищу в Google все возможности, но ничего не работает.
Спасибо за вашу помощь.
Ответ №1:
Используйте pybloom.ScalableBloomFilter
вместо BloomFilter
.
from pybloom import ScalableBloomFilter
from scrapy.utils.job import job_dir
from scrapy.dupefilters import BaseDupeFilter
class BLOOMDupeFilter(BaseDupeFilter):
"""Request Fingerprint duplicates filter"""
def __init__(self,
path=None,
initial_capacity=2000000,
error_rate=0.00001,
mode=ScalableBloomFilter.SMALL_SET_GROWTH):
self.file = None
self.fingerprints = ScalableBloomFilter(
initial_capacity, error_rate, mode)
Комментарии:
1. Нужно ли мне добавлять метод @classmethod, как в моем предыдущем коде?
2. @Pixel, добавляй все, что хочешь!