#python #scrapy
#питон #scrapy
Вопрос:
Я извлекаю данные веб-страницы, и мне нужно сохранить выходные данные некоторых загрузчиков в списке словарей. Например, часть моего вывода будет выглядеть так:
{"nine":"character", "ten":"condition", "eleven":"score", "twelve":"graded", "v9":"Electabuzz", "v10":"Near Mint", "v11":"8", "v12":"Yes"}
Вместо этого я хочу сохранить эти значения в виде двух отдельных списков, один для тех, которые начинаются с v
, а другой для цифр. Например,
{"numeral":["character", "condition", "score", "graded"], "values":["Electabuzz", "Near Mint", "8", "Yes"]}
Я пытаюсь выполнить это в рамках scrapy, однако я не могу получить результат, подобный приведенному выше, например, вот мой код:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader


class EbayItem(scrapy.Item):
    """Item for an eBay card listing; TakeFirst collapses each field to one value."""
    # Listing-page fields.
    category = Field(output_processor=TakeFirst())
    name = Field(output_processor=TakeFirst())
    price = Field(output_processor=TakeFirst())
    product_url = Field(output_processor=TakeFirst())
    # Detail-page label fields (rows 5 and 6 of the item-specifics table).
    eleven = Field(output_processor=TakeFirst())
    twelve = Field(output_processor=TakeFirst())
    # Detail-page value fields matching the labels above.
    v11 = Field(output_processor=TakeFirst())
    v12 = Field(output_processor=TakeFirst())


class EbaySpider(scrapy.Spider):
    name = 'ebay'
    # One search URL per card language; the dict key doubles as the category
    # label attached to every item. ("&" separators restored — they had been
    # HTML-escaped to "&amp;" and lost the ampersand during extraction.)
    start_urls = {
        'english': 'https://www.ebay.com/sch/i.html?_from=R40&_nkw=pokemon cards&_sacat=2536&LH_TitleDesc=0&_sop=16&LH_All=1&rt=nc&Language=English&_dcat=183454',
        'japanese': 'https://www.ebay.com/sch/i.html?_from=R40&_nkw=pokemon cards&_sacat=2536&LH_TitleDesc=0&_sop=16&LH_All=1&_oaa=1&rt=nc&Language=Japanese&_dcat=183454',
    }

    def start_requests(self):
        """Issue one search request per category, forwarding the category name."""
        for category, url in self.start_urls.items():
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                cb_kwargs={'category': category},
            )

    def parse(self, response, category):
        """Collect listing-level fields, then follow each product page."""
        all_cards = response.xpath('//div[@class="s-item__wrapper clearfix"]')
        for card in all_cards:
            loader = ItemLoader(EbayItem(), selector=card)
            loader.add_value('category', category)
            loader.add_xpath('name', './/h3/text()')
            loader.add_xpath('price', './/span[@class="s-item__price"]//text()')
            loader.add_xpath('product_url', './/a[@class="s-item__link"]//@href')
            # NOTE(review): handing a loader bound to this response's selector
            # to the next callback works but is fragile; confirm items come
            # out attached to the right listing.
            yield scrapy.Request(
                card.xpath('.//a[@class="s-item__link"]//@href').get(),
                callback=self.parse_product_details,
                cb_kwargs={'loader': loader},
            )

    def parse_product_details(self, response, loader):
        """Scrape label/value pairs from rows 5-6 of the item-specifics table."""
        # content - names
        data11 = response.xpath(
            "//div[@class='ux-layout-section__item ux-layout-section__item--table-view']"
            "/div[@class='ux-layout-section__row'][5]"
            "/div[@class='ux-labels-values__labels'][2]"
            "/div[@class='ux-labels-values__labels-content']/div/span//text()"
        ).get()
        loader.add_value('eleven', data11)
        data12 = response.xpath(
            "//div[@class='ux-layout-section__item ux-layout-section__item--table-view']"
            "/div[@class='ux-layout-section__row'][6]"
            "/div[@class='ux-labels-values__labels'][2]"
            "/div[@class='ux-labels-values__labels-content']/div/span//text()"
        ).get()
        loader.add_value('twelve', data12)
        # values
        val11 = response.xpath(
            "//div[@class='ux-layout-section__item ux-layout-section__item--table-view']"
            "/div[@class='ux-layout-section__row'][5]"
            "/div[@class='ux-labels-values__values'][2]"
            "/div[@class='ux-labels-values__values-content']/div/span//text()"
        ).get()
        loader.add_value('v11', val11)
        val12 = response.xpath(
            "//div[@class='ux-layout-section__item ux-layout-section__item--table-view']"
            "/div[@class='ux-layout-section__row'][6]"
            "/div[@class='ux-labels-values__values'][2]"
            "/div[@class='ux-labels-values__values-content']/div/span//text()"
        ).get()
        loader.add_value('v12', val12)
        yield loader.load_item()


process = CrawlerProcess(
    settings={
        'FEED_URI': 'test.jl',
        'FEED_FORMAT': 'jsonlines',
    }
)
process.crawl(EbaySpider)
process.start()
Я также пробовал форматировать loaders
подобное:
# Shared template for the row/kind-specific xpath; formatting it produces the
# exact same queries the inline literals did.
row_xpath = (
    "//div[@class='ux-layout-section__item ux-layout-section__item--table-view']"
    "/div[@class='ux-layout-section__row'][{row}]"
    "/div[@class='ux-labels-values__{kind}'][2]"
    "/div[@class='ux-labels-values__{kind}-content']/div/span//text()"
)

# Row-5 and row-6 label texts, loaded together under a single field.
labels = [
    response.xpath(row_xpath.format(row=row, kind='labels')).get()
    for row in (5, 6)
]
loader.add_value('eleven', labels)

#values
value_texts = [
    response.xpath(row_xpath.format(row=row, kind='values')).get()
    for row in (5, 6)
]
loader.add_value('v11', value_texts)
Однако я считаю, что при этом выбираются только значения из последнего ответа и первой переменной в loader.add_value
Кажется, это работает довольно эффективно, когда я это делаю:
# Per the question text, wrapping the pair in a dict keyed by the field name
# appears to keep both values together instead of TakeFirst collapsing them
# to the first one — TODO confirm this is the intended output shape.
loader.add_value("v11", {"v11":[val11, val12]})
Ответ №1:
Если я правильно понимаю ваш вопрос, то ваш xpath неверен.
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader


class EbayItem(scrapy.Item):
    """Item for an eBay card listing; TakeFirst collapses each field to one value."""
    category = Field(output_processor=TakeFirst())
    name = Field(output_processor=TakeFirst())
    price = Field(output_processor=TakeFirst())
    product_url = Field(output_processor=TakeFirst())
    # Aggregated fields: each holds a single list, so TakeFirst unwraps the
    # outer wrapping list and leaves the inner list intact.
    numeral = Field(output_processor=TakeFirst())
    values = Field(output_processor=TakeFirst())


class EbaySpider(scrapy.Spider):
    name = 'ebay'
    # One search URL per card language; the dict key doubles as the category
    # label. ("&" separators restored — they had been HTML-escaped to "&amp;"
    # and lost the ampersand during extraction.)
    start_urls = {
        'english': 'https://www.ebay.com/sch/i.html?_from=R40&_nkw=pokemon cards&_sacat=2536&LH_TitleDesc=0&_sop=16&LH_All=1&rt=nc&Language=English&_dcat=183454',
        'japanese': 'https://www.ebay.com/sch/i.html?_from=R40&_nkw=pokemon cards&_sacat=2536&LH_TitleDesc=0&_sop=16&LH_All=1&_oaa=1&rt=nc&Language=Japanese&_dcat=183454',
    }

    def start_requests(self):
        """Issue one search request per category, forwarding the category name."""
        for category, url in self.start_urls.items():
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                cb_kwargs={'category': category},
            )

    def parse(self, response, category):
        """Collect listing-level fields, then follow each product page."""
        all_cards = response.xpath('//div[@class="s-item__wrapper clearfix"]')
        for card in all_cards:
            loader = ItemLoader(EbayItem(), selector=card)
            loader.add_value('category', category)
            loader.add_xpath('name', './/h3/text()')
            loader.add_xpath('price', './/span[@class="s-item__price"]//text()')
            loader.add_xpath('product_url', './/a[@class="s-item__link"]//@href')
            yield scrapy.Request(
                card.xpath('.//a[@class="s-item__link"]//@href').get(),
                callback=self.parse_product_details,
                cb_kwargs={'loader': loader},
            )

    def parse_product_details(self, response, loader):
        """Anchor each lookup on the visible label text rather than row position."""
        numeral = ["character", "condition", "score", "graded"]
        values = []
        character = response.xpath(
            '//div[div[div[span[text()="Character:"]]]]/following-sibling::div//span/text()'
        ).get(default='Not-found')
        values.append(character)
        condition = response.xpath(
            '//div[div[div[span[text()="Card Condition:"]]]]/following-sibling::div//span/text()'
        ).get(default='Unknown')
        values.append(condition)
        grade = response.xpath(
            '//div[div[div[span[text()="Grade:"]]]]/following-sibling::div//span/text()'
        ).get(default='Unknown')
        values.append(grade)
        graded = response.xpath(
            '//div[div[div[span[text()="Graded:"]]]]/following-sibling::div//span/text()'
        ).get(default='No')
        values.append(graded)
        # add_value(None, dict) distributes each key to its matching item field.
        loader.add_value(None, {'numeral': [numeral], 'values': [values]})
        yield loader.load_item()


process = CrawlerProcess(
    settings={
        'FEED_URI': 'test.jl',
        'FEED_FORMAT': 'jsonlines',
    }
)
process.crawl(EbaySpider)
process.start()
Выход:
{'category': 'english', 'name': 'PSA 10 1st Edition Fossil COMPLETE Set 62/62 Pokemon Cards', 'numeral': ['character', 'condition', 'score', 'graded'], 'price': 'GBP 69,420.00', 'product_url': 'https://www.ebay.com/itm/292969293305?hash=item443654a5f9:g:CrcAAOSwlR9cavzM&LH_All=1', 'values': ['Dragonite', 'Near Mint or better', 'Unknown', 'No']} [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ebay.com/itm/175035501592?hash=item28c0edcc18:g:39EAAOSwE~thnY31&LH_All=1> {'category': 'japanese', 'name': 'Beckett Gem Mint No Rarity Venusaur BGS 9.5 1st edition BCCG Pokemon ' 'Card 1996', 'numeral': ['character', 'condition', 'score', 'graded'], 'price': 'AU $85,000.00', 'product_url': 'https://www.ebay.com/itm/175035501592?hash=item28c0edcc18:g:39EAAOSwE~thnY31&LH_All=1', 'values': ['Venusaur', 'Unknown', '9.5', 'Yes']} And so on