Хранение нескольких загрузчиков в списке

#python #scrapy

#питон #scrapy

Вопрос:

Я извлекаю данные веб-страницы, и мне нужно сохранить выходные данные некоторых загрузчиков в списке словарей. Например, часть моего вывода будет выглядеть так:

 {"nine": "character", "ten": "condition", "eleven": "score", "twelve": "graded", "v9": "Electabuzz", "v10": "Near Mint", "v11": "8", "v12": "Yes"}  

Вместо этого я хочу сохранить эти значения в виде двух отдельных списков, один для тех, которые начинаются с v , а другой для цифр. Например,

 {"numeral": ["character", "condition", "score", "graded"], "values": ["Electabuzz", "Near Mint", "8", "Yes"]}  

Я пытаюсь выполнить это в рамках scrapy, однако я не могу получить результат, подобный приведенному выше, например, вот мой код:

 import scrapy from scrapy.item import Field from itemloaders.processors import TakeFirst from scrapy.crawler import CrawlerProcess  from scrapy.loader import ItemLoader   class EbayItem(scrapy.Item):  category = Field(output_processor=TakeFirst())  name = Field(output_processor=TakeFirst())  price = Field(output_processor=TakeFirst())  product_url = Field(output_processor=TakeFirst())  eleven = Field(output_processor=TakeFirst())  twelve = Field(output_processor=TakeFirst())  v11 = Field(output_processor=TakeFirst())  v12 = Field(output_processor=TakeFirst())  class EbaySpider(scrapy.Spider):  name = 'ebay'  start_urls = {  'english': 'https://www.ebay.com/sch/i.html?_from=R40amp;_nkw=pokemon cardsamp;_sacat=2536amp;LH_TitleDesc=0amp;_sop=16amp;LH_All=1amp;rt=ncamp;Language=Englishamp;_dcat=183454',   'japanese':'https://www.ebay.com/sch/i.html?_from=R40amp;_nkw=pokemon cardsamp;_sacat=2536amp;LH_TitleDesc=0amp;_sop=16amp;LH_All=1amp;_oaa=1amp;rt=ncamp;Language=Japaneseamp;_dcat=183454'    }    def start_requests(self):   for category, url in self.start_urls.items():  yield scrapy.Request(  url=url,   callback=self.parse,  cb_kwargs={  'category': category  }  )   def parse(self, response, category):  all_cards = response.xpath('//div[@class="s-item__wrapper clearfix"]')  for card in all_cards:    loader = ItemLoader(EbayItem(), selector=card)  loader.add_value('category', category)  loader.add_xpath('name', './/h3/text()')  loader.add_xpath('price', './/span[@class="s-item__price"]//text()')  loader.add_xpath('product_url', './/a[@class="s-item__link"]//@href')    yield scrapy.Request(  card.xpath('.//a[@class="s-item__link"]//@href').get(),  callback=self.parse_product_details,  cb_kwargs={'loader': loader}  )   def parse_product_details(self, response, loader):  #content - names  data11 = response.xpath("//div[@class='ux-layout-section__item 
ux-layout-section__item--table-view']/div[@class='ux-layout-section__row'][5]/div[@class='ux-labels-values__labels'][2]/div[@class='ux-labels-values__labels-content']/div/span//text()").get()  loader.add_value('eleven', data11)  data12 = response.xpath("//div[@class='ux-layout-section__item ux-layout-section__item--table-view']/div[@class='ux-layout-section__row'][6]/div[@class='ux-labels-values__labels'][2]/div[@class='ux-labels-values__labels-content']/div/span//text()").get()  loader.add_value('twelve', data12)   #values  val11 = response.xpath("//div[@class='ux-layout-section__item ux-layout-section__item--table-view']/div[@class='ux-layout-section__row'][5]/div[@class='ux-labels-values__values'][2]/div[@class='ux-labels-values__values-content']/div/span//text()").get()  loader.add_value('v11', val11)  val12 = response.xpath("//div[@class='ux-layout-section__item ux-layout-section__item--table-view']/div[@class='ux-layout-section__row'][6]/div[@class='ux-labels-values__values'][2]/div[@class='ux-labels-values__values-content']/div/span//text()").get()  loader.add_value('v12', val12)   yield loader.load_item()   process = CrawlerProcess(  settings={  'FEED_URI': 'test.jl',  'FEED_FORMAT': 'jsonlines'  } ) process.crawl(EbaySpider) process.start()   

Я также пробовал форматировать loaders подобное:

 data11 = response.xpath("//div[@class='ux-layout-section__item ux-layout-section__item--table-view']/div[@class='ux-layout-section__row'][5]/div[@class='ux-labels-values__labels'][2]/div[@class='ux-labels-values__labels-content']/div/span//text()").get() data12 = response.xpath("//div[@class='ux-layout-section__item ux-layout-section__item--table-view']/div[@class='ux-layout-section__row'][6]/div[@class='ux-labels-values__labels'][2]/div[@class='ux-labels-values__labels-content']/div/span//text()").get()  loader.add_value('eleven', [data11, data12])   #values val11 = response.xpath("//div[@class='ux-layout-section__item ux-layout-section__item--table-view']/div[@class='ux-layout-section__row'][5]/div[@class='ux-labels-values__values'][2]/div[@class='ux-labels-values__values-content']/div/span//text()").get() val12 = response.xpath("//div[@class='ux-layout-section__item ux-layout-section__item--table-view']/div[@class='ux-layout-section__row'][6]/div[@class='ux-labels-values__values'][2]/div[@class='ux-labels-values__values-content']/div/span//text()").get()  loader.add_value('v11', [val11, val12])  

Однако я считаю, что при этом выбираются только значения из последнего ответа и первой переменной в loader.add_value

Кажется, это работает довольно эффективно, когда я это делаю:

 # NOTE(review): the asker reports this variant keeps both values together —
 # presumably because the dict value is stored as-is under "v11" instead of
 # being flattened; confirm against ItemLoader.add_value semantics.
 loader.add_value("v11", {"v11":[val11, val12]})  

Ответ №1:

Если я правильно понимаю ваш вопрос, то ваш xpath неверен.

 import scrapy from scrapy.item import Field from itemloaders.processors import TakeFirst from scrapy.crawler import CrawlerProcess  from scrapy.loader import ItemLoader   class EbayItem(scrapy.Item):  category = Field(output_processor=TakeFirst())  name = Field(output_processor=TakeFirst())  price = Field(output_processor=TakeFirst())  product_url = Field(output_processor=TakeFirst())  numeral = Field(output_processor=TakeFirst())  values = Field(output_processor=TakeFirst())   class EbaySpider(scrapy.Spider):  name = 'ebay'  start_urls = {  'english': 'https://www.ebay.com/sch/i.html?_from=R40amp;_nkw=pokemon cardsamp;_sacat=2536amp;LH_TitleDesc=0amp;_sop=16amp;LH_All=1amp;rt=ncamp;Language=Englishamp;_dcat=183454',  'japanese':'https://www.ebay.com/sch/i.html?_from=R40amp;_nkw=pokemon cardsamp;_sacat=2536amp;LH_TitleDesc=0amp;_sop=16amp;LH_All=1amp;_oaa=1amp;rt=ncamp;Language=Japaneseamp;_dcat=183454'  }   def start_requests(self):  for category, url in self.start_urls.items():  yield scrapy.Request(  url=url,  callback=self.parse,  cb_kwargs={  'category': category  }  )   def parse(self, response, category):  all_cards = response.xpath('//div[@class="s-item__wrapper clearfix"]')  for card in all_cards:   loader = ItemLoader(EbayItem(), selector=card)  loader.add_value('category', category)  loader.add_xpath('name', './/h3/text()')  loader.add_xpath('price', './/span[@class="s-item__price"]//text()')  loader.add_xpath('product_url', './/a[@class="s-item__link"]//@href')   yield scrapy.Request(  card.xpath('.//a[@class="s-item__link"]//@href').get(),  callback=self.parse_product_details,  cb_kwargs={'loader': loader}  )   def parse_product_details(self, response, loader):  numeral = ["character", "condition", "score", "graded"]  values = []   character = response.xpath('//div[div[div[span[text()="Character:"]]]]/following-sibling::div//span/text()').get(default='Not-found')  values.append(character)   condition = response.xpath('//div[div[div[span[text()="Card 
Condition:"]]]]/following-sibling::div//span/text()').get(default='Unknown')  values.append(condition)   grade = response.xpath('//div[div[div[span[text()="Grade:"]]]]/following-sibling::div//span/text()').get(default='Unknown')  values.append(grade)   graded = response.xpath('//div[div[div[span[text()="Graded:"]]]]/following-sibling::div//span/text()').get(default='No')  values.append(graded)   loader.add_value(None, {'numeral': [numeral], 'values': [values]})  yield loader.load_item()   process = CrawlerProcess(  settings={  'FEED_URI': 'test.jl',  'FEED_FORMAT': 'jsonlines'  } ) process.crawl(EbaySpider) process.start()  

Выход:

 {'category': 'english',  'name': 'PSA 10 1st Edition Fossil COMPLETE Set 62/62 Pokemon Cards',  'numeral': ['character', 'condition', 'score', 'graded'],  'price': 'GBP 69,420.00',  'product_url': 'https://www.ebay.com/itm/292969293305?hash=item443654a5f9:g:CrcAAOSwlR9cavzM&LH_All=1',  'values': ['Dragonite', 'Near Mint or better', 'Unknown', 'No']} [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ebay.com/itm/175035501592?hash=item28c0edcc18:g:39EAAOSwE~thnY31&LH_All=1> {'category': 'japanese',  'name': 'Beckett Gem Mint No Rarity Venusaur BGS 9.5 1st edition BCCG Pokemon '  'Card 1996',  'numeral': ['character', 'condition', 'score', 'graded'],  'price': 'AU $85,000.00',  'product_url': 'https://www.ebay.com/itm/175035501592?hash=item28c0edcc18:g:39EAAOSwE~thnY31&LH_All=1',  'values': ['Venusaur', 'Unknown', '9.5', 'Yes']}  And so on