#python #web-scraping #scrapy #http-status-code-404
#python #веб-скрейпинг #scrapy #http-status-code-404
Вопрос:
'# -*- coding: utf-8 -*-
import scrapy
import json
class NtsschoolSpider(scrapy.Spider):
    """Scrape school contact details from the NT schools directory JSON API.

    The crawl runs in three steps: ``parse`` requests the list of all
    schools, ``parse_api`` requests the detail record of every school in
    that list, and ``parse_url`` yields one item per detail record.
    """

    name = 'ntsschool'
    start_urls = ['https://directory.ntschools.net/#/schools']
    # Headers attached to every API request issued by this spider.
    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9,ur;q=0.8",
        "Referer": "https://directory.ntschools.net/",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
        "X-Requested-With": "Fetch",
    }

    def parse(self, response):
        """Request the JSON list of all schools.

        The response to the start URL is not inspected; it only triggers
        the first API request, whose response goes to ``parse_api``.
        """
        url = 'https://directory.ntschools.net/api/System/GetAllSchools'
        yield scrapy.Request(
            url,
            callback=self.parse_api,
            headers=self.headers,
        )

    def parse_api(self, response):
        """Yield one detail request per entry of the school list.

        The response body is decoded as JSON and iterated; each entry's
        ``itSchoolCode`` value is appended to ``base_url`` to build the
        detail URL, whose response goes to ``parse_url``.
        """
        # NOTE(review): there is no '=' after the parameter name, so the
        # school code is glued directly onto it
        # ("...?itSchoolCodelarapsch"). These are exactly the URLs that
        # come back as 404 in the error log; the URL should end with
        # "?itSchoolCode=".
        base_url = 'https://directory.ntschools.net/api/System/GetSchool?itSchoolCode'
        raw_data = response.body
        data = json.loads(raw_data)
        for school in data:
            school_code = school['itSchoolCode']
            school_url = base_url + school_code
            request = scrapy.Request(
                school_url,
                callback=self.parse_url,
                headers=self.headers,
            )
            yield request

    def parse_url(self, response):
        """Yield one item with the contact fields of a single school.

        The response body is decoded as JSON; a missing key raises
        ``KeyError``.
        """
        raw_data = response.body
        data = json.loads(raw_data)
        yield {
            'Name': data['name'],
            'Physical_address': data['physicalAddress']['displayAddress'],
            'Postal_address': data['postalAddress']['displayAddress'],
            'Email': data['mail'],
            'Phone': data['telephoneNumber'],
        }
'
Ошибка:
2020-11-26 12:18:42 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <404 https://directory.ntschools.net/api/System/GetSchool?itSchoolCodelarapsch>: HTTP status code is not handled or not allowed
2020-11-26 12:18:42 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <404 https://directory.ntschools.net/api/System/GetSchool?itSchoolCodelarrasch>: HTTP status code is not handled or not allowed
2020-11-26 12:18:42 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <404 https://directory.ntschools.net/api/System/GetSchool?itSchoolCodekathesch>: HTTP status code is not handled or not allowed
Комментарии:
1. Посмотрите на URL-адреса, которые возвращают 404. Видите, что с ними не так? В частности, обратите внимание на строки запроса
`?itSchoolCodelarapsch`,
`?itSchoolCodelarrasch`
и `?itSchoolCodekathesch`.
2. Спасибо, что указали на ошибку, теперь я понимаю, в чём она.
Ответ №1:
Простая опечатка: вы забыли «=» в конце строки base_url, поэтому код школы приклеивается прямо к имени параметра.
Просто добавьте этот символ, и всё заработает:
base_url = 'https://directory.ntschools.net/api/System/GetSchool?itSchoolCode='
Комментарии:
1. Спасибо, я добавил «=», и всё заработало.