#python #excel
Вопрос:
Добрый вечер!
Я пытаюсь экспортировать данные JSON, которые я выбрал с помощью Selenium (Предварительный тег), в файл CSV. Я хочу указать конкретные данные и преобразовать их в CSV. Чтобы выбрать другой тип данных, я использую Convtool и конвертирую их.
Например, это весь код JSON со страницы:
{"getUrl":"/395012/Organization/pase11469","className":"Organization","data":{"name":"Förskola Garphyttan","organizationNumber":"212000-1967","centralPhoneNumber":{"value":"019-215030","normalized":" 4619215030","className":"PhoneNumber","isEmpty":false},"faxPhoneNumber":{"value":null,"normalized":null,"className":"PhoneNumber","isEmpty":true},"website":"www.orebro.se","email":{"value":"","className":"Email"},"dateLastModified":"/Date(-62135596800000)/","visitAddress":{"street":"Kilsvägen 2 B","zipCode":"719 40","city":"GARPHYTTAN","countryCode":""},"postalAddress":{"street":"Kilsvägen 2 B","zipCode":"719 40","city":"GARPHYTTAN","countryCode":""},"responsibleCoworker":null,"integrationid":null,"customFields":[],"relation":0,"tags":null,"headOffice":{"id":"pase276386","getUrl":"/395012/Organization/pase276386","className":"Organization","heading":"Örebro kommun","subHeading":"Örebro","categorization":null,"isDeleted":false},"corporateGroup":{"id":"pase276386","getUrl":"/395012/Organization/pase276386","className":"Organization","heading":"Örebro kommun","subHeading":"Örebro","categorization":null,"isDeleted":false},"sharedBody":{"vatNumber":"SE212000196701","lineOfBusiness":"Förskoleutbildning","businessDescription":null,"legalForm":"Kommun","dateOfRegistration":"1975-07-01","legalName":"Örebro kommun","rating":null,"numberOfSubsidaries":2,"numberOfEmployeesRange":">10000","numberOfEmployeesWorkSite":"10 - 19","dividends":null,"employeeGrowth":null,"equityRatio":null,"quickRatio":null,"resultBeforeTax":null,"salariesBoardMembers":null,"salariesOthers":null,"turnoverGrowth":null,"turnover":null,"numberOfEmployees":null,"averageSalaryPerEmployee":null,"turnoverPerEmployee":null,"profitMargin":null,"resultAfterFinancialItems":null,"ebita":null},"temperature":0},"actions":{"getHistory":"/395012/Organization/pase11469/getHistory","getFuture":"/395012/Organization/pase11469/getFuture","getDocuments":"/395012/Organization/pase11469/getDocuments","getEmployees":"/395012/Organization/pase11469/getEmployees","getDeals":"/395012/Organization/pase11469/getDeals","addFileForm":"/395012/Organization/pase11469/addFileForm","addFile":"/395012/Organization/pase11469/addFile","addLink":"/395012/Organization/pase11469/addLink","update":"/395012/Organization/pase11469/update","setResponsibleCoworker":"/395012/Organization/pase11469/setResponsibleCoworker","follow":"/395012/Organization/pase11469/follow","setRelation":"/395012/Organization/pase11469/setRelation","getSuggestedPersons":"/395012/SuggestedPersons/pase11469/getForOrganization","searchEmployees":"/395012/OrganizationSearch/pase11469/employees","getEmployeeTemplate":"/395012/Person/GetTemplate?employerId=pase11469","addToTargetList":"/395012/targetAppender/appendOrganization?organizationId=pase11469","addTag":"/395012/Organization/pase11469/addTag","removeTag":"/395012/Organization/pase11469/removeTag","getUnassignedTags":"/395012/Organization/pase11469/getUnassignedTags","addNewTag":"/395012/Organization/pase11469/addNewTag","createTodo":"/395012/todo/create?organizationId=pase11469","getMeetingTemplate":"/395012/Meeting/getTemplate?OrganizationId=pase11469"},"id":"pase11469","heading":"Förskola Garphyttan","subHeading":"Garphyttan","source":{"data":{},"conflicts":null},"whoIsHere":[],"isTouched":false,"isFollowing":false,"errors":null,"categorization":["branchoffice","subsidiary"],"relatedCategorizations":{"/395012/Organization/pase276386":["headoffice","corporategroup"]}}
Когда я распечатываю его после того, как я его преобразовал:
[[{'f_namn': 'Two Cents AB', 'org_nr': '556424-4381', 'Phone Number': None}]]
[[{'f_namn': 'Two Cents AB', 'org_nr': '556424-4381', 'Phone Number': None}], [{'f_namn': 'Förskola Garphyttan', 'org_nr': '212000-1967', 'Phone Number': ' 4619215030'}]]
Когда я пытаюсь запустить свой код, я получаю эту ошибку:
File "c:UsersTobiasProjectsLime-GOLime-GO.py", line 98, in find_company
xd = pd.json_normalize(prepared_data)
File "C:UsersTobiasAppDataLocalProgramsPythonPython39libsite-packagespandasiojson_normalize.py", line 270, in _json_normalize
if any([isinstance(x, dict) for x in y.values()] for y in data):
File "C:UsersTobiasAppDataLocalProgramsPythonPython39libsite-packagespandasiojson_normalize.py", line 270, in <genexpr>
if any([isinstance(x, dict) for x in y.values()] for y in data):
AttributeError: 'list' object has no attribute 'values'
Это работает, когда я пропускаю конвертер и просто экспортирую все данные в Excel, но там нет структуры, и мне не нужна вся информация.
Мой PY-код:
lst = []
for k in range(11468, 11470, 1):
link = baseLink str(k)
self.driver.get(link)
# Convert what I want
converter = c.list_comp(
{
"f_namn": c.item("data", "name"),
"org_nr": c.item("data", "organizationNumber"),
"Phone Number": c.item("data", "centralPhoneNumber", "normalized"),
}
).gen_converter() # install "black" to see formatted sources
res = self.driver.find_element_by_tag_name("pre").text
lst.append(converter([json.loads(res)]))
prepared_data = lst
xd = pd.json_normalize(prepared_data)
xd.to_excel("list.xlsx", index=False, sheet_name="Sheet2")
Ответ №1:
Я полагаю, что вы чрезмерно завернули данные в список. Кроме того, я бы посоветовал вам разделить свой код на значимые части.
import json
from urllib.parse import urljoin
from convtools import conversion as c
class YourClassName:
base_link = "your base link goes here"
def fetch_page(self, k):
url = urljoin(self.base_link, str(k))
print(f"getting {url}")
self.driver.get(url)
# TODO: handle when there's no element
# add waiting code
element = self.driver.find_element_by_tag_name("pre")
return element and element.text
def iter_pages(self):
for k in range(11468, 11470, 1):
data = self.fetch_page(k)
if data is None:
print(f"failed to load page #{k}, skipping")
continue
yield json.loads(data)
def fetch_all(self):
# you can move this out of the class
convert = c.list_comp(
{
"f_namn": c.item("data", "name"),
"org_nr": c.item("data", "organizationNumber"),
"Phone Number": c.item(
"data", "centralPhoneNumber", "normalized"
),
}
).gen_converter() # install "black" to see formatted sources
data = convert(self.iter_pages())
return data
def dump_to_xlsx(self):
# your to_excel code goes here
YourClassName().fetch_all()