#python #pyspark #apache-spark-sql
#python #pyspark #apache-spark-sql
Вопрос:
У меня есть этот вложенный JSON, из которого я извлекаю данные. Я делаю это с помощью spark.sql(), но как я могу извлекать элементы, которые появляются несколько раз? Я хотел бы иметь строку для каждого из этих конкретных элементов.
Например, вот пример моего файла JSON:
{
"httpStatus": 200,
"httpStatusMessage": "success",
"timestamp": "2020-11-11T19:46:01",
"response": {
"header": {
"fleetId": 10006,
"fleetName": "Naples",
"date": "2020-11-04T00:00:00",
"gpsUnit": "dd",
"speedUnit": "mph",
"accelUnit": "g",
"distanceUnit": "miles"
},
"body": {
"dataProviders": [
{
"dataProviderId": 14,
"drivers": [
{
"driverRef": "b119",
"driverId": 1000611055,
"driverFirstName": "John",
"driverLastName": "Doedoe",
"totalDistance": 238.87,
"vehicles": [
{
"deviceRef": null,
"deviceId": 0,
"vehicleRef": "1LNHL9DKXFG607058",
"vehicleId": 1000621018,
"trips": [
{
"tripId": "2020-11-04-10006-1000616018-10",
"tripDuration": 93.42,
"tripDistanceTravelled": 60.93,
"averageSpeed": 39.1,
"tripStart": {
"longitude": -81.7546463012695,
"latitude": 26.5263614654541,
"heading": 0,
"speed": 0,
"mileage": 0,
"timestamp": "2020-11-04T15:54:11Z"
},
"tripStop": {
"longitude": -81.7561874389648,
"latitude": 26.5265655517578,
"heading": 0,
"speed": 0,
"mileage": 0,
"timestamp": "2020-11-04T17:27:36.063Z"
}
},
{
"tripId": "2020-11-04-10006-1000616018-11",
"tripDuration": 60.03,
"tripDistanceTravelled": 32.11,
"averageSpeed": 32.1,
"tripStart": {
"longitude": -81.7561874389648,
"latitude": 26.5265655517578,
"heading": 0,
"speed": 0,
"mileage": 0,
"timestamp": "2020-11-04T17:46:55.063Z"
},
"tripStop": {
"longitude": -81.7723388671875,
"latitude": 26.2240772247314,
"heading": 0,
"speed": 0,
"mileage": 0,
"timestamp": "2020-11-04T18:46:57Z"
}
},
{
"tripId": "2020-11-04-10006-1000616018-8",
"tripDuration": 49.77,
"tripDistanceTravelled": 25.56,
"averageSpeed": 30.8,
"tripStart": {
"longitude": -81.7721862792969,
"latitude": 26.2239742279053,
"heading": 0,
"speed": 5.6,
"mileage": 0,
"timestamp": "2020-11-04T14:29:41Z"
},
"tripStop": {
"longitude": -81.7538528442383,
"latitude": 26.5260028839111,
"heading": 0,
"speed": 5.6,
"mileage": 0,
"timestamp": "2020-11-04T15:19:27Z"
}
}
]
},
{
"deviceRef": null,
"deviceId": 0,
"vehicleRef": "1GNSCGKCXKR180027",
"vehicleId": 1000621026,
"trips": [
{
"tripId": "2020-11-04-10006-1000616026-2",
"tripDuration": 59.95,
"tripDistanceTravelled": 30.41,
"averageSpeed": 30.4,
"tripStart": {
"longitude": -81.754524230957,
"latitude": 26.5265655517578,
"heading": 0,
"speed": 0,
"mileage": 0,
"timestamp": "2020-11-04T01:23:54.063Z"
},
"tripStop": {
"longitude": -81.7721328735352,
"latitude": 26.2240772247314,
"heading": 0,
"speed": 0,
"mileage": 0,
"timestamp": "2020-11-04T02:23:51.063Z"
}
}
]
}
]
}
]
}
]
}
}
}
Существует три экземпляра trips, а внутри trips есть три экземпляра tripID, tripDuration и т.д…
Когда я извлекаю данные, я начинаю получать все данные в массиве, и мне нужно правильно извлечь их, но я не уверен, как извлечь эти данные и сохранить их в отдельной строке, сопоставив их с родительскими данными.
Вот мой пример кода:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
json_df = spark.read.json('/user/myuser/json_directory/sample.json')
json_df.printSchema()
json_df.createOrReplaceTempView('test_dictionary')
spark.sql('''select * from test_dictionary''').show()
spark.sql('''select
httpStatus
, httpStatusMessage
, test.dataProviderId
, tmp1.driverFirstName
, tmp1.driverId
, tmp1.driverLastName
, tmp1.driverRef
, tmp1.totalDistance
, tmp2.deviceId
, tmp2.vehicleId
, tmp2.vehicleRef
, tmp3.averageSpeed
, tmp3.tripDistanceTravelled
, tmp3.tripDuration
, tmp3.tripId
from test_dictionary
lateral view outer explode (response.body.dataProviders)a as test
lateral view outer explode (response.body.dataProviders.drivers)b as tmp1
lateral view outer explode (tmp1.vehicles)c as tmp2
lateral view outer explode (tmp2.trips)d as tmp3''').show()
результаты выглядят так:
root
|-- httpStatus: long (nullable = true)
|-- httpStatusMessage: string (nullable = true)
|-- response: struct (nullable = true)
| |-- body: struct (nullable = true)
| | |-- dataProviders: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- dataProviderId: long (nullable = true)
| | | | |-- drivers: array (nullable = true)
| | | | | |-- element: struct (containsNull = true)
| | | | | | |-- driverFirstName: string (nullable = true)
| | | | | | |-- driverId: long (nullable = true)
| | | | | | |-- driverLastName: string (nullable = true)
| | | | | | |-- driverRef: string (nullable = true)
| | | | | | |-- totalDistance: double (nullable = true)
| | | | | | |-- vehicles: array (nullable = true)
| | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | |-- deviceId: long (nullable = true)
| | | | | | | | |-- deviceRef: string (nullable = true)
| | | | | | | | |-- trips: array (nullable = true)
| | | | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | | | |-- averageSpeed: double (nullable = true)
| | | | | | | | | | |-- tripDistanceTravelled: double (nullable = true)
| | | | | | | | | | |-- tripDuration: double (nullable = true)
| | | | | | | | | | |-- tripId: string (nullable = true)
| | | | | | | | | | |-- tripStart: struct (nullable = true)
| | | | | | | | | | | |-- heading: double (nullable = true)
| | | | | | | | | | | |-- latitude: double (nullable = true)
| | | | | | | | | | | |-- longitude: double (nullable = true)
| | | | | | | | | | | |-- mileage: double (nullable = true)
| | | | | | | | | | | |-- speed: double (nullable = true)
| | | | | | | | | | | |-- timestamp: string (nullable = true)
| | | | | | | | | | |-- tripStop: struct (nullable = true)
| | | | | | | | | | | |-- heading: double (nullable = true)
| | | | | | | | | | | |-- latitude: double (nullable = true)
| | | | | | | | | | | |-- longitude: double (nullable = true)
| | | | | | | | | | | |-- mileage: double (nullable = true)
| | | | | | | | | | | |-- speed: double (nullable = true)
| | | | | | | | | | | |-- timestamp: string (nullable = true)
| | | | | | | | |-- vehicleId: long (nullable = true)
| | | | | | | | |-- vehicleRef: string (nullable = true)
| |-- header: struct (nullable = true)
| | |-- accelUnit: string (nullable = true)
| | |-- date: string (nullable = true)
| | |-- distanceUnit: string (nullable = true)
| | |-- fleetId: long (nullable = true)
| | |-- fleetName: string (nullable = true)
| | |-- gpsUnit: string (nullable = true)
| | |-- speedUnit: string (nullable = true)
|-- timestamp: string (nullable = true)
---------- ----------------- -------------------- -------------------
|httpStatus|httpStatusMessage| response| timestamp|
---------- ----------------- -------------------- -------------------
| 200| success|[[[[14, [[Eric, 1...|2020-11-11T19:46:01|
---------- ----------------- -------------------- -------------------
---------- ----------------- -------------- --------------- ------------ -------------- --------- ------------- -------- -------------------- -------------------- ------------------ --------------------- -------------------- --------------------
|httpStatus|httpStatusMessage|dataProviderId|driverFirstName| driverId|driverLastName|driverRef|totalDistance|deviceId| vehicleId| vehicleRef| averageSpeed|tripDistanceTravelled| tripDuration| tripId|
---------- ----------------- -------------- --------------- ------------ -------------- --------- ------------- -------- -------------------- -------------------- ------------------ --------------------- -------------------- --------------------
| 200| success| 14| [John]|[1000611055]| [Doedoe]| [b119]| [238.87]| [0, 0]|[1000621018, 1000...|[1LNHL9DKXFG60705...|[39.1, 32.1, 30.8]| [60.93, 32.11, 25...|[93.42, 60.03, 49...|[2020-11-04-10006...|
| 200| success| 14| [John]|[1000611055]| [Doedoe]| [b119]| [238.87]| [0, 0]|[1000621018, 1000...|[1LNHL9DKXFG60705...| [30.4]| [30.41]| [59.95]|[2020-11-04-10006...|
---------- ----------------- -------------- --------------- ------------ -------------- --------- ------------- -------- -------------------- -------------------- ------------------ --------------------- -------------------- --------------------
Как мне получить мои результаты, чтобы они выглядели так?:
httpStatus| httpStatusMessage| dataProviderId| driverFirstName driverId driverLastName driverRef| totalDistance| deviceId vehicleId vehicleRef averageSpeed tripDistanceTravelled tripDuration tripId tripStartLongitude latitude heading speed mileage timestamp tripstoplongitude latitude heading speed mileage timestamp
200 success 14 John 1000611055 Doedoe b119 238.87 0 1000621018 1LNHL9DKXFG607058 39.1 60.93 93.42 2020-11-04-10006-1000616018-10" -81 26 0 0 0 2020-11-04T15:54:11Z -81.75618744 26.52656555 0 0 0 2020-11-04T17:27:36.063Z
200 success 14 John 1000611055 Doedoe b119 238.87 0 1000621018 1LNHL9DKXFG607058 32.1 32.11 60.03 2020-11-04-10006-1000616018-11 -81.7 26.5 0 0 0 2020-11-04T17:46:55.063Z -81.77 26.22 0 0 0 2020-11-04T18:46:57Z
200 success 14 John 1000611055 Doedoe b119 238.87 0 1000621018 1LNHL9DKXFG607058 30.8 25.56 49.77 2020-11-04-10006-1000616018-8" -81.7 26.2 0 5.6 0 2020-11-04T14:29:41Z -81.74567899 26.52098 0 5.6 0 2020-11-04T15:19:27Z
200 success 14 John 1000611055 Doedoe b119 238.87 0 1000621026 1GNSCGKCXKR180027 30.4 30.41 59.95 2020-11-04-10006-1000616026-2 -81.7 26.5265 0 0 0 2020-11-04T01:23:54.063Z" -81.7721328 26.22407722 0 0 0 2020-11-04T02:23:51.063Z
Ответ №1:
Проверьте ниже SQL
.
Разнесите все Array
столбцы типа.
spark.sql('''
select
httpStatus,
httpStatusMessage,
dataProviders.dataProviderId,
drivers.driverFirstName,
drivers.driverId,
drivers.driverLastName,
drivers.driverRef,
drivers.totalDistance,
vehicles.deviceId,
vehicles.vehicleId,
vehicles.vehicleRef,
trips.averageSpeed,
trips.tripDistanceTravelled,
trips.tripDuration,
trips.tripId,
trips.tripStart.longitude as tripStart_longitude, // Added tripStart as prefix because tripStart amp; tripStop has same columns.
trips.tripStart.latitude as tripStart_latitude, // Added tripStart as prefix because tripStart amp; tripStop has same columns.
trips.tripStart.heading as tripStart_heading, // Added tripStart as prefix because tripStart amp; tripStop has same columns.
trips.tripStart.speed as tripStart_speed, // Added tripStart as prefix because tripStart amp; tripStop has same columns.
trips.tripStart.mileage as tripStart_mileage, // Added tripStart as prefix because tripStart amp; tripStop has same columns.
trips.tripStart.timestamp as tripStart_timestamp, // Added tripStart as prefix because tripStart amp; tripStop has same columns.
trips.tripStop.longitude as tripStop_longitude, // Added tripStop as prefix because tripStart amp; tripStop has same columns.
trips.tripStop.latitude as tripStop_latitude, // Added tripStop as prefix because tripStart amp; tripStop has same columns.
trips.tripStop.heading as tripStop_heading, // Added tripStop as prefix because tripStart amp; tripStop has same columns.
trips.tripStop.speed as tripStop_speed, // Added tripStop as prefix because tripStart amp; tripStop has same columns.
trips.tripStop.mileage as tripStop_mileage, // Added tripStop as prefix because tripStart amp; tripStop has same columns.
trips.tripStop.timestamp as tripStop_timestamp // Added tripStop as prefix because tripStart amp; tripStop has same columns.
from test_dictionary
lateral view outer explode (response.body.dataProviders) tbl_dataProviders as dataProviders // for dataProviders details
lateral view outer explode (dataProviders.drivers) dataProviders_drivers as drivers // for drivers details
lateral view outer explode (drivers.vehicles) drivers_vehicles as vehicles // for vehicles details
lateral view outer explode (vehicles.trips) vehicles_trips as trips // for trips details
''').show(false)
Конечный результат
---------- ----------------- -------------- --------------- ---------- -------------- --------- ------------- -------- ---------- ----------------- ------------ --------------------- ------------ ------------------------------ ------------------- ------------------ ----------------- --------------- ----------------- ------------------------ ------------------ ----------------- ---------------- -------------- ---------------- ------------------------
|httpStatus|httpStatusMessage|dataProviderId|driverFirstName|driverId |driverLastName|driverRef|totalDistance|deviceId|vehicleId |vehicleRef |averageSpeed|tripDistanceTravelled|tripDuration|tripId |tripStart_longitude|tripStart_latitude|tripStart_heading|tripStart_speed|tripStart_mileage|tripStart_timestamp |tripStop_longitude|tripStop_latitude|tripStop_heading|tripStop_speed|tripStop_mileage|tripStop_timestamp |
---------- ----------------- -------------- --------------- ---------- -------------- --------- ------------- -------- ---------- ----------------- ------------ --------------------- ------------ ------------------------------ ------------------- ------------------ ----------------- --------------- ----------------- ------------------------ ------------------ ----------------- ---------------- -------------- ---------------- ------------------------
|200 |success |14 |John |1000611055|Doedoe |b119 |238.87 |0 |1000621018|1LNHL9DKXFG607058|39.1 |60.93 |93.42 |2020-11-04-10006-1000616018-10|-81.7546463012695 |26.5263614654541 |0 |0.0 |0 |2020-11-04T15:54:11Z |-81.7561874389648 |26.5265655517578 |0 |0.0 |0 |2020-11-04T17:27:36.063Z|
|200 |success |14 |John |1000611055|Doedoe |b119 |238.87 |0 |1000621018|1LNHL9DKXFG607058|32.1 |32.11 |60.03 |2020-11-04-10006-1000616018-11|-81.7561874389648 |26.5265655517578 |0 |0.0 |0 |2020-11-04T17:46:55.063Z|-81.7723388671875 |26.2240772247314 |0 |0.0 |0 |2020-11-04T18:46:57Z |
|200 |success |14 |John |1000611055|Doedoe |b119 |238.87 |0 |1000621018|1LNHL9DKXFG607058|30.8 |25.56 |49.77 |2020-11-04-10006-1000616018-8 |-81.7721862792969 |26.2239742279053 |0 |5.6 |0 |2020-11-04T14:29:41Z |-81.7538528442383 |26.5260028839111 |0 |5.6 |0 |2020-11-04T15:19:27Z |
|200 |success |14 |John |1000611055|Doedoe |b119 |238.87 |0 |1000621026|1GNSCGKCXKR180027|30.4 |30.41 |59.95 |2020-11-04-10006-1000616026-2 |-81.754524230957 |26.5265655517578 |0 |0.0 |0 |2020-11-04T01:23:54.063Z|-81.7721328735352 |26.2240772247314 |0 |0.0 |0 |2020-11-04T02:23:51.063Z|
---------- ----------------- -------------- --------------- ---------- -------------- --------- ------------- -------- ---------- ----------------- ------------ --------------------- ------------ ------------------------------ ------------------- ------------------ ----------------- --------------- ----------------- ------------------------ ------------------ ----------------- ---------------- -------------- ---------------- ------------------------
Комментарии:
1. Я немного смущен. Я не вижу никакой разницы между тем, что у меня было, и тем, что у вас было. Как вы заставили каждый из массивов взорваться? Вы использовали outer explode так же, как и я.
2. Сравните эти две строки с приведенным выше решением
lateral view outer explode (response.body.dataProviders)a as test lateral view outer explode (response.body.dataProviders.drivers)b as tmp1
3. Это должно быть что-то вроде
lateral view outer explode (response.body.dataProviders)a as test lateral view outer explode (test.drivers)b as tmp1
4. просто обратите внимание,
dataProviders
amp;drivers
оба являются типом массива. Если вы хотите получить доступdrivers
к столбцам, вам нужно сначала разорватьdataProviders
, а затем разорватьdrivers
или разорвать дважды, если вы используете так —dataProviders.drivers
5. Ах, хорошо! Теперь это имеет смысл. Спасибо!!