#apache-spark #pyspark
#apache-spark #pyspark
Вопрос:
-- Selects the distinct rows of TEST, exposing each source column under a
-- camelCase output name.
-- NOTE(review): PROV_SPCLTY_CERTFN_STTS_CD is projected twice (as both
-- spcltyBoardCertificationCode and txnmyBoardCertificationCode) -- confirm
-- that the taxonomy alias is really meant to read the specialty column.
SELECT DISTINCT
    A.POA_KEY                    AS addressIdentifier,
    A.PROV_SPCLTY_CERTFN_STTS_CD AS spcltyBoardCertificationCode,
    A.PROV_SPCLTY_CERTFN_STTS_CD AS txnmyBoardCertificationCode,
    A.SPCLTY_CD_VAL              AS specialtyCode,
    A.SPCLTY_CD_VAL_NM           AS specialtyCodeName,
    A.SPCLTY_CD_VAL_DESC         AS specialtyCodeDesc,
    A.SPCLTY_CTGRY_CD_VAL        AS specialtyCategoryCode,
    A.SPCLTY_CTGRY_CD_VAL_NM     AS specialtyCategoryName,
    A.SPCLTY_CTGRY_CD_VAL_DESC   AS specialtyCategoryDesc,
    A.TXNMY_CD_VAL               AS taxonomyCode,
    A.TXNMY_CD_VAL_NM            AS taxonomyCodeName,
    A.TXNMY_CD_VAL_DESC          AS taxonomyCodeDesc
FROM TEST A
Я создаю df из приведённого выше запроса, а затем строю вложенный df с помощью agg:
# Per-row struct holding the phone-contact columns (field names are the
# original column names; nothing is renamed here).
contact_struct = f.struct('contactListCode', 'contactListDesc', 'contactListNm', 'phoneNumber')

# Per-row struct holding the URL columns.
web_struct = f.struct('displayUrl', 'urlName')

# Group by addressIdentifier and collect each group's structs into two list
# columns, 'contactLis' and 'webContactList'.
contact_df_gp = exprt_df.groupby('addressIdentifier').agg(
    f.collect_list(contact_struct).alias('contactLis'),
    f.collect_list(web_struct).alias('webContactList'),
)
Мне нужно переименовать поля: contactListCode -> Code, contactListDesc -> Desc и contactListNm -> Name.
Ожидаемый вывод.
{"addressIdentifier":1000105107,"contact":[{"Code":"B","Desc":"BUSINESS","Name":"BUSINESS","phoneNumber":"8037735227"},{"Code":"B","Desc":"BUSINESS","Name":"BUSINESS","phoneNumber":"8037735227"}],"contactweb":[{"displayUrl":"FALSE"},{"displayUrl":"FALSE"}]}
{"addressIdentifier":1000000001,"contact":[{"Code":"B","Desc":"BUSINESS","Name":"BUSINESS","phoneNumber":"7045403667"},{"Code":"B","Desc":"BUSINESS","Name":"BUSINESS","phoneNumber":"7045403667"},{"Code":"B","Desc":"BUSINESS","Name":"BUSINESS","phoneNumber":"7045403667"},{"Code":"B","Desc":"BUSINESS","Name":"BUSINESS","phoneNumber":"7045403667"}],"contactweb":[{"displayUrl":"FALSE"},{"displayUrl":"FALSE"},{"displayUrl":"FALSE"},{"displayUrl":"FALSE"}]}
Ответ №1:
# (source column, field name inside the struct) pairs for the fields that
# must be renamed; phoneNumber keeps its own name and is added separately.
contact_field_renames = [
    ('contactListCode', 'Code'),
    ('contactListDesc', 'Desc'),
    ('contactListNm', 'Name'),
]
renamed_contact_fields = [
    f.col(source).alias(target) for source, target in contact_field_renames
]

# Aliasing each column before it enters f.struct sets the struct's field
# names; the structs are then collected per addressIdentifier exactly as in
# the question, under the same output column names.
contact_df_gp = exprt_df.groupby('addressIdentifier').agg(
    f.collect_list(
        f.struct(*renamed_contact_fields, f.col('phoneNumber'))
    ).alias('contactLis'),
    f.collect_list(
        f.struct('displayUrl', 'urlName')
    ).alias('webContactList'),
)