#dataframe #select #pyspark #null #pivot
Вопрос:
Ниже приведен мой фрейм данных
--------- ----- -------- --------- -------
| NAME|Actor| Doctor|Professor|Singer |
--------- ----- -------- --------- -------
| Samantha| null|Samantha| null| null|
|Christeen| null| null|Christeen| null|
| Meera| null| null| null| Meera|
| Julia|Julia| null| null| null|
| Priya| null| null| null| Priya|
| Ashley| null| null| Ashley| null|
| Jenny| null| Jenny| null| null|
| Maria|Maria| null| null| null|
| Jane| Jane| null| null| null|
| Ketty| null| null| Ketty| null|
--------- ----- -------- --------- -------
Я хочу выбрать все не нулевые значения у АКТЕРА,ДОКТОРА,ПРОФЕССОРА И ПЕВЦА
Ответ №1:
Это может быть достигнуто с помощью isNotNull: создаём условие condn
с желаемыми правилами и, наконец, применяем фильтр.
Вы можете дополнительно изменить condn
в зависимости от ваших требований.
Подготовка Данных
# Raw sample data: 11 rows of 5 pipe-separated fields each
# (Name, Actor, Doctor, Professor, Singer). The string "None"
# stands for a missing value and is converted to a real NULL later.
# NOTE: the Aditya row is fully populated so that the final
# isNotNull filter (shown below) returns exactly this one row.
input_str = """
Samantha|None|Samantha| None| None|
Christeen| None| None|Christeen| None|
Meera| None| None| None| Meera|
Julia|Julia| None| None| None|
Priya| None| None| None| Priya|
Ashley| None| None| Ashley| None|
Jenny| None| Jenny| None| None|
Maria|Maria| None| None| None|
Jane| Jane| None| None| None|
Ketty| None| None| Ketty| None|
Aditya|Aditya|Aditya|Aditya|Aditya|
""".split("|")
# Strip surrounding whitespace/newlines from every field; the final
# element is the empty remainder after the last "|", so drop it.
input_values = list(map(lambda x: x.strip(), input_str))[:-1]
# Group the flat list of fields into one 5-tuple per row.
# (The original post used an index loop; the HTML rendering had
# stripped the "+" signs, producing "i:i 5" and "i = 5".)
input_list = [
    tuple(input_values[i:i + 5])
    for i in range(0, len(input_values), 5)
]
input_list
[('Samantha', 'None', 'Samantha', 'None', 'None'),
('Christeen', 'None', 'None', 'Christeen', 'None'),
('Meera', 'None', 'None', 'None', 'Meera'),
('Julia', 'Julia', 'None', 'None', 'None'),
('Priya', 'None', 'None', 'None', 'Priya'),
('Ashley', 'None', 'None', 'Ashley', 'None'),
('Jenny', 'None', 'Jenny', 'None', 'None'),
('Maria', 'Maria', 'None', 'None', 'None'),
('Jane', 'Jane', 'None', 'None', 'None'),
('Ketty', 'None', 'None', 'Ketty', 'None'),
('Aditya', 'Aditya', 'Aditya', 'Aditya', 'Aditya')]
Преобразование None в Null
def blank_as_null(x):
    """Return a Column expression that maps the literal string "None"
    in column *x* to a true SQL NULL, leaving other values unchanged."""
    col = F.col(x)
    return F.when(col == "None", None).otherwise(col)
# Build the DataFrame from the prepared tuples, then replace the
# "None" placeholder strings with real NULLs in every profession column.
sparkDF = sql.createDataFrame(input_list, ['Name', 'Actor', 'Doctor', 'Professor', 'Singer'])
to_convert = set(['Actor', 'Doctor', 'Professor', 'Singer'])
for column_name in to_convert:
    sparkDF = sparkDF.withColumn(column_name, blank_as_null(column_name))
sparkDF.show()
--------- ------ -------- --------- ------
| Name| Actor| Doctor|Professor|Singer|
--------- ------ -------- --------- ------
| Samantha| null|Samantha| null| null|
|Christeen| null| null|Christeen| null|
| Meera| null| null| null| Meera|
| Julia| Julia| null| null| null|
| Priya| null| null| null| Priya|
| Ashley| null| null| Ashley| null|
| Jenny| null| Jenny| null| null|
| Maria| Maria| null| null| null|
| Jane| Jane| null| null| null|
| Ketty| null| null| Ketty| null|
| Aditya|Aditya| Aditya| Aditya|Aditya|
--------- ------ -------- --------- ------
Фильтрация Ненулевых Значений
# Keep only the rows where ALL four profession columns are populated.
# (The original post showed "amp;" — the HTML entity "&amp;" with its
# leading "&" stripped; the operator must be the bitwise "&", which
# PySpark overloads as logical AND for Column expressions.)
condn = (
    (F.col('Actor').isNotNull())
    & (F.col('Doctor').isNotNull())
    & (F.col('Professor').isNotNull())
    & (F.col('Singer').isNotNull())
)
sparkDF.filter(condn).show()
------ ------ ------ --------- ------
| Name| Actor|Doctor|Professor|Singer|
------ ------ ------ --------- ------
|Aditya|Aditya|Aditya| Aditya|Aditya|
------ ------ ------ --------- ------