#python #pandas
#python #pandas
Вопрос:
У меня есть большой фрейм данных с датой:
df = pd.read_json('{"dt":{"0":"2016-05-12","1":"2016-06-01","2":"2016-05-02","3":"2016-05-11","4":"2016-05-30","5":"2016-05-19","6":"2016-05-25","7":"2016-05-09","8":"2016-05-03","9":"2016-05-18"}}')
df
------------
| dt |
------------
| 2016-05-12 |
| 2016-06-01 |
| 2016-05-02 |
| 2016-05-11 |
| 2016-05-30 |
| 2016-05-19 |
| 2016-05-25 |
| 2016-05-09 |
| 2016-05-03 |
| 2016-05-18 |
------------
У меня также есть справочная таблица, которая содержит конкретные праздники. На самом деле эти праздники длятся несколько лет, и поэтому имя не уникально, но комбинация даты и имени является.
dfHolidays = pd.read_json('{"event_dt":{"0":"2015-04-05","1":"2016-03-27","2":"2015-01-01","3":"2015-02-17","4":"2015-02-18","5":"2015-04-03","6":"2015-04-05","7":"2015-04-21","8":"2015-05-01","9":"2015-06-04","10":"2015-09-07","11":"2015-10-12","12":"2015-11-02","13":"2015-11-15","14":"2015-12-25","15":"2016-01-01","16":"2016-02-09","17":"2016-02-10","18":"2016-03-25","19":"2016-03-27","20":"2016-04-21","21":"2016-05-01","22":"2016-05-26","23":"2016-09-07","24":"2016-10-12","25":"2016-11-02","26":"2016-11-15","27":"2016-12-25"},"event_name":{"0":"Easter","1":"Easter","2":"Ano novo","3":"Carnaval","4":"Quarta-feira de cinzas (In\u00edcio da Quaresma)","5":"Sexta-feira Santa","6":"P\u00e1scoa","7":"Tiradentes","8":"Dia Mundial do Trabalho","9":"Corpus Christi","10":"Independ\u00eancia do Brasil","11":"Nossa Senhora Aparecida","12":"Finados","13":"Proclama\u00e7\u00e3o da Rep\u00fablica","14":"Natal","15":"Ano novo","16":"Carnaval","17":"Quarta-feira de cinzas (In\u00edcio da Quaresma)","18":"Sexta-feira Santa","19":"P\u00e1scoa","20":"Tiradentes","21":"Dia Mundial do Trabalho","22":"Corpus Christi","23":"Independ\u00eancia do Brasil","24":"Nossa Senhora Aparecida","25":"Finados","26":"Proclama\u00e7\u00e3o da Rep\u00fablica","27":"Natal"}}')
dfHolidays
------------ --------------
| holiday_dt | holiday_name |
------------ --------------
| 2015-04-05 | Easter |
| 2016-03-27 | Easter |
| 2015-01-01 | Ano novo |
| 2015-02-17 | Carnaval |
------------ --------------
Я хотел бы добавить функции в основной фрейм данных df
- days_until_next_holiday
- days_since_last_holiday
Кроме того, не только рядом с любым праздником, но и с каждым конкретным из перечисленных
- days_until_next_Ano novo, days_since_last_Ano nova
- days_until_next_Carnaval, days_since_last_Carnaval
Ответ №1:
Вы можете использовать pd.merge_asof
для получения ближайшей даты в будущем или прошлом для создания всех этих столбцов. По сути, вам нужно изменить направление и / или susbet dfHolidays
, которые вы используете при слиянии:
Подготовка данных
df = df.sort_values('dt')
dfHolidays = dfHolidays.sort_values('event_dt')
df['dt'] = pd.to_datetime(df['dt'])
dfHolidays['event_dt'] = pd.to_datetime(dfHolidays['event_dt'])
merge_asof
def calc_days(df, dfHolidays, direction):
""" Returns Series of integer # of days between events. Absolute value """
s = pd.merge_asof(df, dfHolidays, left_on='dt', right_on='event_dt',
direction=direction)
s = (s['event_dt'] - s['dt']).dt.days.abs()
return s
df['until_next'] = calc_days(df, dfHolidays, 'forward')
df['since_prev'] = calc_days(df, dfHolidays, 'backward')
df['until_next_Ano'] = calc_days(df, dfHolidays[dfHolidays['event_name'].eq('Ano novo')], 'forward')
df['since_prev_Ano'] = calc_days(df, dfHolidays[dfHolidays['event_name'].eq('Ano novo')], 'backward')
#...
print(df)
dt until_next since_prev until_next_Ano since_prev_Ano
2 2016-05-02 17 8 NaN 129
8 2016-05-03 100 4 NaN 150
7 2016-05-09 1 24 NaN 145
3 2016-05-11 15 10 NaN 131
0 2016-05-12 24 1 NaN 122
9 2016-05-18 98 6 NaN 152
5 2016-05-19 8 17 NaN 138
6 2016-05-25 7 18 NaN 139
4 2016-05-30 14 11 NaN 132
1 2016-06-01 23 2 NaN 123
Поскольку у вашего предоставленного dfHoliday не было будущих дат Ano novo по сравнению с датами в df
, будущее устанавливается в NaN, поскольку ничего не сливается:
dfHolidays[dfHolidays['event_name'].eq('Ano novo')]
# event_dt event_name
#2 2015-01-01 Ano novo
#15 2016-01-01 Ano novo
Ответ №2:
Убедитесь, что вы используете pandas версии 1.1.0 и попробуйте это
dfHolidays = pd.read_json('{"event_dt":{"0":"2015-04-05","1":"2016-03-27","2":"2015-01-01","3":"2015-02-17","4":"2015-02-18","5":"2015-04-03","6":"2015-04-05","7":"2015-04-21","8":"2015-05-01","9":"2015-06-04","10":"2015-09-07","11":"2015-10-12","12":"2015-11-02","13":"2015-11-15","14":"2015-12-25","15":"2016-01-01","16":"2016-02-09","17":"2016-02-10","18":"2016-03-25","19":"2016-03-27","20":"2016-04-21","21":"2016-05-01","22":"2016-05-26","23":"2016-09-07","24":"2016-10-12","25":"2016-11-02","26":"2016-11-15","27":"2016-12-25"},"event_name":{"0":"Easter","1":"Easter","2":"Ano novo","3":"Carnaval","4":"Quarta-feira de cinzas (In\u00edcio da Quaresma)","5":"Sexta-feira Santa","6":"P\u00e1scoa","7":"Tiradentes","8":"Dia Mundial do Trabalho","9":"Corpus Christi","10":"Independ\u00eancia do Brasil","11":"Nossa Senhora Aparecida","12":"Finados","13":"Proclama\u00e7\u00e3o da Rep\u00fablica","14":"Natal","15":"Ano novo","16":"Carnaval","17":"Quarta-feira de cinzas (In\u00edcio da Quaresma)","18":"Sexta-feira Santa","19":"P\u00e1scoa","20":"Tiradentes","21":"Dia Mundial do Trabalho","22":"Corpus Christi","23":"Independ\u00eancia do Brasil","24":"Nossa Senhora Aparecida","25":"Finados","26":"Proclama\u00e7\u00e3o da Rep\u00fablica","27":"Natal"}}')
dfHolidays.event_dt = dfHolidays.event_dt.apply(lambda d: dt.datetime.strptime(d,'%Y-%m-%d'))
df = pd.read_json('{"dt":{"0":"2016-05-12","1":"2016-06-01","2":"2016-05-02","3":"2016-05-11","4":"2016-05-30","5":"2016-05-19","6":"2016-05-25","7":"2016-05-09","8":"2016-05-03","9":"2016-05-18"}}')
df.dt = df.dt.apply(lambda d: dt.datetime.strptime(d,'%Y-%m-%d'))
df[['days_until_next_holiday', 'days_since_last_holiday']] = None
df['days_until_next_' dfHolidays.event_name.values]=None
df['days_since_last_' dfHolidays.event_name.values]=None
for i, r in df.iterrows():
df.loc[i, 'days_until_next_' dfHolidays.event_name] = (dfHolidays.event_dt - r['dt']).apply(lambda x: x.days if x.days>0 else None).to_list()
df.loc[i, 'days_since_last_' dfHolidays.event_name] = (r['dt'] - dfHolidays.event_dt).apply(lambda x: x.days if x.days>0 else None).to_list()
df.loc[i, 'days_until_next_holiday'] = df.loc[i, 'days_until_next_' dfHolidays.event_name].min()
df.loc[i, 'days_since_last_holiday'] = df.loc[i, 'days_since_last_' dfHolidays.event_name].min()
result=pd.DataFrame()
for c in df.columns:
if not df[c].isna().all():
result[c] = df[c]
df = result
print(df)