#python #r #time-series #prediction #statsmodels
#питон #r #временные ряды #предсказание #статс-модели
Вопрос:
Я строю модель ARIMA как в Python (с помощью библиотеки statsmodels), так и в R (с помощью пакета forecast). Точечные прогнозы в обеих реализациях довольно близки. Тем не менее я получаю очень разные 95%-ные интервалы прогнозирования.
Мои данные и код на Python приведены в блоке 1 ниже.
Мои данные и код в R приведены в блоке 2 ниже.
Самое главное — в соответствии с двумя графиками в конце каждого блока — я получаю гораздо более узкие интервалы прогнозирования в R, чем в Python.
Допускаю ли я в одном из подходов какую-то ошибку, которая приводит к таким результатам?
#########
Блок 1 (Python):
# importing needed modules
import pandas as pd
from pandas import Timestamp
from statsmodels.tsa.statespace.sarimax import SARIMAX, SARIMAXResults
import warnings

warnings.filterwarnings("ignore")

# Reproducing disguised but representative data: 146 weekly observations,
# one per Sunday from 2018-03-25 through 2021-01-03.
# The index is built with pd.date_range instead of Timestamp(..., freq=...):
# the `freq` argument of the Timestamp constructor was removed in pandas 2.0,
# and date_range also gives the index an explicit weekly frequency.
values = [
    171954, 175983, 182176, 204762, 190414, 164509, 189951, 185431, 178109, 144890,
    179893, 165309, 161598, 190336, 161705, 185468, 169414, 174090, 182590, 170937,
    177440, 167498, 177016, 166978, 178701, 197804, 165459, 180387, 177362, 188070,
    139737, 172630, 181116, 187739, 141067, 159493, 172005, 156852, 170582, 108981,
    139398, 164715, 180208, 165578, 150432, 172456, 157376, 160747, 181247, 140209,
    180778, 174823, 150521, 158542, 193693, 177645, 192286, 181307, 186313, 178737,
    191977, 169959, 158697, 158528, 173406, 191186, 164444, 208979, 182001, 173436,
    156646, 149798, 172527, 147005, 173555, 152108, 168979, 139389, 129588, 132264,
    127931, 134254, 126794, 140317, 132397, 146625, 179995, 140047, 156938, 152835,
    126538, 65501, 92036, 139105, 109354, 135210, 111993, 131422, 145836, 155570,
    163386, 133342, 139931, 132746, 117022, 115206, 100786, 106724, 104512, 86032,
    110017, 114749, 129600, 116723, 129644, 105255, 113663, 102831, 101171, 122590,
    136748, 148711, 137374, 146122, 160689, 143843, 112188, 122655, 110471, 151632,
    148495, 152085, 151113, 139480, 152006, 132432, 139648, 149403, 158219, 131722,
    139785, 154573, 169083, 119607, 114054, 161551,
]
df = pd.Series(
    values,
    index=pd.date_range('2018-03-25', periods=len(values), freq='W-SUN'),
    name='Value',
)

# creating training and test sets: the last `test_length` weeks are held out
test_length = 39
test_start = len(df) - test_length
train = df[:test_start]
test = df[-test_length:]

# instantiate and fit model: ARIMA(2,1,2) with a seasonal AR(1) term at lag 52
model = SARIMAX(train,
                order=(2, 1, 2),
                seasonal_order=(1, 0, 0, 52),
                enforce_stationarity=False,
                enforce_invertibility=False)
# NOTE(review): `dynamic` is not a documented argument of fit(); it is
# presumably ignored by the optimizer -- confirm before relying on it.
model_fit = model.fit(disp=False, dynamic=False, method='powell')

# producing point forecasts and 95% prediction intervals for the test horizon
forecast_obj = model_fit.get_forecast(steps=test_length)
forecast = forecast_obj.predicted_mean
ci_95 = forecast_obj.conf_int(alpha=.05)

# combining the data and reformatting it for plotting
# (conf_int columns are taken by position: 0 = lower bound, 1 = upper bound)
upper_pred = ci_95.iloc[:, 1]
lower_pred = ci_95.iloc[:, 0]
forecast.index = test.index
upper_pred.index = test.index
lower_pred.index = test.index
plot_D1 = pd.concat([train, test, forecast, lower_pred, upper_pred], axis=1)
# Assign the labels in one step rather than mutating `columns.values` in
# place, which bypasses the Index object and is not reliable.
plot_D1.columns = ['Train', 'Test', 'Forecast', 'Lower', 'Upper']
plot_D1.plot()
Блок 2 (R):
library(forecast)
library(Metrics)

# Reproducing disguised but representative data: 146 weekly observations,
# one per Sunday from 2018-03-25 through 2021-01-03. The date strings are
# generated rather than listed; only the Value column is used below.
df <- data.frame(
  index = format(seq(as.Date("2018-03-25"), by = "week", length.out = 146L)),
  Value = c(
    171954L, 175983L, 182176L, 204762L, 190414L, 164509L, 189951L, 185431L,
    178109L, 144890L, 179893L, 165309L, 161598L, 190336L, 161705L, 185468L,
    169414L, 174090L, 182590L, 170937L, 177440L, 167498L, 177016L, 166978L,
    178701L, 197804L, 165459L, 180387L, 177362L, 188070L, 139737L, 172630L,
    181116L, 187739L, 141067L, 159493L, 172005L, 156852L, 170582L, 108981L,
    139398L, 164715L, 180208L, 165578L, 150432L, 172456L, 157376L, 160747L,
    181247L, 140209L, 180778L, 174823L, 150521L, 158542L, 193693L, 177645L,
    192286L, 181307L, 186313L, 178737L, 191977L, 169959L, 158697L, 158528L,
    173406L, 191186L, 164444L, 208979L, 182001L, 173436L, 156646L, 149798L,
    172527L, 147005L, 173555L, 152108L, 168979L, 139389L, 129588L, 132264L,
    127931L, 134254L, 126794L, 140317L, 132397L, 146625L, 179995L, 140047L,
    156938L, 152835L, 126538L, 65501L, 92036L, 139105L, 109354L, 135210L,
    111993L, 131422L, 145836L, 155570L, 163386L, 133342L, 139931L, 132746L,
    117022L, 115206L, 100786L, 106724L, 104512L, 86032L, 110017L, 114749L,
    129600L, 116723L, 129644L, 105255L, 113663L, 102831L, 101171L, 122590L,
    136748L, 148711L, 137374L, 146122L, 160689L, 143843L, 112188L, 122655L,
    110471L, 151632L, 148495L, 152085L, 151113L, 139480L, 152006L, 132432L,
    139648L, 149403L, 158219L, 131722L, 139785L, 154573L, 169083L, 119607L,
    114054L, 161551L
  ),
  stringsAsFactors = FALSE
)

# Setting frequency to 7 (a week), as in the original script.
# NOTE(review): the observations are weekly, so 7 is not the number of
# observations per seasonal cycle; the seasonal period of the model is given
# explicitly (52) in the Arima() call below -- confirm that 7 is intended.
df <- ts(df$Value, frequency = 7)

# train/test split: the last `test_length` weeks are held out
test_length <- 39
train <- head(df, length(df) - test_length)
test <- tail(df, test_length)

# creating an ARIMA(2,1,2) model with a seasonal AR(1) term at period 52;
# the seasonal specification is passed by name instead of by position
custom_model <- Arima(
  train,
  order = c(2, 1, 2),
  seasonal = list(order = c(1, 0, 0), period = 52),
  method = "ML",
  include.mean = FALSE
)

# making forecasts with 80% and 95% prediction intervals
preds_custom <- forecast(custom_model, h = test_length, level = c(80, 95))

# plotting the forecast with the held-out test data layered on top
autoplot(preds_custom, main = "Custom Model") +
  autolayer(test)