#python #scikit-learn #statsmodels #linearmodels #olsmultiplelinearregression
Question:
I am trying to build a linear model using both sklearn's LinearRegression and statsmodels.api.
The approach is to drop variables whose p-values and VIFs are outside the acceptable range (p-value: 0.05, VIF: < 5).
bike_train columns: August, December, February, January, July, June, March, May, November, October, September, Monday, Saturday, Sunday, Thursday, Tuesday, Wednesday, Light Snow & Rain, Mist & Cloudy, Spring, Summer, Winter, temp, humidity, windspeed, bike_count
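A minimal sketch of that elimination rule, assuming a fitted statsmodels OLS result and a VIF table like the ones built further down (the helper name, signature, and thresholds are illustrative, not part of the original code):
# Hypothetical helper: flag features that violate the stated criteria
# (p-value above 0.05, or VIF of 5 and above).
def candidates_to_drop(ols_result, vif_table, p_threshold=0.05, vif_threshold=5.0):
    # p-values come from the fitted statsmodels OLS results; skip the constant
    high_p = ols_result.pvalues.drop('const', errors='ignore')
    high_p = high_p[high_p > p_threshold].index.tolist()
    # vif_table is assumed to have 'Features' and 'VIF' columns, as built below
    high_vif = vif_table.loc[vif_table['VIF'] >= vif_threshold, 'Features'].tolist()
    # Union of both criteria, preserving first appearance
    return list(dict.fromkeys(high_p + high_vif))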
# Imports used in the snippets below
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

y_train = bike_train.pop('bike_count')
X_train = bike_train
# Running RFE to select 15 variables
lm = LinearRegression()
lm.fit(X_train, y_train)
rfe = RFE(lm, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)
# List of selected variables
list(zip(X_train.columns, rfe.support_, rfe.ranking_))
# Variables which have RFE support set to True
col = X_train.columns[rfe.support_]
col
Output:
Index(['December', 'January', 'July', 'June', 'November', 'October',
'September', 'Sunday', 'Light Snow & Rain', 'Mist & Cloudy', 'Summer',
'Winter', 'temp', 'humidity', 'windspeed'],
dtype='object')
MODEL 1
# Creating X_train dataframe with RFE selected variables
X_train_rfe = X_train[col]
#Adding a constant
X_train_rfe = sm.add_constant(X_train_rfe)
# Running the linear model
lm = sm.OLS(y_train,X_train_rfe).fit()
#Dropping the constant
X_train_rfe = X_train_rfe.drop(['const'], axis=1)
#Summary of the linear model
print(lm.summary())
# Calculate the VIFs for the new model
vif = pd.DataFrame()
X = X_train_rfe
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
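Since the same VIF block is repeated for every model below, it could be wrapped in a small reusable helper; this is only a sketch that repackages the code above, not something from the original post:
def compute_vif(X):
    # Same computation as above: one VIF per column of the design matrix
    vif = pd.DataFrame()
    vif['Features'] = X.columns
    vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif['VIF'] = round(vif['VIF'], 2)
    return vif.sort_values(by='VIF', ascending=False)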
MODEL 2
#Dropping January
X_train_new = X_train_rfe.drop(["January"], axis = 1)
#Rebuilding the model without "January"
X_train_lm = sm.add_constant(X_train_new)
lm_new = sm.OLS(y_train,X_train_lm).fit()
X_train_lm = X_train_lm.drop(['const'], axis=1)
print(lm_new.summary())
#checking VIF for new model without January
vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
MODEL 3
#Dropping Humidity
X_train_new_1 = X_train_lm.drop(["humidity"], axis = 1)
#Rebuilding the model without "Humidity"
X_train_lm_1 = sm.add_constant(X_train_new_1)
lm_1 = sm.OLS(y_train,X_train_lm_1).fit()
X_train_lm_1 = X_train_lm_1.drop(['const'], axis=1)
print(lm_1.summary())
#checking VIF for new model without Humidity
vif = pd.DataFrame()
X = X_train_new_1
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
MODEL 4
#Dropping Winter
X_train_new_2 = X_train_lm_1.drop(["Winter"], axis = 1)
#Rebuilding the model without "Winter"
X_train_lm_2 = sm.add_constant(X_train_new_2)
lm_2 = sm.OLS(y_train,X_train_lm_2).fit()
X_train_lm_2 = X_train_lm_2.drop(['const'], axis=1)
print(lm_2.summary())
#checking VIF for new model without Winter
vif = pd.DataFrame()
X = X_train_new_2
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
MODEL 5
#Dropping June
X_train_new_3 = X_train_lm_2.drop(["June"], axis = 1)
#Rebuilding the model without "June"
X_train_lm_3 = sm.add_constant(X_train_new_3)
lm_3 = sm.OLS(y_train,X_train_lm_3).fit()
X_train_lm_3 = X_train_lm_3.drop(['const'], axis=1)
print(lm_3.summary())
#checking VIF for new model without June
vif = pd.DataFrame()
X = X_train_new_3
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
MODEL 6
#Dropping July
X_train_new_4 = X_train_lm_3.drop(["July"], axis = 1)
#Rebuilding the model without "July"
X_train_lm_4 = sm.add_constant(X_train_new_4)
lm_4 = sm.OLS(y_train, X_train_lm_4).fit()
X_train_lm_4 = X_train_lm_4.drop(['const'], axis=1)
print(lm_4.summary())
#checking VIF for new model without July
vif = pd.DataFrame()
X = X_train_new_4
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
y_train_pred = lm_4.predict(X_train_lm_4)
Error:
ValueError Traceback (most recent call last)
<ipython-input-38-f48f554d210b> in <module>
----> 1 y_train_pred = lm_4.predict(X_train_lm_4)
C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\base\model.py in predict(self, exog, transform, *args, **kwargs)
1097 exog = np.atleast_2d(exog) # needed in count model shape[1]
1098
-> 1099 predict_results = self.model.predict(self.params, exog, *args,
1100 **kwargs)
1101
C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\regression\linear_model.py in predict(self, params, exog)
378 exog = self.exog
379
--> 380 return np.dot(exog, params)
381
382 def get_distribution(self, params, scale, exog=None, dist_class=None):
<__array_function__ internals> in dot(*args, **kwargs)
ValueError: shapes (510,10) and (11,) not aligned: 10 (dim 1) != 11 (dim 0)
All numeric values were scaled before building the model, as follows:
scaler = MinMaxScaler()
num_vars=['temp','humidity','windspeed','bike_count']
bike_train[num_vars] = scaler.fit_transform(bike_train[num_vars])
bike_train.head()
Please tell me where I went wrong. Thanks in advance!!
Comments:
1. You dropped the constant
X_train_lm_4 = X_train_lm_4.drop(['const'], axis=1)
which is needed for prediction, i.e. the shape mismatch means you are missing a column.
2. @Josef Realized this after posting the question and fixed it quickly!!
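Based on that comment, a minimal sketch of the fix is to keep the constant in the design matrix passed to predict(), so the exog has the same 11 columns (const + 10 features) the model was fitted with:
# Keep 'const' in the exog used for prediction
X_train_lm_4 = sm.add_constant(X_train_new_4)
lm_4 = sm.OLS(y_train, X_train_lm_4).fit()
y_train_pred = lm_4.predict(X_train_lm_4)
For in-sample predictions, lm_4.fittedvalues gives the same result without passing exog explicitly.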