#python #pandas #dataframe #machine-learning #scikit-learn
Вопрос:
У меня есть скрипт предварительной обработки, который берёт данные из набора данных diamonds и предварительно обрабатывает их. Очевидно, мне нужно точно так же предварительно обработать и метки.
Вот мой код:
# Data Preprocessing
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from icecream import ic


def diamond_preprocess(data_dir):
    """Load the diamonds CSV, split it, and preprocess features and labels.

    Args:
        data_dir: Location of the CSV file; handed directly to
            ``pd.read_csv``.

    Returns:
        A tuple ``(x_train, x_test, y_train, y_test)`` in which every element
        is the transformed data wrapped in a ``pd.DataFrame``.

    Note:
        The listing is reproduced as posted. The ``ColumnTransformer`` that is
        configured for the feature columns is fitted a second time on
        ``y_train``, which is only the single ``price`` column. That second
        ``fit`` call appears to be the one producing the ``IndexError`` quoted
        in the question, so the ``return`` statement is not reached.
    """
    raw = pd.read_csv(data_dir)
    cleaned_data = raw.drop(columns=['id', 'depth_percent'])  # Features I don't want

    x = cleaned_data.drop(columns=['price'])  # Train data
    y = cleaned_data['price']  # Label data

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=99
    )

    # Column names are taken from the training split only.
    numerical_features = (
        x_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    )
    categorical_features = (
        x_train.select_dtypes(include=['object']).columns.tolist()
    )

    numerical_steps = [
        ('imputer', SimpleImputer(strategy='median')),  # Fill in missing data with median
        ('scaler', StandardScaler()),  # Scale data
    ]
    categorical_steps = [
        # Fill in missing data with 'missing'
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        # One hot encode categorical data
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    preprocessor_pipeline = ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=numerical_steps), numerical_features),
            ('cat', Pipeline(steps=categorical_steps), categorical_features),
        ]
    )

    # Fit to the training data
    preprocessor_pipeline.fit(x_train)
    # NOTE: kept exactly as asked about -- the feature transformer is refitted
    # on the label column here, which is where the reported error comes from.
    preprocessor_pipeline.fit(y_train)

    # Apply the pipeline to the training and test data
    x_train_pipe = preprocessor_pipeline.transform(x_train)
    x_test_pipe = preprocessor_pipeline.transform(x_test)
    y_train_pipe = preprocessor_pipeline.transform(y_train)
    y_test_pipe = preprocessor_pipeline.transform(y_test)

    return (
        pd.DataFrame(data=x_train_pipe),
        pd.DataFrame(data=x_test_pipe),
        pd.DataFrame(data=y_train_pipe),
        pd.DataFrame(data=y_test_pipe),
    )
Я не очень уверен, что мой код верен и что я правильно понимаю, как работают конвейеры и предварительная обработка в sklearn. Судя по всему, интерпретатор со мной согласен, так как я получаю эту ошибку:
  File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\compose\_column_transformer.py", line 470, in fit
    self.fit_transform(X, y=y)
  File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\compose\_column_transformer.py", line 502, in fit_transform
    self._check_n_features(X, reset=True)
  File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\base.py", line 352, in _check_n_features
    n_features = X.shape[1]
IndexError: tuple index out of range
Как мне правильно обработать свои метки, как я это делал с данными обучения? Объяснение тоже было бы здорово!
Ответ №1:
Вы можете создать дополнительный конвейер для целевого столбца, если хотите применять преобразования к нему отдельно; см. пример ниже.
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Toy data set: one target column, two numerical and two categorical
# features, each with some missing entries.
data = pd.DataFrame({
    'y': [1, 2, np.nan, 4, 5],
    'x1': [6, 7, 8, np.nan, np.nan],
    'x2': [9, 10, 11, np.nan, np.nan],
    'x3': ['a', 'b', 'c', np.nan, np.nan],
    'x4': [np.nan, np.nan, 'd', 'e', 'f'],
})

# Separate the features from the target.
x = data.drop(columns=['y'])
y = data[['y']]  # double brackets: this is a data frame, not a series

# Train/test split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.5, random_state=99
)

# Sort the feature columns into numerical and categorical by dtype.
numerical_features = (
    x_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
)
categorical_features = (
    x_train.select_dtypes(include=['object']).columns.tolist()
)

# Features pipeline: one sub-pipeline per column type.
numerical_features_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_features_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

features_pipeline = ColumnTransformer(transformers=[
    ('num_features', numerical_features_transformer, numerical_features),
    ('cat_features', categorical_features_transformer, categorical_features),
])

# Target pipeline: a separate object, applied only to the target column.
target_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Each pipeline is fitted on its own part of the training data.
features_pipeline.fit(x_train)
target_pipeline.fit(y_train)

# Transform the training and test data with the matching fitted pipeline.
x_train_pipe = features_pipeline.transform(x_train)
x_test_pipe = features_pipeline.transform(x_test)

y_train_pipe = target_pipeline.transform(y_train)
y_test_pipe = target_pipeline.transform(y_test)

# Wrap the transformed outputs back into data frames.
x_train = pd.DataFrame(data=x_train_pipe)
x_test = pd.DataFrame(data=x_test_pipe)

y_train = pd.DataFrame(data=y_train_pipe)
y_test = pd.DataFrame(data=y_test_pipe)