реализация столбца признаков, приводящая к точности 0%

#python #tensorflow #keras

#python #tensorflow #keras

Вопрос:

Для развлечения в выходные дни я пытаюсь применить это руководство Keras к другой задаче. В этом руководстве показано, как использовать столбцы признаков (feature columns) и эмбеддинги, чтобы предсказать, заберут ли животное из приюта.

Я взял это руководство и пытаюсь понять, могу ли я на основе категориальных эмбеддингов предсказать время полета (просто для развлечения, поэтому не уверен, имеет ли такая задача смысл).

Я применил код к своему набору данных, и, похоже, он работает, но я получаю точность 0,00% и предупреждение о необходимости переписать эту модель с помощью функционального API.

Вот мой код для воспроизведения проблемы, я не уверен, что я делаю неправильно или пропускаю:

 import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import LabelEncoder

# Load the flights table straight from GitHub.
dataframe = pd.read_csv('https://raw.githubusercontent.com/ismayc/pnwflights14/master/data/flights.csv')
# Keep only rows that have a tail number.
# NOTE(review): missing values in any other column are not handled here --
# confirm with dataframe.isnull().sum() before training.
dataframe = dataframe[dataframe['tailnum'].notna()]
# Name of the column the model is trained to predict.
target = 'air_time'
# Has no visible effect when run as a script (the result is discarded).
dataframe.head()


# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, label_column, shuffle=True, batch_size=32):
    """Build a batched tf.data.Dataset of (features, label) pairs.

    Args:
        dataframe: pandas DataFrame holding both the features and the label.
        label_column: name of the column to split off as the label.
        shuffle: when true, shuffle with a buffer as large as the frame.
        batch_size: number of rows per emitted batch.

    Returns:
        A tf.data.Dataset whose elements are
        (dict of column name -> values, labels) batches.
    """
    # Work on a copy so that pop() below does not mutate the caller's frame.
    dataframe = dataframe.copy()
    labels = dataframe.pop(label_column)
    #labels = dataframe[label_column]

    # Every remaining column becomes one entry of the feature dict.
    ds = tf.data.Dataset.from_tensor_slices((dataframe.to_dict(orient='list'), labels))
    if shuffle:
        # The buffer covers the whole frame.
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

# Collects every feature column that the DenseFeatures input layer will consume.
feature_columns = []

# Numeric columns: passed to the model as raw values (no normalisation here).
for header in ['dep_time','dep_delay',  'arr_time', 'arr_delay', 'distance']:
  feature_columns.append(feature_column.numeric_column(header))

# Indicator columns: each categorical column is encoded against a vocabulary
# made of the distinct values found in the full dataframe.
categorical_columns = [ 'carrier', 'tailnum', 'origin', 'dest'] 
for col_name in categorical_columns:
  categorical_column = feature_column.categorical_column_with_vocabulary_list(
      col_name, dataframe[col_name].unique())
  indicator_column = feature_column.indicator_column(categorical_column)
  feature_columns.append(indicator_column)

# Embedding column: the flight number is mapped to a learned 8-dimensional
# vector (the name `breed1` appears to be left over from the original tutorial).
breed1 = feature_column.categorical_column_with_vocabulary_list(
      'flight', dataframe.flight.unique())
breed1_embedding = feature_column.embedding_column(breed1, dimension=8)
feature_columns.append(breed1_embedding)

# Input layer built from the feature columns collected above.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

batch_size = 32
# Only the training set is shuffled; validation and test keep their row order.
train_ds = df_to_dataset(train, label_column = target, batch_size=batch_size)
val_ds = df_to_dataset(val,label_column = target,  shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, label_column = target, shuffle=False, batch_size=batch_size)


# Two hidden layers of 128 units, light dropout, and one linear output unit.
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dropout(.1),
  layers.Dense(1)
])

# NOTE(review): BinaryCrossentropy and 'accuracy' are classification settings,
# while the target column ('air_time') looks like a continuous quantity --
# confirm whether a regression loss/metric was intended.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(train_ds,
          validation_data=val_ds,
          epochs=10)

# evaluate() returns the loss followed by the compiled metric.
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)
 

Результат:

 103552 train examples
25888 validation examples
32361 test examples
Epoch 1/10
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'year': <tf.Tensor 'ExpandDims_14:0' shape=(None, 1) dtype=int32>, 'month': <tf.Tensor 'ExpandDims_11:0' shape=(None, 1) dtype=int32>, 'day': <tf.Tensor 'ExpandDims_3:0' shape=(None, 1) dtype=int32>, 'dep_time': <tf.Tensor 'ExpandDims_5:0' shape=(None, 1) dtype=float32>, 'dep_delay': <tf.Tensor 'ExpandDims_4:0' shape=(None, 1) dtype=float32>, 'arr_time': <tf.Tensor 'ExpandDims_1:0' shape=(None, 1) dtype=float32>, 'arr_delay': <tf.Tensor 'ExpandDims:0' shape=(None, 1) dtype=float32>, 'carrier': <tf.Tensor 'ExpandDims_2:0' shape=(None, 1) dtype=string>, 'tailnum': <tf.Tensor 'ExpandDims_13:0' shape=(None, 1) dtype=string>, 'flight': <tf.Tensor 'ExpandDims_8:0' shape=(None, 1) dtype=int32>, 'origin': <tf.Tensor 'ExpandDims_12:0' shape=(None, 1) dtype=string>, 'dest': <tf.Tensor 'ExpandDims_6:0' shape=(None, 1) dtype=string>, 'distance': <tf.Tensor 'ExpandDims_7:0' shape=(None, 1) dtype=int32>, 'hour': <tf.Tensor 'ExpandDims_9:0' shape=(None, 1) dtype=float32>, 'minute': <tf.Tensor 'ExpandDims_10:0' shape=(None, 1) dtype=float32>}
Consider rewriting this model with the Functional API.
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'year': <tf.Tensor 'ExpandDims_14:0' shape=(None, 1) dtype=int32>, 'month': <tf.Tensor 'ExpandDims_11:0' shape=(None, 1) dtype=int32>, 'day': <tf.Tensor 'ExpandDims_3:0' shape=(None, 1) dtype=int32>, 'dep_time': <tf.Tensor 'ExpandDims_5:0' shape=(None, 1) dtype=float32>, 'dep_delay': <tf.Tensor 'ExpandDims_4:0' shape=(None, 1) dtype=float32>, 'arr_time': <tf.Tensor 'ExpandDims_1:0' shape=(None, 1) dtype=float32>, 'arr_delay': <tf.Tensor 'ExpandDims:0' shape=(None, 1) dtype=float32>, 'carrier': <tf.Tensor 'ExpandDims_2:0' shape=(None, 1) dtype=string>, 'tailnum': <tf.Tensor 'ExpandDims_13:0' shape=(None, 1) dtype=string>, 'flight': <tf.Tensor 'ExpandDims_8:0' shape=(None, 1) dtype=int32>, 'origin': <tf.Tensor 'ExpandDims_12:0' shape=(None, 1) dtype=string>, 'dest': <tf.Tensor 'ExpandDims_6:0' shape=(None, 1) dtype=string>, 'distance': <tf.Tensor 'ExpandDims_7:0' shape=(None, 1) dtype=int32>, 'hour': <tf.Tensor 'ExpandDims_9:0' shape=(None, 1) dtype=float32>, 'minute': <tf.Tensor 'ExpandDims_10:0' shape=(None, 1) dtype=float32>}
Consider rewriting this model with the Functional API.
3227/3236 [============================>.] - ETA: 0s - loss: nan - accuracy: 0.0000e 00WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'year': <tf.Tensor 'ExpandDims_14:0' shape=(None, 1) dtype=int32>, 'month': <tf.Tensor 'ExpandDims_11:0' shape=(None, 1) dtype=int32>, 'day': <tf.Tensor 'ExpandDims_3:0' shape=(None, 1) dtype=int32>, 'dep_time': <tf.Tensor 'ExpandDims_5:0' shape=(None, 1) dtype=float32>, 'dep_delay': <tf.Tensor 'ExpandDims_4:0' shape=(None, 1) dtype=float32>, 'arr_time': <tf.Tensor 'ExpandDims_1:0' shape=(None, 1) dtype=float32>, 'arr_delay': <tf.Tensor 'ExpandDims:0' shape=(None, 1) dtype=float32>, 'carrier': <tf.Tensor 'ExpandDims_2:0' shape=(None, 1) dtype=string>, 'tailnum': <tf.Tensor 'ExpandDims_13:0' shape=(None, 1) dtype=string>, 'flight': <tf.Tensor 'ExpandDims_8:0' shape=(None, 1) dtype=int32>, 'origin': <tf.Tensor 'ExpandDims_12:0' shape=(None, 1) dtype=string>, 'dest': <tf.Tensor 'ExpandDims_6:0' shape=(None, 1) dtype=string>, 'distance': <tf.Tensor 'ExpandDims_7:0' shape=(None, 1) dtype=int32>, 'hour': <tf.Tensor 'ExpandDims_9:0' shape=(None, 1) dtype=float32>, 'minute': <tf.Tensor 'ExpandDims_10:0' shape=(None, 1) dtype=float32>}
Consider rewriting this model with the Functional API.
3236/3236 [==============================] - 16s 5ms/step - loss: nan - accuracy: 0.0000e+00 - val_loss: nan - val_accuracy: 0.0000e+00
Epoch 2/10
3236/3236 [==============================] - 15s 5ms/step - loss: nan - accuracy: 0.0000e+00 - val_loss: nan - val_accuracy: 0.0000e+00
Epoch 3/10
3236/3236 [==============================] - 16s 5ms/step - loss: nan - accuracy: 0.0000e+00 - val_loss: nan - val_accuracy: 0.0000e+00
Epoch 4/10
3236/3236 [==============================] - 15s 5ms/step - loss: nan - accuracy: 0.0000e+00 - val_loss: nan - val_accuracy: 0.0000e+00
Epoch 5/10
3236/3236 [==============================] - 15s 5ms/step - loss: nan - accuracy: 0.0000e+00 - val_loss: nan - val_accuracy: 0.0000e+00
Epoch 6/10
3236/3236 [==============================] - 15s 4ms/step - loss: nan - accuracy: 0.0000e+00 - val_loss: nan - val_accuracy: 0.0000e+00
Epoch 7/10
3236/3236 [==============================] - 15s 5ms/step - loss: nan - accuracy: 0.0000e+00 - val_loss: nan - val_accuracy: 0.0000e+00
Epoch 8/10
3236/3236 [==============================] - 15s 5ms/step - loss: nan - accuracy: 0.0000e+00 - val_loss: nan - val_accuracy: 0.0000e+00
Epoch 9/10
3236/3236 [==============================] - 15s 5ms/step - loss: nan - accuracy: 0.0000e+00 - val_loss: nan - val_accuracy: 0.0000e+00
Epoch 10/10
3236/3236 [==============================] - 15s 5ms/step - loss: nan - accuracy: 0.0000e+00 - val_loss: nan - val_accuracy: 0.0000e+00
1012/1012 [==============================] - 2s 2ms/step - loss: nan - accuracy: 0.0000e+00
Accuracy 0.0
 

Я думал, что следовал руководству и хорошо его применил, но я не могу понять, где я ошибся. Любые предложения были бы замечательными. Спасибо

Комментарии:

1. Хорошо, я обнаружил, что model.predict всегда выводит массив nan .

2. Кроме того, не имеет смысла выводить точность, в то время как ваша проблема является проблемой регрессии, а не проблемой классификации. Сосредоточьтесь на значении потерь, а не на точности.

Ответ №1:

В основном у вас есть две проблемы в процессе:

  1. В вашем фрейме данных, загруженном из flights.csv, содержится 5282 значения NaN. Если на вход модели попадает NaN, то и выход модели будет NaN, поэтому функция потерь у вас тоже равна NaN. Пропуски можно заполнить, например, так: dataframe = dataframe.fillna(method='pad')
  2. Прогнозирование времени полета — это задача регрессии, а не бинарной классификации, поэтому следует изменить аргументы model.compile, например: loss=tf.keras.losses.MeanSquaredError() и metrics=['mae']

Код, который я запускаю на colab:

 import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import LabelEncoder

# Load the flights table straight from GitHub and keep only rows that have
# a tail number.
dataframe = pd.read_csv('https://raw.githubusercontent.com/ismayc/pnwflights14/master/data/flights.csv')
dataframe = dataframe[dataframe['tailnum'].notna()]
# Name of the column the model is trained to predict.
target = 'air_time'

# NaN inputs propagate through the network and turn the loss into NaN,
# so report how many cells are missing and fill them before training.
print(dataframe.isnull().sum().sum(), 'NaN in dataframe')
# Forward-fill each gap with the previous row's value.  ffill() is the
# supported spelling of fillna(method='pad'), which is deprecated in
# recent pandas releases.
# NOTE(review): this fills every column, including the target if it has
# gaps -- consider dropping rows with a missing target instead.
dataframe = dataframe.ffill()
print(dataframe.isnull().sum().sum(), 'NaN after fill')

# Hold out 20% of the rows for testing, then 20% of the remainder for
# validation.
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

# Helper that converts a pandas DataFrame into a batched tf.data.Dataset.
def df_to_dataset(dataframe, label_column, shuffle=True, batch_size=32):
    """Return a batched dataset of (feature dict, label) pairs.

    Args:
        dataframe: pandas DataFrame holding both the features and the label.
        label_column: name of the column to split off as the label.
        shuffle: when true, shuffle with a buffer as large as the frame.
        batch_size: number of rows per emitted batch.

    Returns:
        A tf.data.Dataset yielding (dict of column name -> values, labels).
    """
    # Copy first so that removing the label leaves the caller's frame intact.
    features = dataframe.copy()
    targets = features.pop(label_column)

    feature_dict = features.to_dict(orient='list')
    dataset = tf.data.Dataset.from_tensor_slices((feature_dict, targets))
    if shuffle:
        # The buffer covers the whole frame.
        dataset = dataset.shuffle(buffer_size=len(features))
    return dataset.batch(batch_size)

# Numeric inputs are handed to the model as raw values.
numeric_headers = ['dep_time', 'dep_delay', 'arr_time', 'arr_delay', 'distance']
feature_columns = [
    feature_column.numeric_column(name) for name in numeric_headers
]

# Categorical inputs become indicator columns; each vocabulary is the set of
# distinct values found in the full dataframe.
categorical_columns = ['carrier', 'tailnum', 'origin', 'dest']
feature_columns.extend(
    feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            name, dataframe[name].unique()))
    for name in categorical_columns
)

# The flight number is represented by a learned 8-dimensional embedding.
flight_vocabulary = dataframe.flight.unique()
flight_categorical = feature_column.categorical_column_with_vocabulary_list(
    'flight', flight_vocabulary)
feature_columns.append(
    feature_column.embedding_column(flight_categorical, dimension=8))

# Input layer built from the feature columns collected above.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

batch_size = 32
# Only the training set is shuffled; validation and test keep their row order.
train_ds = df_to_dataset(train, label_column=target, batch_size=batch_size)
val_ds = df_to_dataset(val, label_column=target, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, label_column=target, shuffle=False, batch_size=batch_size)

# Regression network: two hidden layers of 128 units, light dropout, and a
# single output unit.  The ReLU on the output keeps predictions non-negative.
model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dropout(.1),
    layers.Dense(1, activation='relu'),
])

# The target is a continuous value, so train on mean squared error and
# report mean absolute error instead of accuracy.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss=tf.keras.losses.MeanSquaredError(),
              metrics=['mae'])

model.fit(train_ds,
          validation_data=val_ds,
          epochs=3)

# evaluate() returns the loss followed by the compiled metric, which is the
# MAE here -- name it accordingly rather than "accuracy".
loss, mae = model.evaluate(test_ds)
print("MeanAbsoluteError", mae)
 

Что дает мне результаты:

 5282 NaN in dataframe
0 NaN after fill
103552 train examples
25888 validation examples
32361 test examples
Epoch 1/3
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'year': <tf.Tensor 'ExpandDims_14:0' shape=(None, 1) dtype=int32>, 'month': <tf.Tensor 'ExpandDims_11:0' shape=(None, 1) dtype=int32>, 'day': <tf.Tensor 'ExpandDims_3:0' shape=(None, 1) dtype=int32>, 'dep_time': <tf.Tensor 'ExpandDims_5:0' shape=(None, 1) dtype=float32>, 'dep_delay': <tf.Tensor 'ExpandDims_4:0' shape=(None, 1) dtype=float32>, 'arr_time': <tf.Tensor 'ExpandDims_1:0' shape=(None, 1) dtype=float32>, 'arr_delay': <tf.Tensor 'ExpandDims:0' shape=(None, 1) dtype=float32>, 'carrier': <tf.Tensor 'ExpandDims_2:0' shape=(None, 1) dtype=string>, 'tailnum': <tf.Tensor 'ExpandDims_13:0' shape=(None, 1) dtype=string>, 'flight': <tf.Tensor 'ExpandDims_8:0' shape=(None, 1) dtype=int32>, 'origin': <tf.Tensor 'ExpandDims_12:0' shape=(None, 1) dtype=string>, 'dest': <tf.Tensor 'ExpandDims_6:0' shape=(None, 1) dtype=string>, 'distance': <tf.Tensor 'ExpandDims_7:0' shape=(None, 1) dtype=int32>, 'hour': <tf.Tensor 'ExpandDims_9:0' shape=(None, 1) dtype=float32>, 'minute': <tf.Tensor 'ExpandDims_10:0' shape=(None, 1) dtype=float32>}
Consider rewriting this model with the Functional API.
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'year': <tf.Tensor 'ExpandDims_14:0' shape=(None, 1) dtype=int32>, 'month': <tf.Tensor 'ExpandDims_11:0' shape=(None, 1) dtype=int32>, 'day': <tf.Tensor 'ExpandDims_3:0' shape=(None, 1) dtype=int32>, 'dep_time': <tf.Tensor 'ExpandDims_5:0' shape=(None, 1) dtype=float32>, 'dep_delay': <tf.Tensor 'ExpandDims_4:0' shape=(None, 1) dtype=float32>, 'arr_time': <tf.Tensor 'ExpandDims_1:0' shape=(None, 1) dtype=float32>, 'arr_delay': <tf.Tensor 'ExpandDims:0' shape=(None, 1) dtype=float32>, 'carrier': <tf.Tensor 'ExpandDims_2:0' shape=(None, 1) dtype=string>, 'tailnum': <tf.Tensor 'ExpandDims_13:0' shape=(None, 1) dtype=string>, 'flight': <tf.Tensor 'ExpandDims_8:0' shape=(None, 1) dtype=int32>, 'origin': <tf.Tensor 'ExpandDims_12:0' shape=(None, 1) dtype=string>, 'dest': <tf.Tensor 'ExpandDims_6:0' shape=(None, 1) dtype=string>, 'distance': <tf.Tensor 'ExpandDims_7:0' shape=(None, 1) dtype=int32>, 'hour': <tf.Tensor 'ExpandDims_9:0' shape=(None, 1) dtype=float32>, 'minute': <tf.Tensor 'ExpandDims_10:0' shape=(None, 1) dtype=float32>}
Consider rewriting this model with the Functional API.
3232/3236 [============================>.] - ETA: 0s - loss: 497.8120 - mae: 13.8204WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'year': <tf.Tensor 'ExpandDims_14:0' shape=(None, 1) dtype=int32>, 'month': <tf.Tensor 'ExpandDims_11:0' shape=(None, 1) dtype=int32>, 'day': <tf.Tensor 'ExpandDims_3:0' shape=(None, 1) dtype=int32>, 'dep_time': <tf.Tensor 'ExpandDims_5:0' shape=(None, 1) dtype=float32>, 'dep_delay': <tf.Tensor 'ExpandDims_4:0' shape=(None, 1) dtype=float32>, 'arr_time': <tf.Tensor 'ExpandDims_1:0' shape=(None, 1) dtype=float32>, 'arr_delay': <tf.Tensor 'ExpandDims:0' shape=(None, 1) dtype=float32>, 'carrier': <tf.Tensor 'ExpandDims_2:0' shape=(None, 1) dtype=string>, 'tailnum': <tf.Tensor 'ExpandDims_13:0' shape=(None, 1) dtype=string>, 'flight': <tf.Tensor 'ExpandDims_8:0' shape=(None, 1) dtype=int32>, 'origin': <tf.Tensor 'ExpandDims_12:0' shape=(None, 1) dtype=string>, 'dest': <tf.Tensor 'ExpandDims_6:0' shape=(None, 1) dtype=string>, 'distance': <tf.Tensor 'ExpandDims_7:0' shape=(None, 1) dtype=int32>, 'hour': <tf.Tensor 'ExpandDims_9:0' shape=(None, 1) dtype=float32>, 'minute': <tf.Tensor 'ExpandDims_10:0' shape=(None, 1) dtype=float32>}
Consider rewriting this model with the Functional API.
3236/3236 [==============================] - 22s 6ms/step - loss: 497.4619 - mae: 13.8162 - val_loss: 99.0488 - val_mae: 6.2621
Epoch 2/3
3236/3236 [==============================] - 20s 6ms/step - loss: 197.7995 - mae: 9.6854 - val_loss: 80.7915 - val_mae: 5.3355
Epoch 3/3
3236/3236 [==============================] - 21s 6ms/step - loss: 179.8991 - mae: 9.1736 - val_loss: 86.6206 - val_mae: 5.6779
1012/1012 [==============================] - 2s 2ms/step - loss: 98.2659 - mae: 5.6766
MeanAbsoluteError 5.676607608795166