#python #tensorflow #machine-learning
Question:
- All of the features are strings, except for "traffic_flow", "uniques" and "total".
- I am trying to train a model to predict the "actual" feature.
- I am using SparseCategoricalCrossentropy as the loss function, because the number of possible "actual" values is large and will keep growing as the data grows (see the sketch after this list).
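For reference, here is a minimal toy example (illustrative only, not part of my pipeline) of what SparseCategoricalCrossentropy expects: integer class indices as labels and per-class probabilities as predictions.

import tensorflow as tf

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
y_true = [0, 2]                       # integer class ids, not strings or one-hot vectors
y_pred = [[0.9, 0.05, 0.05],          # softmax probabilities over 3 classes
          [0.1, 0.2, 0.7]]
print(loss_fn(y_true, y_pred).numpy())  # average loss over the two samples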
I assume this is a problem with my input data. More specifically, I think it might be an issue with the type my labels have in the dataset, so I am also including the dataset variable.
Structure of the dataset variable:
string), (region, tf.string), (city, tf.string)]), tf.string)>
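To check the dtypes the pipeline actually yields, in particular whether the label tensor is tf.string, a small inspection snippet (assuming the `dataset` variable built in the full code below) looks like this:

# Peek at one batch and print the dtype of the label and of each feature column.
for features, label in dataset.take(1):
    print("label:", label.dtype, label[:3])
    for name, column in features.items():
        print(name, column.dtype)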
Sample CSV:
"lc_BXXXXXXXXXXXX","lc_CXXXXXXXXXXXX","lc_AXXXXXXXXXXXX","0.25","28","49","macOS","Chrome 90.0","en","example.com","https://example.com/en/","United States","North Carolina","Charlotte","2021-04-27 21:07:52"
"lc_NXXXXXXXXXXXX","lc_CXXXXXXXXXXXX","lc_CXXXXXXXXXXXX","0.10","38","66","Win10","Chrome 90.0","en","example.com","https://example.com/en/","United States","North Carolina","Raleigh","2021-04-28 15:41:19"
"lc_JXXXXXXXXXXXX","lc_LXXXXXXXXXXXX","lc_LXXXXXXXXXXXX","0.23","28","49","macOS","Chrome 90.0","en","example.com","https://example.com/en/","United States","North Carolina","Raleigh","2021-04-28 15:41:39"
Full code:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.layers.experimental import preprocessing
all_columns = ["current", "predicted", "actual", "traffic_flow", "uniques", "total", "os", "browser", "language", "referrer_domain", "referrer", "country", "region", "city", "created"]
feature_columns = ["current", "predicted", "traffic_flow", "uniques", "total", "os", "browser", "language", "referrer_domain", "referrer", "country", "region", "city"]
dataset = tf.data.experimental.make_csv_dataset(
    "stackoverflow.csv",
    header=False,
    batch_size=32,
    column_names=all_columns,
    select_columns=feature_columns + ["actual"],
    label_name="actual",
    num_epochs=1,
    ignore_errors=False,
)
# only using this dataframe to get the number of unique "actual" values
dataframe = pd.read_csv("stackoverflow.csv", names=all_columns)
#get unique actuals (the target variable)
labels = dataframe.copy().pop("actual")
num_labels = pd.unique(labels).size
# apply featurewise normalization to numerical features
def encode_numerical_feature(feature, name, dataset):
    # Create a Normalization layer for our feature
    normalizer = preprocessing.Normalization()
    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    # take a column of shape (N,) and reshape it to (N, 1)
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))
    # Learn the statistics of the data
    normalizer.adapt(feature_ds)
    # Normalize the input feature
    encoded_feature = normalizer(feature)
    return encoded_feature
def encode_categorical_feature(feature, name, dataset, is_string):
    # Create a lookup layer which will turn strings into integer indices
    lookup = preprocessing.StringLookup(output_mode="int")
    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))
    # Learn the set of possible string values and assign them a fixed integer index
    lookup.adapt(feature_ds)
    # Turn the string input into integer indices
    encoded_feature = lookup(feature)
    # Encode the integer indices into a float32 vector to match the numerical features
    encoder = preprocessing.CategoryEncoding(num_tokens=lookup.vocabulary_size())
    encoded_feature = encoder(encoded_feature)
    return encoded_feature
train_ds = dataset
# Categorical features encoded as string
current = tf.keras.Input(shape=(1,), name="current", dtype="string")
predicted = tf.keras.Input(shape=(1,), name="predicted", dtype="string")
os = tf.keras.Input(shape=(1,), name="os", dtype="string")
browser = tf.keras.Input(shape=(1,), name="browser", dtype="string")
language = tf.keras.Input(shape=(1,), name="language", dtype="string")
referrer_domain = tf.keras.Input(shape=(1,), name="referrer_domain", dtype="string")
referrer = tf.keras.Input(shape=(1,), name="referrer", dtype="string")
country = tf.keras.Input(shape=(1,), name="country", dtype="string")
region = tf.keras.Input(shape=(1,), name="region", dtype="string")
city = tf.keras.Input(shape=(1,), name="city", dtype="string")
#numerical features
traffic_flow = tf.keras.Input(shape=(1,), name="traffic_flow")
uniques = tf.keras.Input(shape=(1,), name="uniques")
total = tf.keras.Input(shape=(1,), name="total")
all_inputs = [
    current,
    predicted,
    os,
    browser,
    language,
    referrer_domain,
    referrer,
    country,
    region,
    city,
    traffic_flow,
    uniques,
    total,
]
# String categorical features
current_encoded = encode_categorical_feature(current, "current", train_ds, True)
predicted_encoded = encode_categorical_feature(predicted, "predicted", train_ds, True)
os_encoded = encode_categorical_feature(os, "os", train_ds, True)
browser_encoded = encode_categorical_feature(browser, "browser", train_ds, True)
language_encoded = encode_categorical_feature(language, "language", train_ds, True)
referrer_domain_encoded = encode_categorical_feature(referrer_domain, "referrer_domain", train_ds, True)
referrer_encoded = encode_categorical_feature(referrer, "referrer", train_ds, True)
country_encoded = encode_categorical_feature(country, "country", train_ds, True)
region_encoded = encode_categorical_feature(region, "region", train_ds, True)
city_encoded = encode_categorical_feature(city, "city", train_ds, True)
# Numerical features
traffic_flow_encoded = encode_numerical_feature(traffic_flow, "traffic_flow", train_ds)
uniques_encoded = encode_numerical_feature(uniques, "uniques", train_ds)
total_encoded = encode_numerical_feature(total, "total", train_ds)
all_features = layers.concatenate(
    [
        current_encoded,
        predicted_encoded,
        os_encoded,
        browser_encoded,
        language_encoded,
        referrer_domain_encoded,
        referrer_encoded,
        country_encoded,
        region_encoded,
        city_encoded,
        traffic_flow_encoded,
        uniques_encoded,
        total_encoded,
    ]
)
x = layers.Dense(32, activation="relu")(all_features)
output = layers.Dense(num_labels, activation="softmax")(x)
model = tf.keras.Model(all_inputs, output)
model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd", metrics=["categorical_accuracy"])
model.fit(dataset, epochs=10)
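For illustration, here is a sketch of how the string "actual" labels could be mapped to integer indices before training. This is an assumption on my part about what sparse_categorical_crossentropy expects (integer class ids rather than raw strings), not something I have verified:

# Hypothetical sketch: map the string "actual" labels to integer indices before fitting.
label_lookup = preprocessing.StringLookup()
label_lookup.adapt(dataset.map(lambda features, label: label))
train_ds_int = dataset.map(lambda features, label: (features, label_lookup(label)))
# Note: the output Dense layer would then need label_lookup.vocabulary_size() units.
model.fit(train_ds_int, epochs=10)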
Comments:
1. Welcome to Stack Overflow! I think you need to state your question more precisely and add more detail to it; please include the exact error message in the question.