#python #tensorflow #protocol-buffers #tfrecord
Вопрос:
Почему в руководстве https://www.tensorflow.org/tutorials/load_data/tfrecord#tfrecord_files_using_tfdata предлагается помещать в каждый список сообщения по одному значению, а не весь столбец целиком? Второй вариант (согласно моим измерениям) значительно экономнее по занимаемому месту. Что я упускаю?
Код до пунктирной линии скопирован с вышеупомянутого веб-сайта, после неё идёт мой собственный код. Один из пробных запусков дал 843898 для метода из руководства и 123994 для моего.
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example
# The following functions can be used to convert a value to a type compatible
# with tf.train.Example.
def _bytes_feature(value):
    """Wrap a single string / bytes value in a Feature holding a bytes_list."""
    eager_tensor_type = type(tf.constant(0))
    # BytesList won't unpack a string from an EagerTensor, so extract the
    # underlying value first when one is passed in.
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    wrapped = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=wrapped)
def _float_feature(value):
    """Wrap a single float / double value in a Feature holding a float_list."""
    wrapped = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=wrapped)
def _int64_feature(value):
    """Wrap a single bool / enum / int / uint value in a Feature holding an int64_list."""
    wrapped = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=wrapped)
# Size of the synthetic dataset (number of observations).
n_observations = 10_000

# feature0: boolean column, every entry drawn from {False, True}.
feature0 = np.random.choice([False, True], size=n_observations)

# feature1: integer column with values 0..4 (the upper bound 5 is exclusive).
feature1 = np.random.randint(low=0, high=5, size=n_observations)

# feature2: bytes column, obtained by using feature1 as an index into a
# fixed vocabulary of five animal names.
strings = np.array([b'cat', b'dog', b'chicken', b'horse', b'goat'])
feature2 = np.take(strings, feature1)

# feature3: float column sampled from a standard normal distribution.
feature3 = np.random.randn(n_observations)
def serialize_example(feature0, feature1, feature2, feature3):
    """Build a tf.train.Example from one value per feature and serialize it.

    Each argument is wrapped by the matching helper (`_int64_feature`,
    `_bytes_feature` or `_float_feature`) and stored under its feature name.

    Returns:
        The result of `SerializeToString()` on the assembled Example,
        ready to be written to a file.
    """
    # Feature name -> tf.train.Example-compatible value.
    feature = dict(
        feature0=_int64_feature(feature0),
        feature1=_int64_feature(feature1),
        feature2=_bytes_feature(feature2),
        feature3=_float_feature(feature3),
    )
    features_message = tf.train.Features(feature=feature)
    example_proto = tf.train.Example(features=features_message)
    return example_proto.SerializeToString()
#--------------------------
# Tutorial approach: serialize one "row" (observation) per tf.train.Example
# and accumulate the total serialized size, in bytes, over all observations.
length = 0
for row in zip(feature0, feature1, feature2, feature3):
    se = serialize_example(*row)
    # Add to the running total; a plain assignment here would keep only the
    # size of the last record rather than the size of the whole dataset.
    length += len(se)
# Report the total once, after every observation has been serialized.
print(length)
# Example Me: Dump the entire column of the corresponding feature into the respective list
def create_example2(feature0, feature1, feature2, feature3):
    """Serialize whole columns into a single tf.train.Example.

    Each argument is passed directly as the `value` of the corresponding
    list message, so the resulting Example holds every element of every
    column rather than one element per feature.

    Returns:
        The result of `SerializeToString()` on that single Example.
    """
    bool_column = Feature(int64_list=Int64List(value=feature0))
    int_column = Feature(int64_list=Int64List(value=feature1))
    bytes_column = Feature(bytes_list=BytesList(value=feature2))
    float_column = Feature(float_list=FloatList(value=feature3))

    # Feature name -> column-sized Feature message.
    feature = {
        'feature0': bool_column,
        'feature1': int_column,
        'feature2': bytes_column,
        'feature3': float_column,
    }
    features_message = tf.train.Features(feature=feature)
    return tf.train.Example(features=features_message).SerializeToString()
# Serialize all four columns into one Example and report its size in bytes.
example2 = create_example2(
    feature0,
    feature1,
    feature2,
    feature3,
)
print(len(example2))