Почему предлагается хранить в списках tf.train.Example отдельные значения, а не целые столбцы?

#python #tensorflow #protocol-buffers #tfrecord

Вопрос:

Почему в руководстве https://www.tensorflow.org/tutorials/load_data/tfrecord#tfrecord_files_using_tfdata предлагается помещать в сообщение-список одно значение вместо всего столбца? Хранение целого столбца (согласно моим измерениям) значительно экономичнее по объёму. Что я упускаю?

Код до пунктирной линии скопирован с вышеупомянутого сайта, после неё идёт мой собственный код. Один пробный запуск дал суммарный размер 843898 байт для способа из руководства и 123994 байта для моего.

 import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example



# The following functions can be used to convert a value to a type compatible
# with tf.train.Example.

def _bytes_feature(value):
  """Wrap a single string / bytes value in a tf.train.Feature holding a BytesList.

  If `value` is an eager tensor it is first converted with `.numpy()`,
  because BytesList won't unpack a string from an EagerTensor.
  """
  eager_tensor_type = type(tf.constant(0))
  if isinstance(value, eager_tensor_type):
    value = value.numpy()
  wrapped = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=wrapped)

def _float_feature(value):
  """Wrap a single float / double in a tf.train.Feature holding a FloatList."""
  wrapped = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint in a tf.train.Feature holding an Int64List."""
  wrapped = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=wrapped)

# Size of the synthetic dataset: every feature column below has this many rows.
n_observations = 10_000

# feature0: boolean column, each entry drawn from {False, True}.
feature0 = np.random.choice([False, True], size=n_observations)

# feature1: integer column with values in 0..4 (upper bound 5 is exclusive).
feature1 = np.random.randint(low=0, high=5, size=n_observations)

# feature2: byte-string column; feature1 is used as an index into `strings`,
# so each integer maps to exactly one animal name.
strings = np.array([b'cat', b'dog', b'chicken', b'horse', b'goat'])
feature2 = np.take(strings, feature1)

# feature3: float column sampled from a standard normal distribution.
feature3 = np.random.standard_normal(size=n_observations)


def serialize_example(feature0, feature1, feature2, feature3):
  """Serialize one observation (one value per feature) as a tf.train.Example.

  feature0 and feature1 are stored as int64 features, feature2 as a bytes
  feature and feature3 as a float feature. Returns the serialized message
  produced by `SerializeToString()`.
  """
  # Map each feature name to its tf.train.Example-compatible wrapper.
  named_features = {
      'feature0': _int64_feature(feature0),
      'feature1': _int64_feature(feature1),
      'feature2': _bytes_feature(feature2),
      'feature3': _float_feature(feature3),
  }

  # Nest the mapping into Features -> Example and serialize the result.
  features_msg = tf.train.Features(feature=named_features)
  return tf.train.Example(features=features_msg).SerializeToString()

#--------------------------
# Total serialized size, in bytes, of all row-wise tf.train.Example messages.
length = 0

# TensorFlow tutorial approach: convert one "row" per iteration to the
# TFRecord format and add its serialized size to the running total.
for i in range(n_observations):
  se = serialize_example(feature0[i], feature1[i], feature2[i], feature3[i])
  # Accumulate with `+=`; a plain assignment would keep only the size of the
  # last record instead of the sum over all observations.
  length += len(se)

print(length)

# Example Me: Dump the entire column of the corresponding feature into the respective list
def create_example2(feature0, feature1, feature2, feature3):
  """Serialize whole feature columns into a single tf.train.Example.

  Each argument is passed as-is as the `value` of the corresponding list
  message: feature0 and feature1 go into Int64List, feature2 into BytesList
  and feature3 into FloatList. Returns the serialized message produced by
  `SerializeToString()`.
  """
  column0 = Feature(int64_list=Int64List(value=feature0))
  column1 = Feature(int64_list=Int64List(value=feature1))
  column2 = Feature(bytes_list=BytesList(value=feature2))
  column3 = Feature(float_list=FloatList(value=feature3))

  named_columns = {
    'feature0': column0,
    'feature1': column1,
    'feature2': column2,
    'feature3': column3,
    }
  features_msg = tf.train.Features(feature=named_columns)
  return tf.train.Example(features=features_msg).SerializeToString()

# Column-wise approach: one serialized Example holding every column in full.
example2 = create_example2(
    feature0=feature0,
    feature1=feature1,
    feature2=feature2,
    feature3=feature3,
)

# Size in bytes of that single serialized message.
print(len(example2))