#python #tensorflow #google-colaboratory #tpu
Question:
I am trying to classify cifar10 images on a Google Colab TPU, following the official tutorial.
However, I get the following error:
UnimplementedError: 6 root error(s) found.
Without the TPU I do not see any error. Could anyone share some tips?
My code is attached below.
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.vgg16 import VGG16
import tensorflow as tf
import numpy as np
import os
import tensorflow_datasets as tfds
# preparing TPU
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))
strategy = tf.distribute.TPUStrategy(resolver)
# download cifar10 data
ds_test, ds_train = tfds.load('cifar10', split=['test', 'train'], )
# Preprocess the images
def resize_with_crop(ip):
    image = ip['image']
    label = ip['label']
    image = tf.expand_dims(image, 0)
    label = tf.one_hot(label, 10)
    label = tf.expand_dims(label, 0)
    return (image, label)
ds_train_ = ds_train.map(resize_with_crop)
ds_test_ = ds_test.map(resize_with_crop)
with strategy.scope():
    model = VGG16(input_shape=(32, 32, 3), weights=None, classes=10)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(ds_train_,
                    batch_size=32,
                    steps_per_epoch=64,
                    epochs=1000,
                    validation_data=ds_test_,
                    shuffle=True,)
The error I got is below.
---------------------------------------------------------------------------
UnimplementedError Traceback (most recent call last)
<ipython-input-2-588bff080f0b> in <module>()
25 epochs = 1000,
26 validation_data = ds_test_,
---> 27 shuffle = True,)
28
29 '''
13 frames
/usr/local/lib/python3.7/dist-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1187 logs = tmp_logs # No error, now safe to assign to logs.
   1188         end_step = step + data_handler.step_increment
-> 1189 callbacks.on_train_batch_end(end_step, logs)
1190 if self.stop_training:
1191 break
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in on_train_batch_end(self, batch, logs)
433 """
434 if self._should_call_train_batch_hooks:
--> 435 self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
436
437 def on_test_batch_begin(self, batch, logs=None):
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in _call_batch_hook(self, mode, hook, batch, logs)
293 self._call_batch_begin_hook(mode, batch, logs)
294 elif hook == 'end':
--> 295 self._call_batch_end_hook(mode, batch, logs)
296 else:
297 raise ValueError('Unrecognized hook: {}'.format(hook))
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in _call_batch_end_hook(self, mode, batch, logs)
313 self._batch_times.append(batch_time)
314
--> 315 self._call_batch_hook_helper(hook_name, batch, logs)
316
317 if len(self._batch_times) >= self._num_batches_for_timing_check:
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in _call_batch_hook_helper(self, hook_name, batch, logs)
351 for callback in self.callbacks:
352 hook = getattr(callback, hook_name)
--> 353 hook(batch, logs)
354
355 if self._check_timing:
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in on_train_batch_end(self, batch, logs)
1026
1027 def on_train_batch_end(self, batch, logs=None):
-> 1028 self._batch_update_progbar(batch, logs)
1029
1030 def on_test_batch_end(self, batch, logs=None):
/usr/local/lib/python3.7/dist-packages/keras/callbacks.py in _batch_update_progbar(self, batch, logs)
1098 if self.verbose == 1:
1099 # Only block async when verbose = 1.
-> 1100 logs = tf_utils.sync_to_numpy_or_python_type(logs)
1101 self.progbar.update(self.seen, list(logs.items()), finalize=False)
1102
/usr/local/lib/python3.7/dist-packages/keras/utils/tf_utils.py in sync_to_numpy_or_python_type(tensors)
514 return t # Don't turn ragged or sparse tensors to NumPy.
515
--> 516 return tf.nest.map_structure(_to_single_numpy_or_python_type, tensors)
517
518
/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/nest.py in map_structure(func, *structure, **kwargs)
867
868 return pack_sequence_as(
--> 869 structure[0], [func(*x) for x in entries],
870 expand_composites=expand_composites)
871
/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/nest.py in <listcomp>(.0)
867
868 return pack_sequence_as(
--> 869 structure[0], [func(*x) for x in entries],
870 expand_composites=expand_composites)
871
/usr/local/lib/python3.7/dist-packages/keras/utils/tf_utils.py in _to_single_numpy_or_python_type(t)
510 def _to_single_numpy_or_python_type(t):
511 if isinstance(t, tf.Tensor):
--> 512 x = t.numpy()
513 return x.item() if np.ndim(x) == 0 else x
514 return t # Don't turn ragged or sparse tensors to NumPy.
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in numpy(self)
1092 """
1093 # TODO(slebedev): Consider avoiding a copy for non-CPU or remote tensors.
-> 1094 maybe_arr = self._numpy() # pylint: disable=protected-access
1095 return maybe_arr.copy() if isinstance(maybe_arr, np.ndarray) else maybe_arr
1096
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in _numpy(self)
1060 return self._numpy_internal()
1061 except core._NotOkStatusException as e: # pylint: disable=protected-access
-> 1062 six.raise_from(core._status_to_exception(e.code, e.message), None) # pylint: disable=protected-access
1063
1064 @property
/usr/local/lib/python3.7/dist-packages/six.py in raise_from(value, from_value)
UnimplementedError: 6 root error(s) found.
(0) Unimplemented: {{function_node __inference_train_function_127397}} File system scheme '[local]' not implemented (file: '/root/tensorflow_datasets/cifar10/3.0.2/cifar10-train.tfrecord-00000-of-00001')
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
[[IteratorGetNext_2]]
(1) Unimplemented: {{function_node __inference_train_function_127397}} File system scheme '[local]' not implemented (file: '/root/tensorflow_datasets/cifar10/3.0.2/cifar10-train.tfrecord-00000-of-00001')
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
[[IteratorGetNext_6]]
(2) Unimplemented: {{function_node __inference_train_function_127397}} File system scheme '[local]' not implemented (file: '/root/tensorflow_datasets/cifar10/3.0.2/cifar10-train.tfrecord-00000-of-00001')
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
[[IteratorGetNext_3]]
[[cluster_train_function/_execute_6_0/_187]]
(3) Unimplemented: {{function_node __inference_train_function_127397}} File system scheme '[local]' not implemented (file: '/root/tensorflow_datasets/cifar10/3.0.2/cifar10-train.tfrecord-00000-of-00001')
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
[[IteratorGetNext_3]]
[[tpu_compile_succeeded_assert/_17093395999373799140/_5/_159]]
(4) Unimplemented: {{function_node __inference_train_function_127397}} File system scheme '[local]' not implemented (file: '/root/tensorflow_datasets/cifar10/3.0.2/cifar10-train.tfrecord-00000-of-00001')
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
[[IteratorGetNext_3]]
[[tpu_compile_succeeded_assert/_17093395999373799140/_5/_111]]
(5) Unimplemented: {{function_node __inference_train_function_127397}} File system scheme '[local]' not implemented (file: '/root/tensorflow_datasets/cifar10/3.0.2/cifar10-train.tfrecord-00000-of-00001')
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
[[IteratorGetNext_3]]
0 successful operations.
3 derived errors ignored.
Answer #1:
If you look at the error, it says File system scheme '[local]' not implemented.
tfds does not always host every dataset; some of them are downloaded from the original source to your local machine, which the TPU cannot access.
Cloud TPUs can only access data in GCS, since only the GCS file system is registered. Please see https://cloud.google.com/tpu/docs/troubleshooting#cannot_use_local_filesystem for more details.
You can make tfds load the data into a GCS bucket (details here):
# Authenticate your account to access GCS.
from google.colab import auth
auth.authenticate_user()
...
# download cifar10 data to a gs bucket.
ds_test, ds_train = tfds.load('cifar10', split=['test', 'train'], try_gcs=True, data_dir="gs://YOUR_BUCKET_NAME")
Note that the recently introduced TPU VMs can access local files. You can create TPU VMs in GCP, but not yet in Colab/Kaggle.
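As a side note, for a dataset as small as cifar10 you can also sidestep the file system entirely by loading the whole dataset into host memory and feeding it from tensors, so the TPU workers never have to read local tfrecord files. A minimal sketch of that alternative (variable names such as ds_train_mem are illustrative, and the batch size of 128 is just an example):
import tensorflow as tf
import tensorflow_datasets as tfds

# Load the full splits into memory as NumPy arrays (cifar10 is small
# enough to fit comfortably in Colab RAM).
(train_images, train_labels), (test_images, test_labels) = [
    (d['image'], d['label'])
    for d in tfds.as_numpy(tfds.load('cifar10', split=['train', 'test'], batch_size=-1))
]

def to_model_inputs(image, label):
    # Same idea as the question's preprocessing, but applied to batched tensors.
    image = tf.cast(image, tf.float32)
    label = tf.one_hot(label, 10)
    return image, label

# Build in-memory tf.data pipelines; no local files are read at training
# time, so the "[local]" file system error cannot occur.
ds_train_mem = (tf.data.Dataset.from_tensor_slices((train_images, train_labels))
                .shuffle(10_000)
                .batch(128, drop_remainder=True)
                .map(to_model_inputs))
ds_test_mem = (tf.data.Dataset.from_tensor_slices((test_images, test_labels))
               .batch(128, drop_remainder=True)
               .map(to_model_inputs))
You would then call model.fit(ds_train_mem, validation_data=ds_test_mem, ...) as before. The trade-off is that the entire dataset must fit in memory, so for anything larger the GCS approach above remains the right answer.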