#python #gpu #tensorflow2.0 #tensorflow2
#python #графический процессор #tensorflow2.0 #tensorflow2
Вопрос:
tensorflow2.1 python3.7
Недавно я сменил свои инструменты глубокого обучения с PyTorch на TensorFlow2.
Когда я создаю регрессионную модель на основе LSTM, происходят странные вещи: данные numpy, которые я передаю через tf.data.Dataset, не удаётся преобразовать в тензоры на GPU.
Это код:
import logging

import numpy as np
import tensorflow
import tensorflow as tf
# Sample rows illustrating the data: every input row has a matching row of
# float labels with the same length.
data = [
    [1, 2, 3, 4],
    [1, 3, 2, 1],
]
labels = [
    [0.1, 0.2, 0.3, 0.4],
    [0.5, 0.5, 0.6, 0.5],
]
# then i build a dataset
class TermWeightDataset(object):
    """Serve padded (example, label) rows to TensorFlow through a generator.

    Args:
        data_file: Forwarded to ``read_data`` (defined outside this snippet).
        vocab: Forwarded to ``read_data``; also stored on the instance.
        batch_size: Number of examples per batch produced by
            ``create_datasets``.
        max_count: Upper bound for the padded sequence length; the smaller of
            this value and the one returned by ``read_data`` is used.
    """

    def __init__(self, data_file, vocab, batch_size=50, max_count=100):
        self.vocab = vocab
        self.batch_size = batch_size
        self.data, self.labels, self.max_count = read_data(data_file, self.vocab, True)
        # The length reported by read_data can be capped by hand via max_count.
        self.max_count = min(self.max_count, max_count)
        # Both arrays are padded (at the end) to exactly max_count columns.
        self.data = tf.keras.preprocessing.sequence.pad_sequences(
            self.data, maxlen=self.max_count, padding='post')
        self.labels = tf.keras.preprocessing.sequence.pad_sequences(
            self.labels, maxlen=self.max_count, padding='post', value=0.0, dtype="float64")

    def generate(self):
        """Yield each (example, label) pair as a pair of tensors."""
        for example, label in zip(self.data, self.labels):
            yield tf.identity(example), tf.identity(label)

    def _iter_rows(self):
        """Yield each (example, label) pair as the stored NumPy rows."""
        # Dataset.from_generator converts whatever the generator yields to the
        # declared output types itself, so wrapping every row in a tensor
        # first (as generate() does) only adds a per-example round trip.
        return zip(self.data, self.labels)

    def create_datasets(self):
        """Build the batched ``tf.data.Dataset`` of (int64, float32) pairs."""
        dataset = tf.data.Dataset.from_generator(self._iter_rows, (tf.int64, tf.float32))
        # Rows are already padded to max_count in __init__; padded_batch is
        # kept so the batching call and padded_shapes stay as before.
        # NOTE(review): Dataset.from_tensor_slices((self.data, self.labels))
        # followed by .batch(self.batch_size, drop_remainder=True) is an
        # in-memory alternative that avoids the Python generator entirely.
        return dataset.padded_batch(
            self.batch_size, padded_shapes=(self.max_count, self.max_count))
# model
class LSTMBasedModel(tf.keras.Model):
    """Embedding -> bidirectional LSTM -> dense projection.

    Args:
        vocab_size: Number of rows of the embedding table.
        input_dim: Width of each embedding vector.
        hiddien_dim: Number of units of the LSTM wrapped by ``Bidirectional``.
        output_dim: Number of units of the final dense layer.
        embedding_matrix: Optional ``np.ndarray`` used as the constant initial
            value of the embedding table; any other value (including ``None``)
            leaves the layer's default initializer in place.
    """

    def __init__(self, vocab_size, input_dim, hiddien_dim, output_dim, embedding_matrix=None):
        super().__init__()
        self.vocab_size = vocab_size
        self.input_dim = input_dim
        self.hidden_dim = hiddien_dim
        self.output_dim = output_dim

        # Only a NumPy matrix is accepted as pre-trained weights.
        extra_embedding_args = {}
        if isinstance(embedding_matrix, np.ndarray):
            extra_embedding_args["embeddings_initializer"] = (
                tf.keras.initializers.Constant(embedding_matrix))
        self.embedding = tf.keras.layers.Embedding(
            self.vocab_size, self.input_dim, **extra_embedding_args)

        # return_sequences=False: only the final hidden state is returned.
        recurrent_cell = tf.keras.layers.LSTM(hiddien_dim, return_sequences=False)
        self.rnn = tf.keras.layers.Bidirectional(recurrent_cell)
        self.dense = tf.keras.layers.Dense(units=self.output_dim)

    def call(self, inputs):
        """Embed ``inputs``, encode them with the LSTM and project the state."""
        return self.dense(self.rnn(self.embedding(inputs)))
Затем я обучаю свою модель в режиме немедленного выполнения (eager mode); код обучения выглядит следующим образом:
def train_one_step(model, inputs_batch, labels_batch, loss_function, optimizer):
    """Run a single optimisation step on one batch.

    Args:
        model: Callable model exposing ``trainable_variables``.
        inputs_batch: Batch passed to ``model``.
        labels_batch: Targets passed to ``loss_function``.
        loss_function: Callable invoked as ``loss_function(logits, labels)``.
        optimizer: Object exposing ``apply_gradients``.

    Returns:
        Tuple ``(logits, loss)``: the model output for the batch and the loss
        reduced to its mean.
    """
    with tf.GradientTape() as tape:
        logits = model(inputs_batch)
        # NOTE(review): Keras losses are documented as (y_true, y_pred); here
        # the predictions are passed first. The order is kept as in the
        # original - confirm against the loss_function actually used.
        loss = loss_function(logits, labels_batch)
        loss = tf.reduce_mean(loss)
    # Differentiate and update the trainable weights only: model.variables
    # also contains non-trainable variables, which have no gradient.
    trainable = model.trainable_variables
    grads = tape.gradient(loss, trainable)
    optimizer.apply_gradients(grads_and_vars=zip(grads, trainable))
    return logits, loss
def train(model, dataset, epochs, loss_function, optimizer, ckpt, writer):
    """Train ``model`` for ``epochs`` passes over ``dataset``.

    Args:
        model: Model forwarded to ``train_one_step``.
        dataset: Iterable of ``(inputs_batch, labels_batch)`` pairs.
        epochs: Number of passes over ``dataset``.
        loss_function: Forwarded to ``train_one_step``.
        optimizer: Forwarded to ``train_one_step``.
        ckpt: Object whose ``save()`` is called without arguments.
            NOTE(review): presumably a ``tf.train.CheckpointManager`` - confirm.
        writer: Summary writer used through ``writer.as_default()``.
    """
    # Global batch counter; it keeps growing across epochs.
    step = 0
    for epoch in range(epochs):
        # The best loss is tracked per epoch, so it is reset here.
        min_loss = float("inf")
        all_loss = 0
        size = 0
        for inputs_batch, labels_batch in dataset:
            logits, loss = train_one_step(model, inputs_batch, labels_batch, loss_function, optimizer)
            loss_value = loss.numpy()
            # Accumulate: plain assignment here would pin step and size to 1
            # and make the epoch loss equal to the last batch's loss.
            step += 1
            size += 1
            all_loss += loss_value
            if step % 100 == 0:
                logging.info("epoch:{}, step:{}, loss:{:.4f}".format(epoch, step, loss_value))
                with writer.as_default():
                    tf.summary.scalar("batch loss", loss, step=step)
            # NOTE(review): the source lost its indentation; this check is
            # assumed to run for every batch rather than every 100 steps.
            if min_loss > loss_value:
                min_loss = loss_value
                ckpt.save()
                logging.info("model save:{}".format(step))
        if size == 0:
            # Nothing was iterated; there is no mean loss to report.
            logging.warning("epoch:{}, dataset yielded no batches".format(epoch))
            continue
        epoch_loss = all_loss / size
        with writer.as_default():
            logging.info("epoch:{}, epoch loss:{:.4f}".format(epoch, epoch_loss))
            tf.summary.scalar("epoch loss", epoch_loss, step=epoch)
Затем мой код запускается, но всегда работает на CPU, а не на GPU: загрузка GPU всегда равна 0.
Но когда я подаю случайные данные (np.random.rand(100,22)), графический процессор работает.
Извините за недостающую информацию журнала.
Обновление:
2020-09-09 15:41:20.795645: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-09-09 15:41:25.905156: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libnvinfer.so.6
2020-09-09 15:41:25.935928: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvrtc.so.10.2: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/nvidia/lib64/:/home/hdp-map/cuda-10.0/lib64
2020-09-09 15:41:25.935988: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2020-09-09 15:41:32.316385: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-09-09 15:41:32.384586: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1555] Found device 0 with properties:
pciBusID: 0000:86:00.0 name: Tesla K80 computeCapability: 3.7
coreClock: 0.8235GHz coreCount: 13 deviceMemorySize: 11.92GiB deviceMemoryBandwidth: 223.96GiB/s
2020-09-09 15:41:32.384654: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-09-09 15:41:32.384705: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-09-09 15:41:32.502781: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10
2020-09-09 15:41:32.738911: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10
2020-09-09 15:41:32.932498: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10
2020-09-09 15:41:33.036902: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10
2020-09-09 15:41:33.037047: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-09-09 15:41:33.051746: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1697] Adding visible gpu devices: 0
2020-09-09 15:41:33.171633: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2399770000 Hz
2020-09-09 15:41:33.173189: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x562238f2c200 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-09-09 15:41:33.173261: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
2020-09-09 15:41:33.272368: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x56223790cb60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2020-09-09 15:41:33.272459: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla K80, Compute Capability 3.7
2020-09-09 15:41:33.274399: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1555] Found device 0 with properties:
pciBusID: 0000:86:00.0 name: Tesla K80 computeCapability: 3.7
coreClock: 0.8235GHz coreCount: 13 deviceMemorySize: 11.92GiB deviceMemoryBandwidth: 223.96GiB/s
2020-09-09 15:41:33.274461: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-09-09 15:41:33.274494: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-09-09 15:41:33.274533: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10
2020-09-09 15:41:33.274559: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10
2020-09-09 15:41:33.274584: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10
2020-09-09 15:41:33.274618: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10
2020-09-09 15:41:33.274642: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-09-09 15:41:33.277794: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1697] Adding visible gpu devices: 0
2020-09-09 15:41:33.277896: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-09-09 15:41:39.052053: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1096] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-09-09 15:41:39.052127: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1102] 0
2020-09-09 15:41:39.052147: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] 0: N
2020-09-09 15:41:39.068973: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1241] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11483 MB memory) -> physical GPU (device: 0, name: Tesla K80, pci bus id: 0000:86:00.0, compute capability: 3.7)
run.sh: line 13: 21205 Killed /opt/conda/bin/python3 main.py --data data/ --vocab misc/vocab.txt --tensorboard_dir log/ --batch_size 64 --embedding_path misc/glove.vec.txt --lr 0.0001 --output_dir ckpt/ --epochs 20 --gpus 0
Комментарии:
1. Можете ли вы опубликовать полный вывод журнала вашей программы по мере ее запуска? (Все эти информационные сообщения из tensorflow могут содержать некоторую полезную информацию)
2. Я добавил в вопрос полный журнал