#python #machine-learning #pytorch #transformer
Question:
I'm relatively new to Transformers and thought that coding one from scratch in PyTorch would be a good exercise. I have already tested the model and it worked. However, when implementing training for an English-to-French translation task, I get the error below while computing the loss.
The code of the training function looks like this:
def train_(self, x, y, lr, steps, path=None):
    self.train()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(self.parameters(), lr=lr)

    for epoch in range(steps):
        for batch_id, (batch_x, batch_y) in enumerate(zip(x, y)):
            if torch.cuda.is_available() and self.is_cuda:
                batch_x = batch_x.cuda()
                batch_y = batch_y.cuda()

            out = self(batch_x, batch_y)

            print(out.shape, batch_y.shape)
            # Embed batch_y so result is comparable
            batch_y = self.decoder.word_embedding(batch_y)
            print(batch_y.shape)

            loss = criterion(out, batch_y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            print(f"Training: epoch {epoch} batch {batch_id} loss {loss}")
The shape prints produce the following output:
torch.Size([2, 8, 256]) torch.Size([2, 8])
torch.Size([2, 8, 256])
Regarding dimensions, I am using an embedding size of 256.
If needed, I can also provide the whole code.
Thanks.
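For context, a minimal sketch of the shapes nn.CrossEntropyLoss expects (illustrative sizes only, assuming a 10-token vocabulary): the target must hold integer class indices, and with a 3-D input dimension 1 is treated as the class dimension, which is exactly where the mismatch below comes from.

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()

# 2-D case: input is (batch, num_classes) logits, target is (batch,) integer class indices.
logits = torch.randn(2, 10)           # 2 samples, 10 classes
target = torch.tensor([3, 7])         # class indices, NOT embeddings
print(criterion(logits, target))

# 3-D case: input is (batch, num_classes, seq_len), target is (batch, seq_len).
# A (2, 8, 256) input is therefore read as 2 samples, 8 classes and 256 positions,
# so a target of shape (2, 256) is expected - hence the error further down.
logits_seq = torch.randn(2, 10, 8)    # class dimension must be dim 1
target_seq = torch.randint(0, 10, (2, 8))
print(criterion(logits_seq, target_seq))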
Edit:
Here is the whole code:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Hyper-parameters
lr = 0.001
steps = 1000
# Attention head
class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super(AttentionHead, self).__init__()
        self.embed_dim = embed_dim
        self.values_layer = nn.Linear(head_dim, head_dim, bias=False)
        self.keys_layer = nn.Linear(head_dim, head_dim, bias=False)
        self.queries_layer = nn.Linear(head_dim, head_dim, bias=False)

    def forward(self, values, keys, queries, mask=None):
        # Send them through the linear layers
        values = self.values_layer(values)
        keys = self.keys_layer(keys)
        queries = self.queries_layer(queries)

        # Multiply queries and keys to score matrix
        scores = torch.einsum("nah,nbh->nab", queries, keys)
        # Keys shape: (n, m, head_dim)
        # Queries shape: (n, m, heads_dim)
        # Score shape: (n, m, m)

        # If needed, then mask the score matrix
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-1e20"))

        # Scale the (masked) score matrix
        scaled = scores / (self.embed_dim ** (1 / 2))

        # Normalize the scaled score matrix
        attention = torch.softmax(scaled, dim=1)

        # Multiply scores and values to output
        out = torch.einsum("nab,nbh->nah", attention, values)
        # attention shape: (n, m, m)
        # values shape: (n, m, head_dim)
        # out shape: (n, m, head_dim)

        return out
# Multi head attention mechanism
class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, embed_dim, head_num):
        super(MultiHeadAttentionBlock, self).__init__()
        self.embed_dim = embed_dim
        self.head_num = head_num
        self.head_dim = embed_dim // head_num

        assert (head_num * self.head_dim == embed_dim), \
            "Embed size is required to be dividable by heads."

        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, self.head_dim)
             for _ in range(head_num)]
        )

        self.out_layer = nn.Linear(embed_dim, embed_dim)

    def forward(self, values, keys, queries, mask=None):
        n = values.shape[0]      # Number of examples / batch size
        v_dim = values.shape[1]  # Quantity of embeddings
        k_dim = keys.shape[1]
        q_dim = queries.shape[1]

        # Split up the values, keys and queries
        values = values.reshape(n, v_dim, self.head_num, self.head_dim)
        keys = keys.reshape(n, k_dim, self.head_num, self.head_dim)
        queries = queries.reshape(n, q_dim, self.head_num, self.head_dim)

        # Iterate through heads
        for i, head in enumerate(self.heads):
            globals()[f"out{i}"] = head(values[:, :, i, :], keys[:, :, i, :], queries[:, :, i, :], mask)
            # out shape: (n, m, head_dim)

        # Concatenate the output of each head
        out = globals()[f"out{0}"]
        for i in range(self.head_num - 1):
            out = torch.cat((out, globals()[f"out{i + 1}"]), dim=2)
        # Out shape: (n, m, head_num * head_dim / embed_dim)

        # Send output through a last linear layer and return the outcome
        out = self.out_layer(out)
        return out
# Transformer block
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, head_num, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = MultiHeadAttentionBlock(embed_dim, head_num)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, forward_expansion * embed_dim),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_dim, embed_dim)
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, values, keys, queries, mask=None):
        attention = self.attention(values, keys, queries, mask)
        x = self.dropout(self.norm1(attention + queries))
        forward = self.feed_forward(x)
        x = self.dropout(self.norm2(forward + x))
        return x
# Encoder
class Encoder(nn.Module):
    def __init__(self, src_vocab_dim, embed_dim, head_num, block_num, dropout, forward_expansion, max_length, device):
        super(Encoder, self).__init__()
        self.device = device
        self.embed_dim = embed_dim
        self.word_embedding = nn.Embedding(src_vocab_dim, embed_dim)
        self.position_embedding = nn.Embedding(max_length, embed_dim)  # max_length: max word length of all data

        self.blocks = nn.ModuleList(
            [TransformerBlock(embed_dim, head_num, dropout, forward_expansion)
             for _ in range(block_num)]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        n, seq_length = x.shape  # (batch size, max word length of that batch)
        positions = torch.arange(0, seq_length).expand(n, seq_length).to(self.device)  # 0 - seq_length along dim 1
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        for block in self.blocks:
            x = block(x, x, x)

        return x
# Decoder block
class DecoderBlock(nn.Module):
    def __init__(self, embed_dim, head_num, dropout, forward_expansion):
        super(DecoderBlock, self).__init__()
        self.attention = MultiHeadAttentionBlock(embed_dim, head_num)
        self.norm = nn.LayerNorm(embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, head_num, dropout, forward_expansion)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, values, keys, mask):
        attention = self.attention(x, x, x, mask)
        # As the outputs of the decoder's first self attention block are the queries, the encoder's
        # output can be of different size. Only keys and values have to be identical in size.
        queries = self.dropout(self.norm(attention + x))
        x = self.transformer_block(values, keys, queries)
        return x
# Decoder
class Decoder(nn.Module):
    def __init__(self, trg_vocab_dim, embed_dim, head_num, block_num, dropout, forward_expansion, max_length, device):
        super(Decoder, self).__init__()
        self.device = device
        self.embed_dim = embed_dim
        self.word_embedding = nn.Embedding(trg_vocab_dim, embed_dim)
        self.position_embedding = nn.Embedding(max_length, embed_dim)

        self.blocks = nn.ModuleList(
            [DecoderBlock(embed_dim, head_num, dropout, forward_expansion)
             for _ in range(block_num)]
        )

        self.dropout = nn.Dropout(dropout)
        self.out_layer = nn.Linear(embed_dim, embed_dim)  # changed embed_dim (second time in bracket) from trg_vocab_dim

    def forward(self, x, enc_out, mask):
        n, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(n, seq_length).to(self.device)  # 0 - seq_length along dim 1
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        for block in self.blocks:
            x = block(x, enc_out, enc_out, mask)

        x = self.out_layer(x)
        return x
# Transformer
class Transformer(nn.Module):
    def __init__(self, src_vocab_dim, trg_vocab_dim, embed_dim, head_num, block_num_enc, block_num_dec,
                 dropout, forward_expansion, max_length, device):
        super(Transformer, self).__init__()
        self.device = device
        self.encoder = Encoder(src_vocab_dim, embed_dim, head_num, block_num_enc, dropout, forward_expansion, max_length, device)
        self.decoder = Decoder(trg_vocab_dim, embed_dim, head_num, block_num_dec, dropout, forward_expansion, max_length, device)

    def make_mask(self, y):
        n, m = y.shape
        mask = torch.tril(torch.ones((m, m))).expand(n, m, m)
        return mask.to(self.device)

    def forward(self, x, y):
        mask = self.make_mask(y)
        out_enc = self.encoder(x)
        out_dec = self.decoder(y, out_enc, mask)
        return out_dec

    def train_(self, x, y, lr, steps, path=None):
        self.train()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)

        for epoch in range(steps):
            for batch_id, (batch_x, batch_y) in enumerate(zip(x, y)):
                if torch.cuda.is_available() and self.is_cuda:
                    batch_x = batch_x.cuda()
                    batch_y = batch_y.cuda()

                out = self(batch_x, batch_y)

                print(out.shape, batch_y.shape)
                # Embed batch_y so result is comparable
                batch_y = self.decoder.word_embedding(batch_y)
                print(batch_y.shape)

                loss = criterion(out, batch_y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                print(f"Training: epoch {epoch} batch {batch_id} loss {loss}")

        if path is not None:
            torch.save(self, path)
# Run
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # use for normal run
    # x = torch.tensor([[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]]).to(device)  # input
    # y = torch.tensor([[1, 7, 4, 3, 5, 9, 2, 0], [1, 5, 6, 2, 4, 7, 6, 2]]).to(device)  # target

    # added one bracket for training so this is one batch
    x = torch.tensor([[[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]]]).to(device)  # input
    y = torch.tensor([[[1, 7, 4, 3, 5, 9, 2, 0], [1, 5, 6, 2, 4, 7, 6, 2]]]).to(device)  # target

    src_vocab_dim = 10
    trg_vocab_dim = 10

    model = Transformer(src_vocab_dim, trg_vocab_dim, embed_dim=256, head_num=8, block_num_enc=6, block_num_dec=6,
                        dropout=0, forward_expansion=4, max_length=100, device=device)

    model.train_(x, y, lr, steps)
Here is the full error traceback:
Traceback (most recent call last):
File "C:/Users/user/PycharmProjects/Transformer/Code.py", line 310, in <module>
model.train_(x, y, lr, steps)
File "C:/Users/user/PycharmProjects/Transformer/Code.py", line 279, in train_
loss = criterion(out, batch_y)
File "C:UsersuserAnaconda3envsTransformerlibsite-packagestorchnnmodulesmodule.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "C:UsersuserAnaconda3envsTransformerlibsite-packagestorchnnmodulesloss.py", line 948, in forward
ignore_index=self.ignore_index, reduction=self.reduction)
File "C:UsersuserAnaconda3envsTransformerlibsite-packagestorchnnfunctional.py", line 2422, in cross_entropy
return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
File "C:UsersuserAnaconda3envsTransformerlibsite-packagestorchnnfunctional.py", line 2228, in nll_loss
out_size, target.size()))
ValueError: Expected target size (2, 256), got torch.Size([2, 8, 256])
Regarding your question:
My thinking was that the loss function cannot compare the output with the target unless the target is embedded as well. Without the embedding, the shapes are as follows:
torch.Size([2, 8]) # target
torch.Size([2, 8, 256]) # output
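For illustration, a minimal runnable sketch of one way the loss could be computed instead, assuming the decoder's out_layer maps embed_dim to trg_vocab_dim (as the code comment says it originally did) and that batch_y is left as integer token ids rather than being embedded:

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()

batch_size, seq_len, trg_vocab_dim = 2, 8, 10  # illustrative sizes taken from the post
out = torch.randn(batch_size, seq_len, trg_vocab_dim, requires_grad=True)  # stand-in for decoder logits
batch_y = torch.randint(0, trg_vocab_dim, (batch_size, seq_len))           # integer token ids, not embedded

loss = criterion(
    out.reshape(-1, trg_vocab_dim),  # (batch * seq_len, trg_vocab_dim)
    batch_y.reshape(-1)              # (batch * seq_len,)
)
loss.backward()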
Comments:
1. Please provide the full error traceback and the model definition. Also, why are you embedding your label?
2. The way you are training the model itself is wrong. Try looking into the Huggingface Transformers library.
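Regarding comment 2, a rough sketch of the kind of high-level API being suggested, assuming the Huggingface transformers package and a pretrained English-to-French checkpoint such as Helsinki-NLP/opus-mt-en-fr (exact call signatures may differ between library versions):

# Rough sketch only - requires `pip install transformers` and downloads the checkpoint.
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Tokenize source and target sentences (text_target needs a recent transformers version).
batch = tokenizer(["How are you?"], text_target=["Comment allez-vous ?"], return_tensors="pt")

# Seq2seq models compute the cross-entropy loss internally when labels are provided.
outputs = model(input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["labels"])
print(outputs.loss)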