PyTorch Transformer: ValueError: Expected target size (2, 256), got torch.Size([2, 8, 256])

#python #machine-learning #pytorch #transformer

Question:

I am relatively new to transformers and thought that programming one from scratch in PyTorch would be a good exercise. I have already tested the model and it works. However, when implementing training for an English-to-French translation task, I get the error in the title while computing the loss.

The code of the training function is as follows:

     def train_(self, x, y, lr, steps, path=None):

        self.train()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)

        for epoch in range(steps):

            for batch_id, (batch_x, batch_y) in enumerate(zip(x, y)):

                if torch.cuda.is_available() and self.is_cuda:
                    batch_x = batch_x.cuda()
                    batch_y = batch_y.cuda()

                out = self(batch_x, batch_y)
                print(out.shape, batch_y.shape)

                # Embed batch_y so result is comparable
                batch_y = self.decoder.word_embedding(batch_y)
                print(batch_y.shape)

                loss = criterion(out, batch_y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                print(f"Training: epoch {epoch} batch {batch_id} loss {loss}")
 

The shape prints give the following output:

 torch.Size([2, 8, 256]) torch.Size([2, 8])
torch.Size([2, 8, 256])
 

Regarding the dimensions: I am using an embedding size of 256.
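
For reference, here is a minimal, self-contained sketch of the shapes nn.CrossEntropyLoss accepts (toy tensors only, not my model): the input is raw logits with the class dimension second, i.e. (N, C) or (N, C, d1, ...), and the target holds integer class indices of shape (N) or (N, d1, ...), with no embedding applied:

    import torch
    import torch.nn as nn

    criterion = nn.CrossEntropyLoss()

    batch, seq_len, vocab = 2, 8, 10                      # toy sizes, for illustration only
    logits = torch.randn(batch, seq_len, vocab)           # model output: (N, seq, vocab)
    target = torch.randint(0, vocab, (batch, seq_len))    # integer class ids: (N, seq)

    # Either move the class dimension to position 1 ...
    loss = criterion(logits.permute(0, 2, 1), target)     # (N, vocab, seq) vs. (N, seq)
    # ... or flatten the batch and sequence dimensions.
    loss = criterion(logits.reshape(-1, vocab), target.reshape(-1))  # (N*seq, vocab) vs. (N*seq)
    print(loss)

Both calls return a scalar loss; in neither case is the target embedded.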

If needed, I can also provide the whole code.

Thanks.

Edit:

Here is the whole code:

 # Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


# Hyper-parameters
lr = 0.001
steps = 1000


# Attention head
class AttentionHead(nn.Module):

    def __init__(self, embed_dim, head_dim):

        super(AttentionHead, self).__init__()

        self.embed_dim = embed_dim

        self.values_layer = nn.Linear(head_dim, head_dim, bias=False)
        self.keys_layer = nn.Linear(head_dim, head_dim, bias=False)
        self.queries_layer = nn.Linear(head_dim, head_dim, bias=False)

    def forward(self, values, keys, queries, mask=None):

        # Send them through the linear layers
        values = self.values_layer(values)
        keys = self.keys_layer(keys)
        queries = self.queries_layer(queries)

        # Multiply queries and keys to score matrix
        scores = torch.einsum("nah,nbh->nab", queries, keys)
        # Queries shape: (n, m, head_dim)
        # Keys shape: (n, m, head_dim)
        # Scores shape: (n, m, m)

        # If needed, then mask the score matrix
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-1e20"))

        # Scale the (masked) score matrix
        scaled = scores / (self.embed_dim ** (1 / 2))

        # Normalize the scaled score matrix
        attention = torch.softmax(scaled, dim=1)

        # Multiply scores and values to output
        out = torch.einsum("nab,nbh->nah", attention, values)
        # attention shape: (n, m, m)
        # values shape: (n, m, head_dim)
        # out shape: (n, m, head_dim)

        return out


# Multi head attention mechanism
class MultiHeadAttentionBlock(nn.Module):

    def __init__(self, embed_dim, head_num):

        super(MultiHeadAttentionBlock, self).__init__()

        self.embed_dim = embed_dim
        self.head_num = head_num
        self.head_dim = embed_dim // head_num
        assert (head_num * self.head_dim == embed_dim), \
            "Embed size is required to be divisible by heads."

        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, self.head_dim)
            for _ in range(head_num)]
        )

        self.out_layer = nn.Linear(embed_dim, embed_dim)

    def forward(self, values, keys, queries, mask=None):

        n = values.shape[0]  # Number of examples / batch size
        v_dim = values.shape[1]  # Quantity of embeddings
        k_dim = keys.shape[1]
        q_dim = queries.shape[1]

        # Split up the values, keys and queries
        values = values.reshape(n, v_dim, self.head_num, self.head_dim)
        keys = keys.reshape(n, k_dim, self.head_num, self.head_dim)
        queries = queries.reshape(n, q_dim, self.head_num, self.head_dim)

        # Iterate through heads
        for i, head in enumerate(self.heads):
            globals()[f"out{i}"] = head(values[:, :, i, :], keys[:, :, i, :], queries[:, :, i, :], mask)
            # out shape: (n, m, head_dim)

        # Concatenate the output of each head
        out = globals()[f"out{0}"]
        for i in range(self.head_num - 1):
            out = torch.cat((out, globals()[f"out{i + 1}"]), dim=2)
            # Out shape: (n, m, head_num * head_dim / embed_dim)

        # Send output through a last linear layer and return the outcome
        out = self.out_layer(out)
        return out


# Transformer block
class TransformerBlock(nn.Module):

    def __init__(self, embed_dim, head_num, dropout, forward_expansion):

        super(TransformerBlock, self).__init__()

        self.attention = MultiHeadAttentionBlock(embed_dim, head_num)

        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, forward_expansion * embed_dim),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_dim, embed_dim)
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, values, keys, queries, mask=None):

        attention = self.attention(values, keys, queries, mask)
        x = self.dropout(self.norm1(attention + queries))

        forward = self.feed_forward(x)
        x = self.dropout(self.norm2(forward + x))

        return x


# Encoder
class Encoder(nn.Module):

    def __init__(self, src_vocab_dim, embed_dim, head_num, block_num, dropout, forward_expansion, max_length, device):

        super(Encoder, self).__init__()

        self.device = device
        self.embed_dim = embed_dim
        self.word_embedding = nn.Embedding(src_vocab_dim, embed_dim)
        self.position_embedding = nn.Embedding(max_length, embed_dim)  # max_length: max word length of all data

        self.blocks = nn.ModuleList(
            [TransformerBlock(embed_dim, head_num, dropout, forward_expansion)
            for _ in range(block_num)]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):

        n, seq_length = x.shape  # (batch size, max word length of that batch)

        positions = torch.arange(0, seq_length).expand(n, seq_length).to(self.device)  # 0 - seq_length along dim 1
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        for block in self.blocks:
            x = block(x, x, x)

        return x


# Decoder block
class DecoderBlock(nn.Module):

    def __init__(self, embed_dim, head_num, dropout, forward_expansion):

        super(DecoderBlock, self).__init__()

        self.attention = MultiHeadAttentionBlock(embed_dim, head_num)
        self.norm = nn.LayerNorm(embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, head_num, dropout, forward_expansion)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, values, keys, mask):

        attention = self.attention(x, x, x, mask)
        # As the outputs of the decoder's first self attention block are the queries, the encoder's
        # output can be of different size. Only keys and values have to be identical in size.
        queries = self.dropout(self.norm(attention   x))

        x = self.transformer_block(values, keys, queries)

        return x


# Decoder
class Decoder(nn.Module):

    def __init__(self, trg_vocab_dim, embed_dim, head_num, block_num, dropout, forward_expansion, max_length, device):

        super(Decoder, self).__init__()

        self.device = device
        self.embed_dim = embed_dim
        self.word_embedding = nn.Embedding(trg_vocab_dim, embed_dim)
        self.position_embedding = nn.Embedding(max_length, embed_dim)

        self.blocks = nn.ModuleList(
            [DecoderBlock(embed_dim, head_num, dropout, forward_expansion)
             for _ in range(block_num)]
        )

        self.dropout = nn.Dropout(dropout)
        self.out_layer = nn.Linear(embed_dim, embed_dim)  # changed the second argument from trg_vocab_dim to embed_dim

    def forward(self, x, enc_out, mask):

        n, seq_length = x.shape

        positions = torch.arange(0, seq_length).expand(n, seq_length).to(self.device)  # 0 - seq_length along dim 1
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        for block in self.blocks:
            x = block(x, enc_out, enc_out, mask)

        x = self.out_layer(x)

        return x


# Transformer
class Transformer(nn.Module):

    def __init__(self, src_vocab_dim, trg_vocab_dim, embed_dim, head_num, block_num_enc, block_num_dec,
                 dropout, forward_expansion, max_length, device):

        super(Transformer, self).__init__()

        self.device = device

        self.encoder = Encoder(src_vocab_dim, embed_dim, head_num, block_num_enc, dropout, forward_expansion, max_length, device)
        self.decoder = Decoder(trg_vocab_dim, embed_dim, head_num, block_num_dec, dropout, forward_expansion, max_length, device)

    def make_mask(self, y):

        n, m = y.shape
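        # Lower-triangular (causal) mask of shape (n, m, m): position i may attend to positions <= i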
        mask = torch.tril(torch.ones((m, m))).expand(n, m, m)

        return mask.to(self.device)

    def forward(self, x, y):

        mask = self.make_mask(y)

        out_enc = self.encoder(x)
        out_dec = self.decoder(y, out_enc, mask)

        return out_dec

    def train_(self, x, y, lr, steps, path=None):

        self.train()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)

        for epoch in range(steps):

            for batch_id, (batch_x, batch_y) in enumerate(zip(x, y)):

                if torch.cuda.is_available() and self.is_cuda:
                    batch_x = batch_x.cuda()
                    batch_y = batch_y.cuda()

                out = self(batch_x, batch_y)
                print(out.shape, batch_y.shape)

                # Embed batch_y so result is comparable
                batch_y = self.decoder.word_embedding(batch_y)
                print(batch_y.shape)

                loss = criterion(out, batch_y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                print(f"Training: epoch {epoch} batch {batch_id} loss {loss}")

            if path is not None:
                torch.save(self, path)


# Run
if __name__ == "__main__":

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # use for normal run
    # x = torch.tensor([[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]]).to(device)  # input
    # y = torch.tensor([[1, 7, 4, 3, 5, 9, 2, 0], [1, 5, 6, 2, 4, 7, 6, 2]]).to(device)  # target

    # added one bracket for training so this is one batch
    x = torch.tensor([[[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]]]).to(device)  # input
    y = torch.tensor([[[1, 7, 4, 3, 5, 9, 2, 0], [1, 5, 6, 2, 4, 7, 6, 2]]]).to(device)  # target

    src_vocab_dim = 10
    trg_vocab_dim = 10

    model = Transformer(src_vocab_dim, trg_vocab_dim, embed_dim=256, head_num=8, block_num_enc=6, block_num_dec=6,
                 dropout=0, forward_expansion=4, max_length=100, device=device)

    model.train_(x, y, lr, steps)
 

Here is the full error traceback:

 Traceback (most recent call last):
  File "C:/Users/user/PycharmProjects/Transformer/Code.py", line 310, in <module>
    model.train_(x, y, lr, steps)
  File "C:/Users/user/PycharmProjects/Transformer/Code.py", line 279, in train_
    loss = criterion(out, batch_y)
  File "C:UsersuserAnaconda3envsTransformerlibsite-packagestorchnnmodulesmodule.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "C:UsersuserAnaconda3envsTransformerlibsite-packagestorchnnmodulesloss.py", line 948, in forward
    ignore_index=self.ignore_index, reduction=self.reduction)
  File "C:UsersuserAnaconda3envsTransformerlibsite-packagestorchnnfunctional.py", line 2422, in cross_entropy
    return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
  File "C:UsersuserAnaconda3envsTransformerlibsite-packagestorchnnfunctional.py", line 2228, in nll_loss
    out_size, target.size()))
ValueError: Expected target size (2, 256), got torch.Size([2, 8, 256])
 

Regarding your question:

My thinking was that the loss function cannot compare the output with the target unless the target is also embedded. Without the embedding, the shapes are as follows:

 torch.Size([2, 8])  # target
torch.Size([2, 8, 256])  # output
 
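
For comparison, here is a hedged sketch of the loss call with the target left as raw token ids (names taken from my training loop above); it assumes the model output had a vocabulary-sized last dimension, which would mean changing Decoder.out_layer, so it is not what my code currently does:

    # Hypothetical shapes, assuming Decoder.out_layer were nn.Linear(embed_dim, trg_vocab_dim):
    #   out:     (batch, seq_len, trg_vocab_dim)  -- raw logits
    #   batch_y: (batch, seq_len)                 -- integer token ids, not embedded
    loss = criterion(out.reshape(-1, out.shape[-1]), batch_y.reshape(-1))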

Comments:

1. Please provide the full error traceback and the model definition. Also, why are you embedding your label?

2. The way you are training the model itself is wrong. Try looking up the Huggingface transformers.
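
For what it is worth, a minimal sketch of comment 2's suggestion, assuming the transformers package and a small pretrained checkpoint (t5-small, used here purely as an example) are available:

    from transformers import pipeline

    # The pipeline wraps tokenization, the forward pass and decoding.
    translator = pipeline("translation_en_to_fr", model="t5-small")
    print(translator("The cat sits on the mat.")[0]["translation_text"])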