#python #pytorch #gpu #bert-language-model #tpu
Question:
I am trying to do fine-tuning with a BERT model. I am using a pre-trained BERT model with PyTorch.
The problem is that the GPU result and the TPU result differ slightly (the accuracy differs by about -2% ~ +2%).
I used the same dataset and the same seed. The difference between the two setups is as follows:
```
device:          GPU (RTX 2080 Ti * 4)  vs  TPU v2 (1 core)
pytorch version: torch 1.5 (GPU)        vs  torch 1.10 & torch_xla 1.10 (TPU)
```

In the code of the TPU setting, I modified some lines:

```python
# 1. set the device
self.device = xm.xla_device()

# 2. optimizer
xm.optimizer_step(self.optimizer, barrier=True)
```
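To clarify modification #2: as far as I understand torch_xla 1.10, on a single TPU core `xm.optimizer_step(optimizer, barrier=True)` is roughly equivalent to the sketch below (the gradient all-reduce is effectively a no-op with one replica; the tiny model here is only for illustration):

```python
import torch
import torch_xla.core.xla_model as xm

device = xm.xla_device()
model = torch.nn.Linear(4, 2).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

loss = model(torch.randn(8, 4, device=device)).sum()
loss.backward()

# xm.optimizer_step(optimizer, barrier=True) is approximately:
xm.reduce_gradients(optimizer)  # all-reduce gradients across replicas (no-op on 1 core)
optimizer.step()                # the usual parameter update
xm.mark_step()                  # barrier=True: materialize the lazy XLA graph now
```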
I set the PyTorch version of the TPU setup to 1.10 because the Google VM and TPU do not offer torch 1.5.
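Regarding the "same seed" above, this is roughly what I mean by seeding both runs (a minimal sketch; the `set_seed` helper is illustrative and not part of the code below, and `xm.set_rng_state` applies only to the TPU run):

```python
import os
import random

import numpy as np
import torch


def set_seed(seed: int, use_tpu: bool = False):
    """Fix every RNG the training loop touches (illustrative helper)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)            # CPU RNG
    torch.cuda.manual_seed_all(seed)   # all GPU RNGs (no-op without CUDA)
    if use_tpu:
        import torch_xla.core.xla_model as xm
        xm.set_rng_state(seed)         # XLA device RNG
    os.environ["PYTHONHASHSEED"] = str(seed)
```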
And the full code of the training class looks like this:
```python
from logging import Logger

import torch
from sklearn.metrics import accuracy_score, classification_report
from torch import nn
from torch.optim.adamw import AdamW
from torch.utils.data.dataloader import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup

from tasks.cola.config import TrainConfig
from tasks.cola.model import COLAModel


class Trainer:
    def __init__(
        self,
        config: TrainConfig,
        model: COLAModel,
        train_data_loader: DataLoader,
        dev_data_loader: DataLoader,
        test_data_loader: DataLoader,
        logger: Logger,
        summary_writer: SummaryWriter,
    ):
        self.config = config

        if config.use_tpu:
            import torch_xla
            import torch_xla.core.xla_model as xm  # for using tpu

            self.device = xm.xla_device()
            self.model = model
            print('TPU running...')
        else:
            # multi gpu (4)
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            if (self.device.type == 'cuda') and (torch.cuda.device_count() > 1):
                print('Multi GPU({}) activate'.format(torch.cuda.device_count()))
                self.model = nn.DataParallel(model, device_ids=[0, 1, 2, 3])
            else:
                self.model = model

        self.model.to(self.device)

        self.train_data_loader = train_data_loader
        self.dev_data_loader = dev_data_loader
        self.test_data_loader = test_data_loader
        self.logger = logger
        self.summary_writer = summary_writer

        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = AdamW(model.parameters(), lr=config.learning_rate)

        self.steps_per_epoch = len(train_data_loader)
        self.total_steps = self.steps_per_epoch * config.num_epochs
        self.warmup_steps = config.warmup_step_ratio * self.total_steps
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.total_steps,
        )

        self.global_step = 0

    def train(self):
        # train
        self.logger.info("========== train ==========")
        self.logger.info(f"device : {self.device}")
        self.logger.info(f"dataset length/ train : {len(self.train_data_loader.dataset)}")
        self.logger.info(f"dataset length/ dev : {len(self.dev_data_loader.dataset)}")
        self.logger.info(f"dataset length/ test : {len(self.test_data_loader.dataset)}")
        self.logger.info(f"batch size : {self.config.batch_size}")
        self.logger.info(f"learning rate : {self.config.learning_rate}")
        self.logger.info(f"dropout prob : {self.config.dropout_prob}")
        self.logger.info(f"total epoch : {self.config.num_epochs}")
        self.logger.info(f"steps per epoch : {self.steps_per_epoch}")
        self.logger.info(f"total steps : {self.total_steps}")
        self.logger.info(f"warmup steps : {self.warmup_steps}\n")

        for epoch in range(self.config.num_epochs):
            running_loss = 0.0
            train_targets = []
            train_predictions = []

            for step, data in enumerate(tqdm(self.train_data_loader)):
                self.model.train()
                self.global_step += 1

                input_token_ids = data[0].to(self.device)
                attention_mask = data[1].to(self.device)
                token_type_ids = data[2].to(self.device)
                labels = data[3].to(self.device)

                loss, outputs = self._train_step(input_token_ids, attention_mask, token_type_ids, labels)
                running_loss += loss
                train_targets.extend(labels.tolist())
                train_predictions.extend(outputs.argmax(-1).tolist())

                if (step + 1) % self.config.logging_interval == 0:
                    train_loss = running_loss / self.config.logging_interval
                    train_acc = accuracy_score(train_targets, train_predictions)
                    self.logger.info(f"Epoch {epoch}, Step {step + 1}\t| Loss {train_loss:.4f} Acc {train_acc:.4f}")
                    self.summary_writer.add_scalar("cola/train/loss", train_loss, self.global_step)
                    self.summary_writer.add_scalar("cola/train/accuracy", train_acc, self.global_step)

                    running_loss = 0.0
                    train_targets = []
                    train_predictions = []

            # dev every epoch
            dev_loss, dev_targets, dev_predictions = self._validation(self.dev_data_loader)
            dev_report = classification_report(dev_targets, dev_predictions, digits=4)
            self.logger.info(f"######### DEV REPORT #EP{epoch} #########")
            self.logger.info(f"Loss {dev_loss:.4f}")
            self.logger.info(f"\n{dev_report}")
            dev_acc = accuracy_score(dev_targets, dev_predictions)
            self.summary_writer.add_scalar("cola/dev/loss", dev_loss, self.global_step)
            self.summary_writer.add_scalar("cola/dev/accuracy", dev_acc, self.global_step)

            # test every epoch
            test_loss, test_targets, test_predictions = self._validation(self.test_data_loader)
            test_report = classification_report(test_targets, test_predictions, digits=4)
            self.logger.info(f"######### TEST REPORT #EP{epoch} #########")
            self.logger.info(f"Loss {test_loss:.4f}")
            self.logger.info(f"\n{test_report}")
            test_acc = accuracy_score(test_targets, test_predictions)
            self.summary_writer.add_scalar("cola/test/loss", test_loss, self.global_step)
            self.summary_writer.add_scalar("cola/test/accuracy", test_acc, self.global_step)

            # output_path = os.path.join(self.config.checkpoint_dir, f"model-epoch-{epoch}.pth")
            # torch.save(self.model.state_dict(), output_path)
            # self.logger.info(f"MODEL IS SAVED AT {output_path}\n")

    def _train_step(self, input_token_ids, attention_mask, token_type_ids, labels):
        self.optimizer.zero_grad()
        outputs = self.model(input_token_ids, attention_mask, token_type_ids)
        loss = self.criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

        if self.config.use_tpu:
            # optimizer step for TPU (Note: Cloud TPU-specific code!)
            import torch_xla.core.xla_model as xm  # for using tpu
            xm.optimizer_step(self.optimizer, barrier=True)
        else:
            self.optimizer.step()

        self.scheduler.step()
        return loss.item(), outputs

    def _validation(self, data_loader):
        self.model.eval()
        running_loss = 0.0
        targets = []
        predictions = []

        with torch.no_grad():
            for data in data_loader:
                input_token_ids = data[0].to(self.device)
                attention_mask = data[1].to(self.device)
                token_type_ids = data[2].to(self.device)
                labels = data[3].to(self.device)

                outputs = self.model(input_token_ids, attention_mask, token_type_ids)
                loss = self.criterion(outputs, labels)
                running_loss += loss.item()
                targets.extend(labels.tolist())
                predictions.extend(outputs.argmax(-1).tolist())

        assert len(targets) == len(predictions)
        mean_loss = running_loss / len(data_loader)
        return mean_loss, targets, predictions
```
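For completeness, this is roughly how the trainer gets wired up for a run; the `TrainConfig` field names mirror the attributes the class reads, but the constructor signatures, the dummy data, and the concrete values here are assumptions for illustration only:

```python
# Hypothetical wiring of the Trainer (smoke-test style, not the real data pipeline).
import logging

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter

from tasks.cola.config import TrainConfig
from tasks.cola.model import COLAModel

config = TrainConfig(
    use_tpu=False,            # True for the TPU run
    batch_size=32,
    learning_rate=2e-5,
    dropout_prob=0.1,
    num_epochs=3,
    warmup_step_ratio=0.1,
    logging_interval=50,
)

def dummy_loader(n=64, seq_len=16):
    # (input_ids, attention_mask, token_type_ids, labels) -- the 4-tuple the loop expects
    ds = TensorDataset(
        torch.randint(0, 30522, (n, seq_len)),
        torch.ones(n, seq_len, dtype=torch.long),
        torch.zeros(n, seq_len, dtype=torch.long),
        torch.randint(0, 2, (n,)),
    )
    return DataLoader(ds, batch_size=config.batch_size, shuffle=True)

trainer = Trainer(
    config=config,
    model=COLAModel(config),                      # assumed constructor signature
    train_data_loader=dummy_loader(),
    dev_data_loader=dummy_loader(),
    test_data_loader=dummy_loader(),
    logger=logging.getLogger("cola"),
    summary_writer=SummaryWriter("runs/cola"),
)
trainer.train()
```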
Is there any way to solve this problem?