Intel OpenFL RuntimeError: mat1 and mat2 shapes cannot be multiplied (128x512 and 2048x4096)

#neural-network #runtime-error #intel #openfl #federated-learning

Question:

I am trying to run my notebook (which works fine on Google Colab and similar platforms) on Intel OpenFL, Intel's new federated learning framework. I am using MNIST with this transform:

    trf = transforms.Compose([
        transforms.Resize(32),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
    ])
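For reference, this is a minimal sketch of how the transform above is applied to MNIST in the Colab version (the dataset path and download flag are placeholders); each sample comes out as a 1x32x32 tensor because of Resize(32):

    from torchvision import datasets

    # Hypothetical check of what the trf defined above produces on raw 28x28 MNIST
    mnist_train = datasets.MNIST('./data', train=True, download=True, transform=trf)
    img, label = mnist_train[0]
    print(img.shape)  # torch.Size([1, 32, 32]) -- 28x28 digits resized to 32x32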

and this is my network:

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()

            # calculate same padding:
            # (w - k + 2*p)/s + 1 = o
            # => p = (s(o-1) - w + k)/2

            self.block_1 = nn.Sequential(
                # (1(32-1) - 32 + 3)/2 = 1
                nn.Conv2d(in_channels=1, out_channels=64, kernel_size=(3, 3), stride=(1, 1), padding=1),
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3, 3), stride=(1, 1), padding=1),
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
            )

            self.block_2 = nn.Sequential(
                nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3, 3), stride=(1, 1), padding=1),
                nn.BatchNorm2d(128),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(3, 3), stride=(1, 1), padding=1),
                nn.BatchNorm2d(128),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
            )

            self.block_3 = nn.Sequential(
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3, 3), stride=(1, 1), padding=1),
                nn.BatchNorm2d(256),
                nn.ReLU(),
                nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3), stride=(1, 1), padding=1),
                nn.BatchNorm2d(256),
                nn.ReLU(),
                nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3), stride=(1, 1), padding=1),
                nn.BatchNorm2d(256),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
            )

            self.block_4 = nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=512, kernel_size=(3, 3), stride=(1, 1), padding=1),
                nn.BatchNorm2d(512),
                nn.ReLU(),
                nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3), stride=(1, 1), padding=1),
                nn.BatchNorm2d(512),
                nn.ReLU(),
                nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3), stride=(1, 1), padding=1),
                nn.BatchNorm2d(512),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
            )

            self.classifier = nn.Sequential(
                nn.Linear(2048, 4096),
                nn.ReLU(True),
                nn.Dropout(p=0.65),
                nn.Linear(4096, 4096),
                nn.ReLU(True),
                nn.Dropout(p=0.65),
                nn.Linear(4096, classes)
            )

            for m in self.modules():
                if isinstance(m, torch.nn.Conv2d) or isinstance(m, torch.nn.Linear):
                    nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='leaky_relu')
                    # nn.init.xavier_normal_(m.weight)
                    if m.bias is not None:
                        m.bias.detach().zero_()

            # self.avgpool = nn.AdaptiveAvgPool2d((7, 7))

        def forward(self, x):
            x = self.block_1(x)
            x = self.block_2(x)
            x = self.block_3(x)
            x = self.block_4(x)
            # x = self.avgpool(x)
            x = x.view(x.size(0), -1)
            x = self.classifier(x)
            return x
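For context on the in_features=2048 of the first Linear layer: with 32x32 inputs each block halves the spatial size (32 -> 16 -> 8 -> 4 -> 2), so the flattened vector is 512 * 2 * 2 = 2048. A quick sanity check with a dummy batch (classes=10 is my assumption for MNIST; in the notebook it is set elsewhere):

    import torch

    classes = 10                       # assumption: 10 MNIST digit classes
    net = Net()
    dummy = torch.randn(4, 1, 32, 32)  # batch of 4 single-channel 32x32 images
    feats = net.block_4(net.block_3(net.block_2(net.block_1(dummy))))
    print(feats.shape)                 # torch.Size([4, 512, 2, 2]) -> 512*2*2 = 2048 flattened
    print(net(dummy).shape)            # torch.Size([4, 10])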

But I get this error:

    ---------------------------------------------------------------------------
    RuntimeError                              Traceback (most recent call last)
    /var/folders/pm/nj7yy3px76n6b62knyrn8r_40000gn/T/ipykernel_10337/666012611.py in <module>
    ----> 1 final_model = fx.run_experiment(collaborators,{'aggregator.settings.rounds_to_train':5})

    /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/openfl/native/native.py in run_experiment(collaborator_dict, override_config)
        282     for col in plan.authorized_cols:
        283         collaborator = collaborators[col]
    --> 284         collaborator.run_simulation()
        285
        286     # Set the weights for the final model

    /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/openfl/component/collaborator/collaborator.py in run_simulation(self)
        170             self.logger.info(f'Received the following tasks: {tasks}')
        171             for task in tasks:
    --> 172                 self.do_task(task, round_number)
        173             self.logger.info(f'All tasks completed on {self.collaborator_name} '
        174                              f'for round {round_number}...')

    /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/openfl/component/collaborator/collaborator.py in do_task(self, task, round_number)
        245             round_num=round_number,
        246             input_tensor_dict=input_tensor_dict,
    --> 247             **kwargs)
        248
        249         # Save global and local output_tensor_dicts to TensorDB

    /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/openfl/federated/task/runner_pt.py in validate(self, col_name, round_num, input_tensor_dict, use_tqdm, **kwargs)
        106             data, target = pt.tensor(data).to(self.device), pt.tensor(
        107                 target).to(self.device, dtype=pt.int64)
    --> 108             output = self(data)
        109             # get the index of the max log-probability
        110             pred = output.argmax(dim=1, keepdim=True)

    /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
        887             result = self._slow_forward(*input, **kwargs)
        888         else:
    --> 889             result = self.forward(*input, **kwargs)
        890         for hook in itertools.chain(
        891                 _global_forward_hooks.values(),

    /var/folders/pm/nj7yy3px76n6b62knyrn8r_40000gn/T/ipykernel_10337/3611293808.py in forward(self, x)
        125         # x = self.avgpool(x)
        126         x = x.view(x.size(0), -1)
    --> 127         x = self.classifier(x)
        128         return x

    /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
        887             result = self._slow_forward(*input, **kwargs)
        888         else:
    --> 889             result = self.forward(*input, **kwargs)
        890         for hook in itertools.chain(
        891                 _global_forward_hooks.values(),

    /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/torch/nn/modules/container.py in forward(self, input)
        117     def forward(self, input):
        118         for module in self:
    --> 119             input = module(input)
        120         return input
        121

    /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
        887             result = self._slow_forward(*input, **kwargs)
        888         else:
    --> 889             result = self.forward(*input, **kwargs)
        890         for hook in itertools.chain(
        891                 _global_forward_hooks.values(),

    /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/torch/nn/modules/linear.py in forward(self, input)
         92
         93     def forward(self, input: Tensor) -> Tensor:
    ---> 94         return F.linear(input, self.weight, self.bias)
         95
         96     def extra_repr(self) -> str:

    /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/torch/nn/functional.py in linear(input, weight, bias)
       1751     if has_torch_function_variadic(input, weight):
       1752         return handle_torch_function(linear, (input, weight), input, weight, bias=bias)
    -> 1753     return torch._C._nn.linear(input, weight, bias)
       1754
       1755

    RuntimeError: mat1 and mat2 shapes cannot be multiplied (128x512 and 2048x4096)

However, exactly the same network works fine on Google Colab, so I am probably missing something specific to OpenFL.
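A minimal way to narrow this down would be to print the flattened shape right before the classifier (a debugging sketch; the shape comments are my reading of the error message, not output I have from OpenFL):

    def forward(self, x):
        x = self.block_1(x)
        x = self.block_2(x)
        x = self.block_3(x)
        x = self.block_4(x)
        x = x.view(x.size(0), -1)
        # Debug print: in Colab this shows (batch, 2048) for 32x32 inputs.
        # The error above reports mat1 of 128x512, i.e. only 512 features per
        # sample (a 512x1x1 map), which is what this architecture produces for
        # 28x28 inputs that never went through Resize(32).
        print(x.shape)
        x = self.classifier(x)
        return x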