# #neural-network #runtime-error #intel #openfl #federated-learning
Вопрос:
Я пытаюсь запустить свой ноутбук (который отлично работает на Google colab или других подобных платформах) на Intel OpenFL, новой платформе для FL Intel. Я использую MNIST с этим преобразованием:
trf = transforms.Compose( [transforms.Resize(32), transforms.RandomHorizontalFlip(), transforms.ToTensor(), ])
и это моя сеть:
class Net(nn.Module): def __init__(self): super(Net, self).__init__() # calculate same padding: # (w - k 2*p)/s 1 = o # =gt; p = (s(o-1) - w k)/2 self.block_1 = nn.Sequential( nn.Conv2d(in_channels=1, out_channels=64, kernel_size=(3, 3), stride=(1, 1), # (1(32-1)- 32 3)/2 = 1 padding=1), nn.BatchNorm2d(64), nn.ReLU(), nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3, 3), stride=(1, 1), padding=1), nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)) ) self.block_2 = nn.Sequential( nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3, 3), stride=(1, 1), padding=1), nn.BatchNorm2d(128), nn.ReLU(), nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(3, 3), stride=(1, 1), padding=1), nn.BatchNorm2d(128), nn.ReLU(), nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)) ) self.block_3 = nn.Sequential( nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3, 3), stride=(1, 1), padding=1), nn.BatchNorm2d(256), nn.ReLU(), nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3), stride=(1, 1), padding=1), nn.BatchNorm2d(256), nn.ReLU(), nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3), stride=(1, 1), padding=1), nn.BatchNorm2d(256), nn.ReLU(), nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)) ) self.block_4 = nn.Sequential( nn.Conv2d(in_channels=256, out_channels=512, kernel_size=(3, 3), stride=(1, 1), padding=1), nn.BatchNorm2d(512), nn.ReLU(), nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3), stride=(1, 1), padding=1), nn.BatchNorm2d(512), nn.ReLU(), nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3), stride=(1, 1), padding=1), nn.BatchNorm2d(512), nn.ReLU(), nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)) ) self.classifier = nn.Sequential( nn.Linear(2048, 4096), nn.ReLU(True), nn.Dropout(p=0.65), nn.Linear(4096, 4096), nn.ReLU(True), nn.Dropout(p=0.65), nn.Linear(4096, classes) ) for m in self.modules(): if isinstance(m, torch.nn.Conv2d) or isinstance(m, torch.nn.Linear): nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='leaky_relu') # nn.init.xavier_normal_(m.weight) if m.bias is not None: m.bias.detach().zero_() # self.avgpool = nn.AdaptiveAvgPool2d((7, 7)) def forward(self, x): x = self.block_1(x) x = self.block_2(x) x = self.block_3(x) x = self.block_4(x) # x = self.avgpool(x) x = x.view(x.size(0), -1) x = self.classifier(x) return x
Но у меня есть эта ошибка:
--------------------------------------------------------------------------- RuntimeError Traceback (most recent call last) /var/folders/pm/nj7yy3px76n6b62knyrn8r_40000gn/T/ipykernel_10337/666012611.py in lt;modulegt; ----gt; 1 final_model = fx.run_experiment(collaborators,{'aggregator.settings.rounds_to_train':5}) /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/openfl/native/native.py in run_experiment(collaborator_dict, override_config) 282 for col in plan.authorized_cols: 283 collaborator = collaborators[col] --gt; 284 collaborator.run_simulation() 285 286 # Set the weights for the final model /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/openfl/component/collaborator/collaborator.py in run_simulation(self) 170 self.logger.info(f'Received the following tasks: {tasks}') 171 for task in tasks: --gt; 172 self.do_task(task, round_number) 173 self.logger.info(f'All tasks completed on {self.collaborator_name} ' 174 f'for round {round_number}...') /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/openfl/component/collaborator/collaborator.py in do_task(self, task, round_number) 245 round_num=round_number, 246 input_tensor_dict=input_tensor_dict, --gt; 247 **kwargs) 248 249 # Save global and local output_tensor_dicts to TensorDB /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/openfl/federated/task/runner_pt.py in validate(self, col_name, round_num, input_tensor_dict, use_tqdm, **kwargs) 106 data, target = pt.tensor(data).to(self.device), pt.tensor( 107 target).to(self.device, dtype=pt.int64) --gt; 108 output = self(data) 109 # get the index of the max log-probability 110 pred = output.argmax(dim=1, keepdim=True) /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs) 887 result = self._slow_forward(*input, **kwargs) 888 else: --gt; 889 result = self.forward(*input, **kwargs) 890 for hook in itertools.chain( 891 _global_forward_hooks.values(), /var/folders/pm/nj7yy3px76n6b62knyrn8r_40000gn/T/ipykernel_10337/3611293808.py in forward(self, x) 125 # x = self.avgpool(x) 126 x = x.view(x.size(0), -1) --gt; 127 x = self.classifier(x) 128 return x /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs) 887 result = self._slow_forward(*input, **kwargs) 888 else: --gt; 889 result = self.forward(*input, **kwargs) 890 for hook in itertools.chain( 891 _global_forward_hooks.values(), /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/torch/nn/modules/container.py in forward(self, input) 117 def forward(self, input): 118 for module in self: --gt; 119 input = module(input) 120 return input 121 /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs) 887 result = self._slow_forward(*input, **kwargs) 888 else: --gt; 889 result = self.forward(*input, **kwargs) 890 for hook in itertools.chain( 891 _global_forward_hooks.values(), /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/torch/nn/modules/linear.py in forward(self, input) 92 93 def forward(self, input: Tensor) -gt; Tensor: ---gt; 94 return F.linear(input, self.weight, self.bias) 95 96 def extra_repr(self) -gt; str: /opt/anaconda3/envs/my_env/lib/python3.7/site-packages/torch/nn/functional.py in linear(input, weight, bias) 1751 if has_torch_function_variadic(input, weight): 1752 return handle_torch_function(linear, (input, weight), input, weight, bias=bias) -gt; 1753 return torch._C._nn.linear(input, weight, bias) 1754 1755 RuntimeError: mat1 and mat2 shapes cannot be multiplied (128x512 and 2048x4096)
Однако точно такая же сеть хорошо работает в Google Colab. Вероятно, я что-то упускаю из виду в OpenFL.