"One of the variables needed for gradient computation has been modified by an inplace operation" when trying to run DPPO (PyTorch) on torch 1.8.1

#pytorch #reinforcement-learning

Question:

I am trying to run a PyTorch implementation of DPPO (https://github.com/alexis-jacq/Pytorch-DPPO), but I get the following error.

Environment: Windows 10, Python 3.7.6, torch 1.8.1+cu111, mujoco-py 1.50.1.68, gym 0.18.3

 Error:
UserWarning: Error detected in AddmmBackward. Traceback of forward call that caused the error:
File "", line 1, in
File "E:Aenvsgymlibmultiprocessingspawn.py", line 105, in spawn_main
exitcode = _main(fd)
File "E:Aenvsgymlibmultiprocessingspawn.py", line 118, in _main
return self._bootstrap()
File "E:Aenvsgymlibmultiprocessingprocess.py", line 297, in _bootstrap
self.run()
File "E:Aenvsgymlibmultiprocessingprocess.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "Pytorch-RLPytorch-DPPO-mastertrain.py", line 155, in train
mu_old, sigma_sq_old, v_pred_old = model_old(batch_states)
File "E:Aenvsgymlibsite-packagestorchnnmodulesmodule.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "Pytorch-DPPO-mastermodel.py", line 53, in forward
v1 = self.v(x3)
File "E:Aenvsgymlibsite-packagestorchnnmodulesmodule.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "E:Aenvsgymlibsite-packagestorchnnmoduleslinear.py", line 94, in forward
return F.linear(input, self.weight, self.bias)
File "E:Aenvsgymlibsite-packagestorchnnfunctional.py", line 1753, in linear
return torch._C._nn.linear(input, weight, bias)
(Triggered internally at ..torchcsrcautogradpython_anomaly_mode.cpp:104.)
allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
Process Process-4:
Traceback (most recent call last):
File "E:Aenvsgymlibmultiprocessingprocess.py", line 297, in _bootstrap
self.run()
File "E:Aenvsgymlibmultiprocessingprocess.py", line 99, in run
self._target(*self._args, **self.kwargs)
File "Pytorch-DPPO-mastertrain.py", line 197, in train
total_loss.backward(retain_graph=True)
File "E:Aenvsgymlibsite-packagestorchtensor.py", line 245, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "E:Aenvsgymlibsite-packagestorchautograd_init.py", line 147, in backward
allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [100, 1]], which is output 0 of TBackward, is at version 3; expected version 2 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
 

Here is where the model is defined:

 class Model(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(Model, self).__init__()
        h_size_1 = 100
        # h_size_1 = 50
        h_size_2 = 100
        # h_size_2 = 50

        self.p_fc1 = nn.Linear(num_inputs, h_size_1)
        self.p_fc2 = nn.Linear(h_size_1, h_size_2)

        self.v_fc1 = nn.Linear(num_inputs, h_size_1*5)
        self.v_fc2 = nn.Linear(h_size_1*5, h_size_2)

        self.mu = nn.Linear(h_size_2, num_outputs) #100-17
        self.log_std = nn.Parameter(torch.zeros(1, num_outputs))
        self.v = nn.Linear(h_size_2,1)

        for name, p in self.named_parameters():
            # init parameters
            if 'bias' in name:
                # examtensor = p.data.clone().detach()
                # examtensor.fill_(0)
                p.data.fill_(0)
                # p.data = examtensor
            '''
            if 'mu.weight' in name:
                p.data.normal_()
                p.data /= torch.sum(p.data**2,0).expand_as(p.data)'''
        # mode
        self.train()

    def forward(self, inputs):
        # actor  
        # x = F.tanh(self.p_fc1(inputs))
        x = torch.tanh(self.p_fc1(inputs))
        # x = F.tanh(self.p_fc2(x))
        x1 = torch.tanh(self.p_fc2(x))
        mu = self.mu(x1)
        sigma_sq = torch.exp(self.log_std)
        # critic
        # x = F.tanh(self.v_fc1(inputs))
        x2 = torch.tanh(self.v_fc1(inputs))
        # x = F.tanh(self.v_fc2(x))
        x3 = torch.tanh(self.v_fc2(x2))
        v1 = self.v(x3)
        return mu, sigma_sq, v1
 

And here is the training function where the model is used:

 def train(rank, params, traffic_light, counter, shared_model, shared_grad_buffers, shared_obs_stats, test_n):
    torch.manual_seed(params.seed)
    torch.autograd.set_detect_anomaly(True)
    env = gym.make(params.env_name)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]
    print('outputnum:',num_outputs)
    model = Model(num_inputs, num_outputs)

    memory = ReplayMemory(params.exploration_size)

    state = env.reset()
    state = Variable(torch.Tensor(state).unsqueeze(0))
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        model.load_state_dict(shared_model.state_dict())

        w = -1
        av_reward = 0
        nb_runs = 0
        reward_0 = 0
        t = -1
        while w < params.exploration_size:
            t += 1
            states = []
            actions = []
            rewards = []
            values = []
            returns = []
            advantages = []
            av_reward = 0
            cum_reward = 0
            cum_done = 0
            # count = 1
            # Perform K steps
            for step in range(params.num_steps):
                w += 1
                shared_obs_stats.observes(state)
                state = shared_obs_stats.normalize(state)
                states.append(state)
                mu, sigma_sq, v = model(state)
                eps = torch.randn(mu.size())
                action = (mu + sigma_sq.sqrt()*Variable(eps))
                actions.append(action)
                values.append(v)
                # env_action = action.data.squeeze().numpy()
                env_action = action.data.numpy()
                # print(type(env_action))
                # print(env_action)
                
                # print(count)
                state, reward, done, _ = env.step(env_action)
                # count += 1

                done = (done or episode_length >= params.max_episode_length)
                # cum_reward += reward
                cum_reward = cum_reward + reward
                reward = max(min(reward, 1), -1)
                rewards.append(reward)
                if done:
                    cum_done += 1
                    # av_reward += cum_reward
                    av_reward = av_reward + cum_reward
                    cum_reward = 0
                    episode_length = 0
                    state = env.reset()
                state = Variable(torch.Tensor(state).unsqueeze(0))
                if done:
                    break

            # one last step
            R = torch.zeros(1, 1)
            if not done:
                _,_,v = model(state)
                R = v.data
            # compute returns and GAE(lambda) advantages:
            values.append(Variable(R))
            R = Variable(R)
            A = Variable(torch.zeros(1, 1))
            for i in reversed(range(len(rewards))):
                td = rewards[i] + params.gamma*values[i+1].data[0,0] - values[i].data[0,0]
                A = float(td) + params.gamma*params.gae_param*A
                advantages.insert(0, A)
                R = A + values[i]
                returns.insert(0, R)
            # store useful info:
            memory.push([states, actions, returns, advantages])

        # policy grad updates:
        av_reward /= float(cum_done+1)
        model_old = Model(num_inputs, num_outputs)
        model_old.load_state_dict(model.state_dict())
        if t==0:
            reward_0 = av_reward-(1e-2)

        for k in range(params.num_epoch):
            # load new model
            model.load_state_dict(shared_model.state_dict())
            model.zero_grad()
            # get initial signal
            signal_init = traffic_light.get()
            # new mini_batch
            batch_states, batch_actions, batch_returns, batch_advantages = memory.sample(params.batch_size)
            # old probas

            mu_old, sigma_sq_old, v_pred_old = model_old(batch_states)
            probs_old = normal(batch_actions, mu_old, sigma_sq_old)

            # new probas
            mu, sigma_sq, v_pred = model(batch_states.detach())
            probs = normal(batch_actions, mu, sigma_sq)
            # ratio
            ratio = probs/(1e-10+probs_old)
            # clip loss
            surr1 = ratio * torch.cat([batch_advantages]*num_outputs,1) # surrogate from conservative policy iteration
            surr2 = ratio.clamp(1-params.clip, 1+params.clip) * torch.cat([batch_advantages.detach()]*num_outputs,1)

            loss_clip = -torch.mean(torch.min(surr1, surr2))
            # value loss
            vfloss1 = (v_pred - batch_returns)**2
            v_pred_clipped = v_pred_old + (v_pred - v_pred_old).clamp(-params.clip, params.clip)
            vfloss2 = (v_pred_clipped - batch_returns)**2

            loss_value = 0.5*torch.mean(torch.max(vfloss1, vfloss2))
            # entropy
            loss_ent = -params.ent_coeff*torch.mean(probs*torch.log(probs+1e-5))
            # total
            total_loss = (loss_clip + loss_value + loss_ent)
            #print(total_loss.data[0])
            # before step, update old_model:
            model_old.load_state_dict(model.state_dict())
            # prepare for step
            total_loss.backward(retain_graph=True)
            #ensure_shared_grads(model, shared_model)
            shared_grad_buffers.add_gradient(model)

            counter.increment()

            # wait for a new signal to continue
            while traffic_light.get() == signal_init:
                pass

        test_n += 1
        memory.clear()
 

I googled it and some say it is caused by an in-place op, but I can't seem to find any. I haven't tried downgrading the torch version yet, but is there a solution that doesn't require a downgrade?
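For what it's worth, here is a minimal, self-contained sketch (not the repo's code) of how this class of error can arise: a Linear layer's weight is saved by autograd during the forward pass and is then overwritten in place (for example by load_state_dict) before backward() runs. The training loop above calls model_old.load_state_dict(model.state_dict()) between the model_old forward pass and total_loss.backward(), which follows the same pattern.

 import torch
 import torch.nn as nn

 # Hypothetical toy modules, only to illustrate the error class.
 net_old = nn.Linear(3, 1)
 net_new = nn.Linear(3, 1)

 x = torch.randn(5, 3, requires_grad=True)
 out = net_old(x)                                # autograd saves net_old's weight for backward
 loss = out.sum()
 net_old.load_state_dict(net_new.state_dict())   # copies into net_old.weight in place
 loss.backward()   # RuntimeError: ... modified by an inplace operation (weight is at a newer version)
 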

Until I can get this solved, I am using another PyTorch implementation of DPPO (https://github.com/TianhongDai/distributed-ppo).

It also has a few places you need to change to make it work if you have the same environment as me. Change env_name from Walker2d-v1 to Walker2d-v2 in arguments.py:

 parse.add_argument('--env_name', default='Walker2d-v2', help='environments name')
 

In dppo_agent.py, change

 action = dist.Beta(action_alpha, action_beta).analytic_mean()
 

to

 action = dist.Beta(action_alpha, action_beta).mean
 

In the function select_actions(self, alpha, beta), change

 actions = dist.beta(alpha, beta)
 

to

 actions = dist.Beta(alpha, beta)
 

and change

 actions_cpu = actions.data.cpu().numpy()[0]
 

to

 actions_cpu = actions.sample().data.cpu().numpy()[0]
 

and change

 return critic_loss.data.cpu().numpy()[0], actor_loss.data.cpu().numpy()[0]
 

to

 return critic_loss.data.cpu().numpy(), actor_loss.data.cpu().numpy()
 

At least, these are the changes I had to make.
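For context, here is a minimal sketch of the torch.distributions.Beta API that these edits rely on (recent torch versions; the concentration values below are just illustrative, in the repo they come from the policy network):

 import torch
 from torch import distributions as dist

 alpha = torch.tensor([2.0, 3.0])   # concentration1
 beta = torch.tensor([5.0, 1.5])    # concentration0
 d = dist.Beta(alpha, beta)

 mean_action = d.mean     # .mean is a property now, not an analytic_mean() method
 sampled = d.sample()     # the Distribution object is not a tensor; sample() returns one
 print(mean_action, sampled)
 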

Comments:

1. Did you train it with multiprocessing? That can be a problem: the model gets changed in one process while another process finds it has been modified.

2. I can't run this repo at all because of the mujoco dependency.

3. @NatthaphonHongcharoen I changed it to only 1 process, but I still got the same error.

4. @NatthaphonHongcharoen I am running this on Win 10. I struggled to build the mujoco env, but eventually got it working; my problem was that it kept saying "error: Microsoft Visual C++ 14.0 is required", and I couldn't fix it through the Visual Studio installer, so I found an offline download package for VS2015 and that solved it.

Answer #1:

Can you try changing the in-place operation from

 av_reward /= float(cum_done+1)
 

to

 av_reward = av_reward / float(cum_done+1)
 

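For reference, a self-contained sketch (illustrative tensors, not the repo's variables) of why the out-of-place form can matter when the left-hand side is a tensor that autograd has saved for backward:

 import torch

 x = torch.ones(3, requires_grad=True)
 y = torch.exp(x)          # exp's backward reuses its output, so autograd saves y
 loss = y.sum()

 # In-place form (analogue of "av_reward /= ..."): would bump y's version counter,
 # and loss.backward() would then fail with the "modified by an inplace operation" error:
 #   y /= 4
 # Out-of-place form (analogue of the suggestion above): the saved y stays untouched:
 y_scaled = y / 4

 loss.backward()
 print(x.grad)             # tensor([2.7183, 2.7183, 2.7183]) since d/dx exp(x) = exp(x)
 print(y_scaled)           # exp(1)/4, computed without modifying y
 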
Comments:

1. I found this operation yesterday and changed it, but I still got the same error.