#pytorch #reinforcement-learning
Question:
I am trying to run a PyTorch implementation of DPPO (https://github.com/alexis-jacq/Pytorch-DPPO), but I get the following error.
Environment: Windows 10, Python 3.7.6, torch 1.8.1+cu111, mujoco-py 1.50.1.68, gym 0.18.3
Error:
UserWarning: Error detected in AddmmBackward. Traceback of forward call that caused the error:
  File "<string>", line 1, in <module>
  File "E:\A\envs\gym\lib\multiprocessing\spawn.py", line 105, in spawn_main
    exitcode = _main(fd)
  File "E:\A\envs\gym\lib\multiprocessing\spawn.py", line 118, in _main
    return self._bootstrap()
  File "E:\A\envs\gym\lib\multiprocessing\process.py", line 297, in _bootstrap
    self.run()
  File "E:\A\envs\gym\lib\multiprocessing\process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "Pytorch-RL\Pytorch-DPPO-master\train.py", line 155, in train
    mu_old, sigma_sq_old, v_pred_old = model_old(batch_states)
  File "E:\A\envs\gym\lib\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "Pytorch-DPPO-master\model.py", line 53, in forward
    v1 = self.v(x3)
  File "E:\A\envs\gym\lib\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "E:\A\envs\gym\lib\site-packages\torch\nn\modules\linear.py", line 94, in forward
    return F.linear(input, self.weight, self.bias)
  File "E:\A\envs\gym\lib\site-packages\torch\nn\functional.py", line 1753, in linear
    return torch._C._nn.linear(input, weight, bias)
(Triggered internally at ..\torch\csrc\autograd\python_anomaly_mode.cpp:104.)
  allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
Process Process-4:
Traceback (most recent call last):
  File "E:\A\envs\gym\lib\multiprocessing\process.py", line 297, in _bootstrap
    self.run()
  File "E:\A\envs\gym\lib\multiprocessing\process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "Pytorch-DPPO-master\train.py", line 197, in train
    total_loss.backward(retain_graph=True)
  File "E:\A\envs\gym\lib\site-packages\torch\tensor.py", line 245, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "E:\A\envs\gym\lib\site-packages\torch\autograd\__init__.py", line 147, in backward
    allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [100, 1]], which is output 0 of TBackward, is at version 3; expected version 2 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
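(For reference, this class of error can be reproduced in isolation. The minimal sketch below is unrelated to the DPPO code and only illustrates what autograd is complaining about: a tensor that backward() still needs has been modified in place, so its saved version number no longer matches.)
import torch

x = torch.randn(1, 3)
w = torch.randn(3, 3, requires_grad=True)
h = x.matmul(w)      # intermediate activation
y = (h * h).sum()    # h is saved for the backward of the multiplication
h += 1               # in-place op bumps h's version counter
y.backward()         # RuntimeError: ... modified by an inplace operation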
Here is where the model is defined:
class Model(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(Model, self).__init__()
        h_size_1 = 100
        # h_size_1 = 50
        h_size_2 = 100
        # h_size_2 = 50
        self.p_fc1 = nn.Linear(num_inputs, h_size_1)
        self.p_fc2 = nn.Linear(h_size_1, h_size_2)
        self.v_fc1 = nn.Linear(num_inputs, h_size_1*5)
        self.v_fc2 = nn.Linear(h_size_1*5, h_size_2)
        self.mu = nn.Linear(h_size_2, num_outputs)  # 100-17
        self.log_std = nn.Parameter(torch.zeros(1, num_outputs))
        self.v = nn.Linear(h_size_2, 1)
        for name, p in self.named_parameters():
            # init parameters
            if 'bias' in name:
                # examtensor = p.data.clone().detach()
                # examtensor.fill_(0)
                p.data.fill_(0)
                # p.data = examtensor
            '''
            if 'mu.weight' in name:
                p.data.normal_()
                p.data /= torch.sum(p.data**2,0).expand_as(p.data)'''
        # mode
        self.train()

    def forward(self, inputs):
        # actor
        # x = F.tanh(self.p_fc1(inputs))
        x = torch.tanh(self.p_fc1(inputs))
        # x = F.tanh(self.p_fc2(x))
        x1 = torch.tanh(self.p_fc2(x))
        mu = self.mu(x1)
        sigma_sq = torch.exp(self.log_std)
        # critic
        # x = F.tanh(self.v_fc1(inputs))
        x2 = torch.tanh(self.v_fc1(inputs))
        # x = F.tanh(self.v_fc2(x))
        x3 = torch.tanh(self.v_fc2(x2))
        v1 = self.v(x3)
        return mu, sigma_sq, v1
And here is the training function where the model is used:
def train(rank, params, traffic_light, counter, shared_model, shared_grad_buffers, shared_obs_stats, test_n):
    torch.manual_seed(params.seed)
    torch.autograd.set_detect_anomaly(True)
    env = gym.make(params.env_name)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]
    print('outputnum:', num_outputs)
    model = Model(num_inputs, num_outputs)
    memory = ReplayMemory(params.exploration_size)
    state = env.reset()
    state = Variable(torch.Tensor(state).unsqueeze(0))
    done = True
    episode_length = 0
    while True:
        episode_length += 1
        model.load_state_dict(shared_model.state_dict())
        w = -1
        av_reward = 0
        nb_runs = 0
        reward_0 = 0
        t = -1
        while w < params.exploration_size:
            t += 1
            states = []
            actions = []
            rewards = []
            values = []
            returns = []
            advantages = []
            av_reward = 0
            cum_reward = 0
            cum_done = 0
            # count = 1
            # Perform K steps
            for step in range(params.num_steps):
                w += 1
                shared_obs_stats.observes(state)
                state = shared_obs_stats.normalize(state)
                states.append(state)
                mu, sigma_sq, v = model(state)
                eps = torch.randn(mu.size())
                action = (mu + sigma_sq.sqrt()*Variable(eps))
                actions.append(action)
                values.append(v)
                # env_action = action.data.squeeze().numpy()
                env_action = action.data.numpy()
                # print(type(env_action))
                # print(env_action)
                # print(count)
                state, reward, done, _ = env.step(env_action)
                # count += 1
                done = (done or episode_length >= params.max_episode_length)
                # cum_reward += reward
                cum_reward = cum_reward + reward
                reward = max(min(reward, 1), -1)
                rewards.append(reward)
                if done:
                    cum_done += 1
                    # av_reward += cum_reward
                    av_reward = av_reward + cum_reward
                    cum_reward = 0
                    episode_length = 0
                    state = env.reset()
                state = Variable(torch.Tensor(state).unsqueeze(0))
                if done:
                    break
            # one last step
            R = torch.zeros(1, 1)
            if not done:
                _, _, v = model(state)
                R = v.data
            # compute returns and GAE(lambda) advantages:
            values.append(Variable(R))
            R = Variable(R)
            A = Variable(torch.zeros(1, 1))
            for i in reversed(range(len(rewards))):
                td = rewards[i] + params.gamma*values[i + 1].data[0, 0] - values[i].data[0, 0]
                A = float(td) + params.gamma*params.gae_param*A
                advantages.insert(0, A)
                R = A + values[i]
                returns.insert(0, R)
            # store useful info:
            memory.push([states, actions, returns, advantages])
        # policy grad updates:
        av_reward /= float(cum_done + 1)
        model_old = Model(num_inputs, num_outputs)
        model_old.load_state_dict(model.state_dict())
        if t == 0:
            reward_0 = av_reward - (1e-2)
        for k in range(params.num_epoch):
            # load new model
            model.load_state_dict(shared_model.state_dict())
            model.zero_grad()
            # get initial signal
            signal_init = traffic_light.get()
            # new mini_batch
            batch_states, batch_actions, batch_returns, batch_advantages = memory.sample(params.batch_size)
            # old probas
            mu_old, sigma_sq_old, v_pred_old = model_old(batch_states)
            probs_old = normal(batch_actions, mu_old, sigma_sq_old)
            # new probas
            mu, sigma_sq, v_pred = model(batch_states.detach())
            probs = normal(batch_actions, mu, sigma_sq)
            # ratio
            ratio = probs/(1e-10 + probs_old)
            # clip loss
            surr1 = ratio * torch.cat([batch_advantages]*num_outputs, 1)  # surrogate from conservative policy iteration
            surr2 = ratio.clamp(1-params.clip, 1+params.clip) * torch.cat([batch_advantages.detach()]*num_outputs, 1)
            loss_clip = -torch.mean(torch.min(surr1, surr2))
            # value loss
            vfloss1 = (v_pred - batch_returns)**2
            v_pred_clipped = v_pred_old + (v_pred - v_pred_old).clamp(-params.clip, params.clip)
            vfloss2 = (v_pred_clipped - batch_returns)**2
            loss_value = 0.5*torch.mean(torch.max(vfloss1, vfloss2))
            # entropy
            loss_ent = -params.ent_coeff*torch.mean(probs*torch.log(probs + 1e-5))
            # total
            total_loss = (loss_clip + loss_value + loss_ent)
            # print(total_loss.data[0])
            # before step, update old_model:
            model_old.load_state_dict(model.state_dict())
            # prepare for step
            total_loss.backward(retain_graph=True)
            # ensure_shared_grads(model, shared_model)
            shared_grad_buffers.add_gradient(model)
            counter.increment()
            # wait for a new signal to continue
            while traffic_light.get() == signal_init:
                pass
        test_n += 1
        memory.clear()
I googled, and some say it is caused by an in-place op, but I can't seem to find any. I haven't tried downgrading the torch version; is there any solution that doesn't require a downgrade?
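For what it's worth, one pattern that produces exactly this error is overwriting a network's parameters after its output has been used to build the loss but before backward() runs: load_state_dict copies into the existing parameter tensors, so it counts as an in-place op. A generic sketch of that pattern (two stand-in Linear modules, not the repo code):
import torch
import torch.nn as nn

net_old = nn.Linear(4, 1)   # stands in for model_old
net_new = nn.Linear(4, 1)   # stands in for model
x = torch.randn(8, 4, requires_grad=True)

# net_old's weight is saved by autograd for the backward of the linear layer
loss = (net_new(x) - net_old(x)).pow(2).mean()
# load_state_dict copies into net_old's existing weight/bias tensors (in place),
# bumping their version counters
net_old.load_state_dict(net_new.state_dict())
loss.backward()  # RuntimeError: ... modified by an inplace operation
In the training loop above, model_old.load_state_dict(model.state_dict()) does run between model_old(batch_states) and total_loss.backward(), which looks like the same pattern, but I have not confirmed that this is the line autograd is pointing at.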
Until I can get this solved, I am using another PyTorch approach for DPPO (https://github.com/TianhongDai/distributed-ppo).
It also has a few places you need to change to make it work if you have the same env as me. Change env_name from Walker2d-v1 to Walker2d-v2 in arguments.py:
parse.add_argument('--env_name', default='Walker2d-v2', help='environments name')
In dppo_agent.py, change
action = dist.Beta(action_alpha, action_beta).analytic_mean()
to
action = dist.Beta(action_alpha, action_beta).mean
In the function select_actions(self, alpha, beta), change
actions = dist.beta(alpha, beta)
to
actions = dist.Beta(alpha, beta)
and
actions_cpu = actions.data.cpu().numpy()[0]
to
actions_cpu = actions.sample().data.cpu().numpy()[0]
Also change
return critic_loss.data.cpu().numpy()[0], actor_loss.data.cpu().numpy()[0]
to
return critic_loss.data.cpu().numpy(), actor_loss.data.cpu().numpy()
At least, these are the changes I needed to make; the sketch below shows how the select_actions part ends up.
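For reference, after those edits the sampling part ends up looking roughly like this (a sketch only, written from the lines above; it assumes dist is torch.distributions, as the snippets suggest, and the real select_actions in dppo_agent.py is a method of the agent class with more code around it):
import torch
import torch.distributions as dist

def select_actions(alpha, beta):
    # torch.distributions.Beta is a class: build the distribution object first,
    # then call .sample() on it. There is no .analytic_mean() in current torch;
    # the mean is the .mean property.
    actions = dist.Beta(alpha, beta)
    actions_cpu = actions.sample().data.cpu().numpy()[0]
    return actions_cpu

# hypothetical usage with one state's alpha/beta parameters
alpha = torch.ones(1, 6) * 2.0
beta = torch.ones(1, 6) * 3.0
print(select_actions(alpha, beta))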
Comments:
1. Did you train it with multiprocessing? That can be a problem when the model has changed in one process and another process finds that it has changed.
2. I can't run this repo at all because of the mujoco dependency.
3. @NatthaphonHongcharoen I changed it to just 1 process, but I still got the same error.
4. @NatthaphonHongcharoen I'm running this on Win 10. I struggled to build the mujoco env, but I got it working in the end. My problem was that it kept saying "error: Microsoft Visual C++ 14.0 is required", and I couldn't fix it through the Visual Studio installer, so I found an offline download package for VS2015 and that fixed the problem.
Answer #1:
Can you try changing the in-place operation from
av_reward /= float(cum_done + 1)
to
av_reward = av_reward / float(cum_done + 1)
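The idea (a generic illustration, not specific to this code) is that on a tensor /= overwrites the existing storage and bumps the tensor's internal version counter, which is what autograd checks, while the reassignment creates a brand-new tensor. The _version attribute below is internal and is used here only to show the difference:
import torch

t = torch.ones(3)
print(t._version)  # 0
t /= 2             # in-place: the same tensor is overwritten, version goes to 1
print(t._version)  # 1
t = t / 2          # out-of-place: t now names a brand-new tensor, version 0 again
print(t._version)  # 0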
Comments:
1. I found this operation yesterday and changed it, but I still got the same error.