#python #tensorflow
Question:
I am following an a3c blog post to train an agent. It uses a neural network to optimize performance, but when it reaches the code below it throws an error. The error essentially says that the shape of the inputs is incompatible with the placeholders. I have tried many different shapes and also considered changing both the inputs and the placeholders, but I still get the error when the sess.run() part runs. How should I fix this?
InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder_21' with dtype float and shape [?,2]
[[node Placeholder_21 (defined at <ipython-input-462-3c1b764fbd4e>:3) ]]
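For comparison, here is a minimal sketch (not my actual code) of one way this error message can appear in TF 1.x: an op in the graph still depends on a placeholder that was created earlier, for example by a previously executed notebook cell, and that placeholder is missing from feed_dict. The names old_ph and new_ph below are only for illustration.
import tensorflow as tf
# Hypothetical illustration, not the code from the question.
old_ph = tf.placeholder('float32', [None, 2])   # created in an earlier cell; ends up as e.g. Placeholder_21
out_old = old_ph * 2.0                          # this op still depends on old_ph
new_ph = tf.placeholder('float32', [None, 2])   # re-running the cell creates a fresh placeholder
with tf.Session() as sess:
    # out_old needs old_ph, but only new_ph is fed, so TF raises:
    # InvalidArgumentError: You must feed a value for placeholder tensor ... with dtype float and shape [?,2]
    sess.run(out_old, feed_dict={new_ph: [[1.0, 2.0]]})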
Printing the inputs, which come in batches, I see:
print("State shape:", batch_states.shape)
print("Batch states:",batch_states)
print("Batch actions length:",len(batch_actions))
print("Batch Actions:", batch_actions)
print("Batch Rewards:", batch_rewards)
print("Batch Done:", batch_done)
print("Num actions:", n_actions)
State shape: (10, 2)
Batch states: [[1501.87201108 1501.87201108]
[1462.65450863 1462.65450863]
[1480.95616876 1480.95616876]
[1492.24380743 1492.24380743]
[1481.92809598 1481.92809598]
[1480.19257102 1480.19257102]
[1503.54571786 1503.54571786]
[1489.38563414 1489.38563414]
[1541.16797527 1541.16797527]
[1516.04036259 1516.04036259]]
Batch actions length: 10
Batch Actions: [[1. 0. 1. 0. 0.]
[1. 0. 1. 0. 0.]
[1. 0. 1. 0. 0.]
[1. 0. 1. 0. 0.]
[1. 0. 1. 0. 0.]
[1. 0. 1. 0. 0.]
[1. 0. 1. 0. 0.]
[1. 0. 1. 0. 0.]
[1. 0. 1. 0. 0.]
[1. 0. 1. 0. 0.]]
Batch Rewards: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Batch Done: [False False False False False False False False False False]
Num actions: 5
Here is the part of the code where I get the error:
states_ph = tf.placeholder('float32', [None,] + list(obs_shape))
next_states_ph = tf.placeholder('float32', [None,] + list(obs_shape))
actions_ph = tf.placeholder('int32', (None,n_actions))
rewards_ph = tf.placeholder('float32', (None,))
is_done_ph = tf.placeholder('float32', (None,))
# logits[n_envs, n_actions] and state_values[n_envs]
logits, state_values = agent.symbolic_step(states_ph)
next_logits, next_state_values = agent.symbolic_step(next_states_ph)
# There is no next state if the episode is done!
next_state_values = next_state_values * (1 - is_done_ph)
# probabilities and log-probabilities for all actions
probs = tf.nn.softmax(logits, axis=-1) # [n_envs, n_actions]
logprobs = tf.nn.log_softmax(logits, axis=-1) # [n_envs, n_actions]
# log-probabilities only for agent's chosen actions
logp_actions = tf.reduce_sum(logprobs * tf.one_hot(actions_ph, n_actions), axis=-1) # [n_envs,]
# Compute advantage using rewards_ph, state_values and next_state_values.
gamma = 0.99
advantage = rewards_ph + gamma * (next_state_values - state_values)
assert advantage.shape.ndims == 1, "please compute advantage for each sample, vector of shape [n_envs,]"
# Compute policy entropy given logits_seq. Mind the "-" sign!
entropy = - tf.reduce_sum(probs * logprobs, 1)
assert entropy.shape.ndims == 1, "please compute pointwise entropy vector of shape [n_envs,] "
# Compute target state values using the temporal difference formula. Use rewards_ph and next_state_values
target_state_values = rewards_ph + gamma * next_state_values
actor_loss = -tf.reduce_mean(logp_actions * tf.stop_gradient(advantage), axis=0) - 0.001 * tf.reduce_mean(entropy, axis=0)
critic_loss = tf.reduce_mean((state_values - tf.stop_gradient(target_state_values))**2, axis=0)
train_step = tf.train.AdamOptimizer(1e-4).minimize(actor_loss + critic_loss)
sess.run(tf.global_variables_initializer())
l_act, l_crit, adv, ent = sess.run([actor_loss, critic_loss, advantage, entropy], feed_dict = {
states_ph: batch_states,
actions_ph: batch_actions,
next_states_ph: batch_states,
rewards_ph: batch_rewards,
is_done_ph: batch_done,
})
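As a sanity check on my side, something like the sketch below (assuming the placeholders and batch_* arrays defined above) prints what each placeholder expects versus what I actually feed:
import numpy as np
# Compare each placeholder's static shape with the shape of the array fed into it.
feed = {
    states_ph: batch_states,
    actions_ph: batch_actions,
    next_states_ph: batch_states,
    rewards_ph: batch_rewards,
    is_done_ph: batch_done,
}
for ph, value in feed.items():
    print(ph.name, "expects", ph.shape.as_list(), "- fed", np.asarray(value).shape)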