actions = np.expand_dims(actions, -1)
# get old policy probabilities and distribution
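# the target network holds the pre-update ("old") policy; the first predicted output is the value estimate,
# and the remaining outputs form the old policy distribution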
result = self.main_network.target_network.predict([current_states])
old_policy_distribution = result[1:]
# calculate gradients and apply them to both the local policy network and the global policy network
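# the policy head's KL divergence (new vs. old policy) and entropy are fetched as additional diagnostics
# during the gradient computation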
fetches = [self.main_network.online_network.output_heads[1].kl_divergence,
           self.main_network.online_network.output_heads[1].entropy]
total_return = np.expand_dims(total_return, -1)
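# use the GAE-based value targets when configured, otherwise fall back to the discounted total return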
value_targets = gae_based_value_targets if self.tp.agent.estimate_value_using_gae else total_return
total_loss, policy_losses, unclipped_grads, fetch_result = \
    self.main_network.online_network.accumulate_gradients(
        [current_states] + [actions] + old_policy_distribution,
        [total_return, advantages], additional_fetches=fetches)
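# record the value targets signal for monitoring/logging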
self.value_targets.add_sample(value_targets)