def finish_episode():
    R = 0
    policy_loss = 0
    rewards = []
    # Walk the episode backwards to compute the discounted return at each step:
    # R_t = r_t + gamma * R_{t+1}
    for r in policy.rewards[::-1]:
        R = r + args.gamma * R
        rewards.insert(0, R)
    # Normalize returns to zero mean and unit variance to reduce gradient variance;
    # the small epsilon guards against division by zero.
    rewards = torch.tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    # REINFORCE loss: accumulate -log pi(a_t | s_t) weighted by the return at step t.
    for log_prob, r in zip(policy.saved_actions, rewards):
        policy_loss -= (log_prob * r).sum()
    # Single gradient step over the whole episode.
    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()
    # Clear the per-episode buffers for the next rollout.
    del policy.rewards[:]
    del policy.saved_actions[:]
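
# A minimal sketch of the episode loop that could drive finish_episode(),
# assuming a Gym-style environment `env` and a `select_action(state)` helper
# that appends log-probabilities to policy.saved_actions (both names are
# illustrative assumptions, not taken from this file):
#
#     for i_episode in range(num_episodes):
#         state = env.reset()
#         while True:
#             action = select_action(state)
#             state, reward, done, _ = env.step(action)
#             policy.rewards.append(reward)  # consumed by finish_episode()
#             if done:
#                 break
#         finish_episode()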