2353e2106d704919282490022702f13503becf3c,ch15/04_train_ppo.py,,,#,56
Before Change
loss_value_v.backward()
opt_crt.step()

# single actor update per batch: old-policy log-probs are just the detached current ones
opt_act.zero_grad()
mu_v, var_v = net_act(states_v)
adv_v = vals_ref_v.unsqueeze(dim=-1) - value_v.detach()
logprob_pi_v = calc_logprob(mu_v, var_v, actions_v)
logprob_old_pi_v = logprob_pi_v.detach()
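Both snippets call a calc_logprob helper that is not part of this record. A minimal sketch of what such a helper could look like, assuming it returns the log-density of the diagonal Gaussian policy parameterized by mu_v and var_v (the 1e-3 variance floor is an illustrative assumption, not taken from the commit):

import math
import torch

def calc_logprob(mu_v, var_v, actions_v):
    # log-density of a diagonal Gaussian: -(a - mu)^2 / (2*var) - log(sqrt(2*pi*var))
    # the variance floor guards against division by (near-)zero
    p1 = -((mu_v - actions_v) ** 2) / (2 * var_v.clamp(min=1e-3))
    p2 = -torch.log(torch.sqrt(2 * math.pi * var_v))
    return p1 + p2

Detaching the result, as both versions do for logprob_old_pi_v, freezes the reference policy so that gradients of the ratio flow only through the current policy.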
After Change
opt_crt.step()

adv_v = vals_ref_v.unsqueeze(dim=-1) - value_v.detach()
logprob_old_pi_v = None
sum_loss_policy = 0.0
# several actor updates over the same batch (PPO-style epochs)
for _ in range(PPO_EPOCHES):
    opt_act.zero_grad()
    mu_v, var_v = net_act(states_v)
    logprob_pi_v = calc_logprob(mu_v, var_v, actions_v)
    if logprob_old_pi_v is None:
        # log-probs from the first pass serve as the fixed "old policy" reference
        logprob_old_pi_v = logprob_pi_v.detach()
    surr_obj_v = adv_v * torch.exp(logprob_pi_v - logprob_old_pi_v)
    # clipped_surr_v = torch.clamp(surr_obj_v, 1.0 - PPO_EPS, 1.0 + PPO_EPS)
    # loss_policy_v = -torch.min(surr_obj_v, clipped_surr_v).mean()
    loss_policy_v = -surr_obj_v.mean()
    loss_policy_v.backward()
    sum_loss_policy += loss_policy_v.data.cpu().numpy()[0]
    opt_act.step()

# entropy_loss_v = ENTROPY_BETA * (-(torch.log(2*math.pi*var_v) + 1)/2).mean()
# loss_v = loss_policy_v + entropy_loss_v
# loss_v.backward()
# optimizer.step()

tb_tracker.track("advantage", adv_v, step_idx)
tb_tracker.track("values", value_v, step_idx)
tb_tracker.track("batch_rewards", vals_ref_v, step_idx)
# tb_tracker.track("loss_entropy", entropy_loss_v, step_idx)
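The clipping lines are still commented out at this revision, so the actor loss is the plain unclipped surrogate. For reference, the standard PPO objective clips the probability ratio rather than the surrogate product; a minimal sketch under that convention, with ppo_eps standing in for PPO_EPS (the 0.2 default is an assumption), follows:

import torch

def ppo_policy_loss(adv_v, logprob_pi_v, logprob_old_pi_v, ppo_eps=0.2):
    # probability ratio between the current and the old policy
    ratio_v = torch.exp(logprob_pi_v - logprob_old_pi_v)
    surr_obj_v = adv_v * ratio_v
    # clip the ratio, not the product, so the sign of the advantage is preserved
    clipped_surr_v = adv_v * torch.clamp(ratio_v, 1.0 - ppo_eps, 1.0 + ppo_eps)
    # pessimistic bound: element-wise minimum of the two terms, negated for gradient descent
    return -torch.min(surr_obj_v, clipped_surr_v).mean()

Taking the element-wise minimum of the clipped and unclipped terms keeps each of the PPO_EPOCHES updates close to the policy that collected the batch.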
In pattern: SUPERPATTERN
Frequency: 3
Non-data size: 7
Instances
Project Name: PacktPublishing/Deep-Reinforcement-Learning-Hands-On
Commit Name: 2353e2106d704919282490022702f13503becf3c
Time: 2018-02-10
Author: max.lapan@gmail.com
File Name: ch15/04_train_ppo.py
Class Name:
Method Name:
Project Name: eriklindernoren/PyTorch-YOLOv3
Commit Name: 7fffa98b9166a03b4a53fb40202d97b09e8e9036
Time: 2018-05-29
Author: eriklindernoren@gmail.com
File Name: train.py
Class Name:
Method Name:
Project Name: hunkim/PyTorchZeroToAll
Commit Name: c4610ff26a01a0622bc11dcac0f0812f05c56e0c
Time: 2017-11-02
Author: hunkim@gmail.com
File Name: 12_4_name_classify.py
Class Name:
Method Name: train