a = actor.choose_action(s)
s_new, r, done, info = env.step(a)
s_new = s_new.astype(np.float32)
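# note: casting the observation to float32 assumes the actor/critic networks expect float32 inputs
# CartPole gives +1 per step; when the episode terminates early, the reward is replaced below with a
# large penalty so the critic learns to avoid terminal states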
if done: r = -20
# these may be helpful in some tasks:
# if abs(s_new[0]) >= env.observation_space.high[0]:
#     # the cart has moved more than 2.4 units from the center
#     r = -20
# penalize the cart's distance from the center
# r -= abs(s_new[0]) * .1
all_r.append(r)  # keep per-step rewards; they are summed at the end of the episode
td_error = critic.learn(s, r, s_new)  # learn the value function: gradient = grad[r + gamma * V(s_new) - V(s)]
actor.learn(s, a, td_error)  # learn the policy: true_gradient = grad[log(Pi(s, a)) * td_error]
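# assumed interfaces of the Actor/Critic helpers (not defined in this snippet):
#   actor.choose_action(s)      -> samples an action from pi(a|s)
#   critic.learn(s, r, s_new)   -> one TD(0) update of V(s); returns td_error = r + gamma * V(s_new) - V(s)
#   actor.learn(s, a, td_error) -> policy-gradient step on log pi(a|s), weighted by td_error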
s = s_new
t += 1
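# end-of-episode bookkeeping: log the return and check the early-stopping condition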
if done or t >= MAX_EP_STEPS:
    ep_rs_sum = sum(all_r)
    if "running_reward" not in globals():
        running_reward = ep_rs_sum
    else:
        running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
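    # running_reward is an exponential moving average of episode returns (5% weight on the
    # newest episode); it is used only for logging and for the render threshold below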
    # start rendering if running_reward is greater than a threshold
    # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True
    print("Episode: %d reward: %f running_reward %f took: %.5f" % \
        (i_episode, ep_rs_sum, running_reward, time.time() - episode_time))
    # Early stopping for a quick check
    if t >= MAX_EP_STEPS:
        print("Early Stopping")
        s = env.reset().astype(np.float32)
        rall = 0
        while True:
            env.render()
            # a = actor.choose_action(s)