t0 = time.time()
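# A minimal sketch of the `to_one_hot` helper used in the loop below, in case
# it is not already defined earlier in the script (the signature is assumed
# from its call sites: it encodes a discrete state index as a one-hot vector):
def to_one_hot(i, n_classes):
    v = np.zeros(n_classes, dtype=np.float32)
    v[i] = 1.0
    return v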
if args.train:
    all_episode_reward = []
    for i in range(num_episodes):
        ## Reset environment and get the first new observation
        # episode_time = time.time()
        s = env.reset()  # the observation is the state, an integer in [0, 15]
        rAll = 0
        if render: env.render()
        for j in range(99):  # step index; at most 99 steps per episode
            ## Choose an action greedily (with probability e of a random action) from the Q-network
            allQ = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)).numpy()
            a = np.argmax(allQ, 1)  # greedy action w.r.t. the current Q estimates
            ## epsilon-greedy exploration: with probability e, sample a random action instead
            if np.random.rand(1) < e:
                a[0] = env.action_space.sample()
            ## Get the new state and reward from the environment
            s1, r, d, _ = env.step(a[0])
            if render: env.render()
            ## Obtain the Q' values by feeding the new state through our network
            Q1 = qnetwork(np.asarray([to_one_hot(s1, 16)], dtype=np.float32)).numpy()
            ## Obtain maxQ' and set the target value for the chosen action
            maxQ1 = np.max(Q1)  # in Q-learning the target policy is greedy, so we take the max over next-state actions
            targetQ = allQ
            targetQ[0, a[0]] = r + lambd * maxQ1
            ## Train the network using the target and predicted Q values
            # targetQ is not the true Q value, only a bootstrapped estimate,
            # but compare with the tabular Q-learning update:
            #   Q'(s,a) <- Q(s,a) + alpha * (r + lambd * max_a' Q(s',a') - Q(s,a))
            # minimizing |r + lambd * max_a' Q(s',a') - Q(s,a)|^2 drives Q(s,a) toward
            # the same fixed point, where Q'(s,a) ≈ Q(s,a)
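            # For example (illustrative numbers, not from a real run): with r = 1.0 at
            # the goal, lambd = 0.99 and maxQ1 = 0.5, the target for the chosen action
            # becomes 1.0 + 0.99 * 0.5 = 1.495, while the other three actions keep the
            # network's own outputs as targets, so only the chosen action contributes
            # to the loss below.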
            with tf.GradientTape() as tape:
                _qvalues = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32))
                _loss = tl.cost.mean_squared_error(targetQ, _qvalues, is_mean=False)  # is_mean=False: do not average the squared errors
            grad = tape.gradient(_loss, train_weights)
            optimizer.apply_gradients(zip(grad, train_weights))
            rAll += r
            s = s1
            ## Reduce the chance of a random action once the episode is done
            if d:
                e = 1. / ((i / 50) + 10)  # decay e; GLIE: Greedy in the Limit with Infinite Exploration
                break
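                # e.g. e = 1/10 = 0.100 at i = 0, 1/11 ≈ 0.091 at i = 50,
                # and 1/20 = 0.050 at i = 500, tending to 0 as i grows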
        ## Note that the episode reward still includes steps taken with random (exploratory) actions
        running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01
        # print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs " %
        #       (i, num_episodes, rAll, running_reward, time.time() - episode_time))
        print("Training | Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}"
              .format(i, num_episodes, rAll, time.time() - t0))
        # keep an exponentially smoothed episode-reward curve (decay 0.9) for plotting
        if i == 0:
            all_episode_reward.append(rAll)
        else:
            all_episode_reward.append(all_episode_reward[-1] * 0.9 + rAll * 0.1)
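    # After the loop, the smoothed curve can be inspected with matplotlib;
    # a minimal sketch (the import and the output filename are assumptions,
    # not part of this script):
    #
    #   import matplotlib.pyplot as plt
    #   plt.plot(all_episode_reward)
    #   plt.xlabel("Episode")
    #   plt.ylabel("Smoothed episode reward")
    #   plt.savefig("frozenlake_qnetwork_reward.png")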