Commit: 35b2c4917344f338eda67c78673cf4064b3b4265 | File: examples/reinforcement_learning/tutorial_DQN.py

Before Change


                rAll += r
                s = s1
                # Reduce the chance of a random action once an episode is done.
                if d == True:
                    e = 1. / ((i / 50) + 10)  # reduce e, GLIE: Greedy in the limit with infinite exploration
                    break

            # Note that the rewards here include those obtained through random actions.
            running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01
            # print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs " % \
            #     (i, num_episodes, rAll, running_reward, time.time() - episode_time))
            print("Episode: {}/{}  | Episode Reward: {:.4f} | Running Average Reward: {:.4f}  | Running Time: {:.4f}"\

After Change



    t0 = time.time()
    if args.train:
        all_episode_reward = []
        for i in range(num_episodes):
            # Reset the environment and get the first new observation
            # episode_time = time.time()
            s = env.reset()  # observation is state, an integer 0 ~ 15
            rAll = 0
            if render: env.render()
            for j in range(99):  # step index, maximum step is 99
                # Choose an action greedily (with probability e of a random action) from the Q-network
                allQ = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)).numpy()
                a = np.argmax(allQ, 1)

                # e-greedy exploration: sample a random action with probability e
                if np.random.rand(1) < e:
                    a[0] = env.action_space.sample()
                # Get the new state and reward from the environment
                s1, r, d, _ = env.step(a[0])
                if render: env.render()
                # Obtain the Q' values by feeding the new state through our network
                Q1 = qnetwork(np.asarray([to_one_hot(s1, 16)], dtype=np.float32)).numpy()

                # Obtain maxQ' and set our target value for the chosen action.
                maxQ1 = np.max(Q1)  # in Q-Learning, the policy is greedy, so we use "max" to select the next action.
                targetQ = allQ
                targetQ[0, a[0]] = r + lambd * maxQ1
                # Train the network using the target and predicted Q values
                # this is not the real target Q value, it is just an estimate,
                # but check the Q-Learning update formula:
                #    Q'(s,a) <- Q(s,a) + alpha * (r + lambd * maxQ(s',a') - Q(s,a))
                # minimizing |r + lambd * maxQ(s',a') - Q(s,a)|^2 drives Q(s,a) towards the updated value Q'(s,a)
                with tf.GradientTape() as tape:
                    _qvalues = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32))
                    _loss = tl.cost.mean_squared_error(targetQ, _qvalues, is_mean=False)
                grad = tape.gradient(_loss, train_weights)
                optimizer.apply_gradients(zip(grad, train_weights))

                rAll += r
                s = s1
                # Reduce the chance of a random action once an episode is done.
                if d == True:
                    e = 1. / ((i / 50) + 10)  # reduce e, GLIE: Greedy in the limit with infinite exploration
                    break

            # Note that the rewards here include those obtained through random actions.
            running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01
            # print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs " % \
            #     (i, num_episodes, rAll, running_reward, time.time() - episode_time))
            print("Training  | Episode: {}/{}  | Episode Reward: {:.4f} | Running Time: {:.4f}" \
                  .format(i, num_episodes, rAll, time.time() - t0))

            if i == 0:
                all_episode_reward.append(rAll)
            else:
                all_episode_reward.append(all_episode_reward[-1] * 0.9 + rAll * 0.1)
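
For context, the snippet above relies on several names defined earlier in the tutorial: qnetwork, to_one_hot, train_weights, optimizer, lambd, e, num_episodes, render, running_reward and args. The following is a minimal sketch of that setup, assuming TensorLayer 2.x on TensorFlow 2 and an environment with 16 discrete states and 4 actions (such as FrozenLake-v0); the hyperparameter values and the argparse flag are illustrative, not taken from the commit.

import argparse
import time

import gym
import numpy as np
import tensorflow as tf
import tensorlayer as tl

# Illustrative command-line flag; the diff only shows that args.train is checked.
parser = argparse.ArgumentParser()
parser.add_argument('--train', dest='train', action='store_true', default=True)
args = parser.parse_args()

def to_one_hot(i, n_classes=None):
    # Encode a discrete state index as a one-hot vector for the network input.
    a = np.zeros(n_classes, 'uint8')
    a[i] = 1
    return a

def get_model(inputs_shape):
    # Assumed architecture: a single dense layer mapping the 16 one-hot states to 4 action values.
    ni = tl.layers.Input(inputs_shape, name='observation')
    nn = tl.layers.Dense(4, act=None, W_init=tf.random_uniform_initializer(0, 0.01), b_init=None)(ni)
    return tl.models.Model(inputs=ni, outputs=nn, name='Q-Network')

env = gym.make('FrozenLake-v0')  # assumed environment: 16 states, 4 actions
render = False
running_reward = None
num_episodes = 10000  # illustrative value
lambd = 0.99          # discount factor, illustrative value
e = 0.1               # initial exploration rate, illustrative value

qnetwork = get_model([None, 16])
qnetwork.train()  # put the TensorLayer model into training mode
train_weights = qnetwork.trainable_weights
optimizer = tf.optimizers.SGD(learning_rate=0.1)  # illustrative optimizer and learning rate

With a setup along these lines, the training loop in the "After Change" block runs as shown; the decay of e on episode termination (the GLIE schedule) gradually shifts the policy from exploration towards purely greedy action selection.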

In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 5

Instances


Project Name: tensorlayer/tensorlayer
Commit Name: 35b2c4917344f338eda67c78673cf4064b3b4265
Time: 2020-02-07
Author: 34995488+Tokarev-TT-33@users.noreply.github.com
File Name: examples/reinforcement_learning/tutorial_DQN.py
Class Name:
Method Name:


Project Name: NifTK/NiftyNet
Commit Name: bd333dd43d69b26015eb3f201afe1772ba701a41
Time: 2018-05-07
Author: wenqi.li@ucl.ac.uk
File Name: niftynet/contrib/dataset_sampler/sampler_uniform_v2.py
Class Name: UniformSampler
Method Name: layer_op


Project Name: ray-project/ray
Commit Name: ac24d1db30976e3fc038051bcf2b46d32be416cb
Time: 2020-12-12
Author: maxfitton@anyscale.com
File Name: dashboard/datacenter.py
Class Name: DataOrganizer
Method Name: _get_actor