def learn(self, s, a, td):
    # _, exp_v = self.sess.run([self.train_op, self.exp_v], {self.s: [s], self.a: [a], self.td_error: td[0]})
    with tf.GradientTape() as tape:
        _logits = self.model([s]).outputs
        # _probs = tf.nn.softmax(_logits)
        _exp_v = tl.rein.cross_entropy_reward_loss(logits=_logits, actions=[a], rewards=td[0])
    grad = tape.gradient(_exp_v, self.model.trainable_weights)
    self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights))
After Change
def learn(self, s, a, td):
    with tf.GradientTape() as tape:
        _logits = self.model(np.array([s]))
        # Cross-entropy loss weighted by the td-error (advantage):
        # the cross-entropy measures the difference between two probability distributions,
        # here the predicted logits and the sampled action distribution;
        # it is then weighted by the td-error, so a large td-error (advantage) pushes the
        # predicted distribution more strongly towards the sampled action, and vice versa.
        _exp_v = tl.rein.cross_entropy_reward_loss(logits=_logits, actions=[a], rewards=td[0])
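Conceptually, a reward-weighted cross-entropy loss of this kind combines a sparse softmax cross entropy between the policy logits and the sampled action indices with a per-step weight (here the td-error). The sketch below illustrates that idea with a hypothetical helper, weighted_cross_entropy_sketch; it is an assumption-based illustration, not the actual implementation of tl.rein.cross_entropy_reward_loss.

import numpy as np
import tensorflow as tf

def weighted_cross_entropy_sketch(logits, actions, rewards):
    # Per-step cross entropy between the policy logits and the sampled (integer) actions.
    ce = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=actions, logits=logits)
    # Weight each step by its reward / td-error (advantage) and sum over the batch.
    return tf.reduce_sum(ce * rewards)

# Toy check: one state, two actions, a positive advantage for the sampled action.
logits = tf.constant([[0.2, 1.5]])   # shape (batch, n_actions)
actions = tf.constant([1])           # sampled action index
rewards = tf.constant([0.7])         # td-error used as the weight
print(weighted_cross_entropy_sketch(logits, actions, rewards).numpy())

Minimizing this quantity increases the log-probability of actions that were followed by a positive td-error, which is the policy-gradient update the learn method performs.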