for i in range(self._hparams.num_blocks):
with tf.variable_scope("num_blocks_{}".format(i)):
with tf.variable_scope("self_attention"):
dec = layers.layer_normalize(self.dec)
dec = layers.multihead_attention(
queries=dec,
keys=dec,
queries_valid_length=tgt_length,
keys_valid_length=tgt_length,
num_units=self._hparams.embedding.dim,
num_heads=self._hparams.num_heads,
dropout_rate=self._hparams.dropout,
causality=True,
scope="self_attention")
dec = tf.layers.dropout(
dec,
rate=self._hparams.dropout,
training=context.is_train())
self.dec += dec
with tf.variable_scope("encdec_attention"):
dec = layers.layer_normalize(self.dec)
dec = layers.multihead_attention(
queries=dec,
keys=encoder_output,
queries_valid_length=tgt_length,
keys_valid_length=src_length,
num_units=self._hparams.embedding.dim,
num_heads=self._hparams.num_heads,
dropout_rate=self._hparams.dropout,
causality=False,
scope="multihead_attention")dec = tf.layers.dropout(
dec,
rate=self._hparams.dropout,
training=context.is_train())
self.dec += dec
poswise_network = FeedForwardNetwork(hparams=self._hparams["poswise_feedforward"])
with tf.variable_scope(poswise_network.variable_scope):
dec = layers.layer_normalize(self.dec)
dec = poswise_network(dec)
dec = tf.layers.dropout(
dec,
rate=self._hparams.dropout,
training=context.is_train())
self.dec += dec

# Share the output projection weight with the word embedding: project the
# decoder states to vocabulary-sized logits.
if self._hparams.share_embed_and_transform:
    batch_size, length = tf.shape(self.dec)[0], tf.shape(self.dec)[1]
    depth = self.dec.get_shape()[2]
    self.dec = tf.reshape(self.dec, [-1, depth])
    self.logits = tf.matmul(self.dec, self._output_transform)
    self.logits = tf.reshape(
        self.logits, [batch_size, length, self._vocab_size])
else: