# Add the preceding sub-layer's output (`dec`) onto the running decoder
# state kept in `self.dec`.
self.dec += dec
# Encoder-decoder attention sub-layer: layer-normalize the running state,
# attend from it (queries) over `encoder_output` (keys), apply dropout, and
# add the result back onto `self.dec`.
with tf.variable_scope("encdec_attention"):
dec = layers.layer_normalize(self.dec)
dec = layers.multihead_attention(
queries=dec,
keys=encoder_output,
# NOTE(review): the valid lengths are presumably used to mask padded
# positions on the target / source side -- confirm against
# layers.multihead_attention.
queries_valid_length=tgt_length,
keys_valid_length=src_length,
num_units=self._hparams.embedding.dim,
num_heads=self._hparams.num_heads,
dropout_rate=self._hparams.dropout,
# NOTE(review): causality is disabled here, presumably because the keys
# come from the encoder rather than from earlier decoder positions --
# confirm the flag's semantics in layers.multihead_attention.
causality=False,
scope="multihead_attention")
# Dropout on the attention output; `training` follows context.is_train().
dec = tf.layers.dropout(
dec,
rate=self._hparams.dropout,
training=context.is_train())
# Accumulate the attention output onto the running decoder state.
self.dec += dec
# Feed-forward sub-layer, configured by the "poswise_feedforward" hparams.
poswise_network = FeedForwardNetwork(hparams=self._hparams["poswise_feedforward"])
# Reuse the network's own variable scope for the ops below.
with tf.variable_scope(poswise_network.variable_scope):
dec = layers.layer_normalize(self.dec)
dec = poswise_network(dec)
# Same dropout rate and training switch as the attention sub-layer.
dec = tf.layers.dropout(
dec,
rate=self._hparams.dropout,
training=context.is_train())
# Accumulate the feed-forward output onto the running decoder state.
self.dec += dec
# Share the projection weight with the word embedding.