# Transpose the decoder output back to the expected layout; if decoding
# resumes from a previous partial input, drop the already-decoded prefix
# from both the outputs and the attention before returning them.
outputs = output.transpose(0, 1).contiguous()
if state.previous_input is not None:
    outputs = outputs[state.previous_input.size(0):]
    attn = attn[:, state.previous_input.size(0):].squeeze()
    attn = torch.stack([attn])
attns["std"] = attn
if self._copy:
    attns["copy"] = attn
After Change
# Build the target-side padding mask used by decoder self-attention.
tgt_pad_mask = tgt_words.data.eq(padding_idx).unsqueeze(1) \
    .expand(tgt_batch, tgt_len, tgt_len)

saved_inputs = []
for i in range(self.num_layers):
    # When resuming an unfinished translation, hand each layer the input
    # it saw on the previous call so attention covers the full prefix.
    prev_layer_input = None
    if state.previous_input is not None:
        prev_layer_input = state.previous_layer_inputs[i]
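The snippet above only sets up the per-layer cache; a minimal, self-contained sketch of how such a cache could be consumed is shown below. The tensor shapes, the concatenation layout, and the layer loop body are illustrative assumptions, not the project's actual decoder code.

# Hypothetical sketch: prepend each layer's cached input from the previous
# decoding call, record the combined input, and stack the records so the
# next call can resume where this one stopped.
import torch

num_layers, batch, tgt_len, dim = 2, 3, 4, 8
output = torch.randn(tgt_len, batch, dim)                  # current step's layer input
previous_layer_inputs = [torch.randn(tgt_len, batch, dim)  # cache from the prior call
                         for _ in range(num_layers)]

saved_inputs = []
for i in range(num_layers):
    prev_layer_input = previous_layer_inputs[i]            # would be None on the first call
    # Concatenate along the time dimension so self-attention sees the full history.
    all_input = torch.cat([prev_layer_input, output], dim=0)
    saved_inputs.append(all_input)
    # ... run decoder layer i on (output, all_input) here ...

# (num_layers, total_len, batch, dim): stored back on the decoder state for the next call.
saved_inputs = torch.stack(saved_inputs)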