# One multi-head attention block per layer (8 heads over hidden_size).
self.multiattn = nn.ModuleList([onmt.modules.MultiHeadedAttention(8, self.hidden_size) for _ in range(self.layers)])
# Per-layer feed-forward expansion: hidden_size -> 2*hidden_size.
self.linear_out = nn.ModuleList([BLinear(self.hidden_size, 2*self.hidden_size) for _ in range(self.layers)])
# Per-layer feed-forward projection back: 2*hidden_size -> hidden_size.
self.linear_final = nn.ModuleList([BLinear(2*self.hidden_size, self.hidden_size) for _ in range(self.layers)])
# Per-layer layer normalization over hidden_size features.
self.layer_norm = nn.ModuleList([BLayerNorm(self.hidden_size) for _ in range(self.layers)])
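# Sketch (not from the original source): one plausible way these per-layer
# modules could be composed in a forward pass, assuming `multiattn[i]` follows
# the OpenNMT-py MultiHeadedAttention interface (key, value, query, mask) and
# returns (output, attention), and that BLinear/BLayerNorm behave like
# nn.Linear / LayerNorm on the last dimension. The names `x` and `mask` and
# the ReLU/residual ordering are illustrative assumptions, not the class's
# actual forward().
#
#   def forward(self, x, mask=None):
#       for i in range(self.layers):
#           attn_out, _ = self.multiattn[i](x, x, x, mask=mask)  # multi-head attention
#           x = self.layer_norm[i](x + attn_out)                 # residual + layer norm
#           mid = torch.relu(self.linear_out[i](x))              # expand: hidden -> 2*hidden
#           x = x + self.linear_final[i](mid)                    # project back + residual
#       return x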