# Compute feature combinations: build one row [h_i || h_j] for every ordered
# pair of rows (i, j) of linear_transf_X.
# NOTE(review): the shape comments below assume linear_transf_X is (N x F_)
# and attention_kernel is (2F_ x 1) -- confirm against the enclosing layer.
repeated = K.reshape(K.tile(linear_transf_X, [1, N]), (N * N, self.F_))  # after tile: (N x N*F_), after reshape: (N^2 x F_); row i*N + j holds h_i
tiled = K.tile(linear_transf_X, [N, 1])  # (N^2 x F_); row i*N + j holds h_j
combinations = K.concatenate([repeated, tiled])  # (N^2 x 2F_)
combination_slices = K.reshape(combinations, (N, -1, 2 * self.F_))  # (N x N x 2F_)
# Attention head
# NOTE(review): this materialises an (N x N x 2F_) tensor; if memory becomes a
# problem, the same scores can be obtained by splitting attention_kernel into
# its first and last F_ rows and broadcasting the two (N x 1) products.
dense = K.squeeze(K.dot(combination_slices, attention_kernel), -1)  # a(Wh_i, Wh_j) in the paper; dot yields (N x N x 1), squeeze drops the last axis -> (N x N)