# Move remaining batch tensors to the GPU (hard-coded CUDA; see the
# device-agnostic version below).
S2 = S2.cuda()
labels = labels.cuda()
# Wrap tensors in autograd.Variable so gradients can flow through them.
# NOTE(review): torch.autograd.Variable is deprecated since PyTorch 0.4 —
# plain tensors track gradients directly; these wraps are no-ops on >=0.4.
X, S1 = Variable(X), Variable(S1)
S2, labels = Variable(S2), Variable(labels)
# Zero the parameter gradients before the backward pass of this batch
optimizer.zero_grad()
# --- After change: device-agnostic version of the snippet above ---
if X.size()[0] != config.batch_size:
continue // Drop those data, if not enough for a batch
// Automaticlly select device to make the code device agnostic
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
X = X.to(device)
S1 = S1.to(device)
S2 = S2.to(device)
labels = labels.to(device)
net = net.to(device)
// Zero the parameter gradients
optimizer.zero_grad()
// Forward pass
outputs, predictions = net(X, S1, S2, config)