Training loss did not increase

I am training a PyTorch model for binary classification. My input vector has length 561 (341 values are a one-hot encoding, the rest are features between 0 and 1) and my output is [0,1] or [1,0]. My issue is that the training loss always decreases; I tried more epochs, up to 200, but nothing changed. I am wondering if I am calculating the loss in the wrong way: sometimes the training loss is decreasing while the test loss decreases and then increases.
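For context on the loss calculation: as far as I understand, nn.CrossEntropyLoss with its default reduction='mean' already returns the average loss over the batch, and on PyTorch 1.10+ it accepts one-hot float targets like [0,1] / [1,0] directly. A minimal sketch of what the criterion computes for a single batch (the batch size of 64 here is just an illustrative assumption):

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()                    # default reduction='mean'
logits = torch.randn(64, 2)                          # raw model outputs for a batch of 64
targets = torch.eye(2)[torch.randint(0, 2, (64,))]   # one-hot float targets, e.g. [0., 1.]
loss = criterion(logits, targets)                    # already the mean loss over the 64 samples
print(loss.item())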

Here is my model. I also tried different models with an LSTM and a CNN, and the loss was always decreasing.

import torch.nn as nn

class MyRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MyRegression, self).__init__()
        # one hidden layer
        self.linear1 = nn.Linear(input_dim, 128)
        self.linear2 = nn.Linear(128, output_dim)
    def forward(self, x):
        return self.linear2(self.linear1(x))
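Without an activation between the two Linear layers, the model above is effectively a single linear map from the input to the two logits. For comparison, a sketch of the same model with a ReLU in between (only an illustration, not the model whose results are shown below):

import torch.nn as nn

class MyRegressionReLU(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear1 = nn.Linear(input_dim, 128)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(128, output_dim)

    def forward(self, x):
        # hidden layer -> ReLU -> output logits
        return self.linear2(self.relu(self.linear1(x)))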

and the training function


import os
import numpy
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm

def run_gradient_descent(model, data_train, data_val, batch_size, learning_rate, weight_decay=0, num_epochs=20):
    
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    #criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    iters, losses, train_losses, test_losses = [], [], [], []
    iters_sub, train_acc, val_acc = [], [], []
    print(batch_size)
    
    # weight sampler
    class0, class1 = labels_count(data_train)
    dataset_counts = [class0, class1]
    print(dataset_counts)
    num_samples = sum(dataset_counts)
    labels = [tag for _, tag in data_train]
    #max_value = max(input_list)
    #index = input_list.index(max_value)
    class_weights = [1./dataset_counts[i] for i in range(len(dataset_counts))]
    labels_indics = [i.index(max(i)) for i in labels ]
    weights = [class_weights[i] for i in labels_indics] # labels.max(1, keepdim=True)[1]
    weights = numpy.array(weights)
    samples_weight = torch.from_numpy(weights)
    samples_weight = samples_weight.double()
    sampler = torch.utils.data.sampler.WeightedRandomSampler(samples_weight, int(num_samples), replacement=True)
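    # (for clarity: each sample's weight is 1 / count of its class, so the
    #  WeightedRandomSampler draws the minority class more often and the
    #  num_samples draws per epoch are roughly class-balanced)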
    
    
    train_loader = torch.utils.data.DataLoader(
        data_train,
        batch_size=batch_size,
        shuffle=False,
        sampler = sampler,
        collate_fn=lambda d: ([x[0] for x in d], [x[1] for x in d]),
        num_workers=os.cpu_count()//2
    )

    # training
    n = 0 # the number of iterations
    for epoch in tqdm(range(num_epochs), desc="epoch"):
        correct = 0
        total = 0
        for xs, ts in tqdm(train_loader, desc="train"):
            xs = torch.FloatTensor(xs).to(device)
            ts = torch.FloatTensor(ts).to(device)
            # print("batch index {}, 0/1: {}/{}".format(n,ts.tolist().count([1,0]),ts.tolist().count([0,1])))

            # if len(ts) != batch_size:
            #     print("ops")
            #     continue
            model.train()
            
            zs = model(xs)
            zs = zs.to(device)
            loss = criterion(zs, ts)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            iters.append(n)
            loss.detach().cpu()
            
            losses.append(float(loss)/len(ts)) # compute *average* loss
            
            pred = zs.max(1, keepdim=True)[1] # get the index of the max logit
            target = ts.max(1, keepdim=True)[1]
            correct += pred.eq(target).sum().item()
            total += int(ts.shape[0])
            acc = correct / total

            if (n % len(train_loader) == 0) and n>0 and epoch%2==0:
                
                test_acc, test_loss = get_accuracy(model, data_val)
            
                iters_sub.append(n)
                train_acc.append(acc)
                val_acc.append(test_acc)
        
                train_losses.append(sum(losses)/len(losses))
                test_losses.append(test_loss)
                
                print("Epoch", epoch, "train_acc", acc)
                print("Epoch", epoch, "test_acc", test_acc)
                print("Epoch", epoch, "train_loss", sum(losses)/len(losses))
                print("Epoch", epoch, "test_loss", test_loss)

             # increment the iteration number
            n += 1
        torch.save(model.state_dict(), f"{MODEL_NAME}/checkpoint_epoch{epoch}.pt")


    # plotting
    plt.title("Training Curve (batch_size={}, lr={})".format(batch_size, learning_rate))
    plt.plot(iters_sub, train_losses, label="Train")
    plt.plot(iters_sub, test_losses, label="Test")
    plt.legend(loc="best")
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    
    plt.savefig(f"{MODEL_NAME}/training_test_loss.png")
    
    # plt.show()
    plt.clf()
    plt.title("Training Curve (batch_size={}, lr={})".format(batch_size, learning_rate))
    plt.plot(iters_sub, train_acc, label="Train")
    plt.plot(iters_sub, val_acc, label="Test")
    plt.xlabel("Iterations")
    plt.ylabel("Accuracy")
    plt.legend(loc="best")
    plt.savefig(f"{MODEL_NAME}/training_acc.png")
    #plt.show()
    return model

and the main function

model = MyRegression(374, 2)
run_gradient_descent(
    model,
    training_set,
    test_set,
    batch_size=64,
    learning_rate=1e-2,
    num_epochs=200
)

Here is part of the training results so you can see that it is decreasing

Epoch 2 train_acc 0.578125
Epoch 2 test_acc 0.7346171218510883
Epoch 2 train_loss 0.003494985813946325
Epoch 2 test_loss 0.00318981208993754
Epoch 4 train_acc 0.671875
Epoch 4 test_acc 0.7021743310868525
Epoch 4 train_loss 0.0034714722261212196
Epoch 4 test_loss 0.0033061892530283398
Epoch 6 train_acc 0.75
Epoch 6 test_acc 0.7614966302787455
Epoch 6 train_loss 0.003462064279302097
Epoch 6 test_loss 0.003087314312623757
Epoch 8 train_acc 0.625
Epoch 8 test_acc 0.7343577405202831
Epoch 8 train_loss 0.0034565126970269753
Epoch 8 test_loss 0.0032059013449951632
Epoch 10 train_acc 0.578125
Epoch 10 test_acc 0.7587194612023667
Epoch 10 train_loss 0.0034528369772701857
Epoch 10 test_loss 0.003112017690331294
Epoch 12 train_acc 0.65625
Epoch 12 test_acc 0.7097187501397528
Epoch 12 train_loss 0.003450584381555143
Epoch 12 test_loss 0.003285413007535127
Epoch 14 train_acc 0.578125
Epoch 14 test_acc 0.7509648538296759
Epoch 14 train_loss 0.0034486886994226553
Epoch 14 test_loss 0.003145160475069196
Epoch 16 train_acc 0.625
Epoch 16 test_acc 0.7629612403794123
Epoch 16 train_loss 0.0034474354597715125
Epoch 16 test_loss 0.003106232365138448
Epoch 18 train_acc 0.703125
Epoch 18 test_acc 0.7527134417666552
Epoch 18 train_loss 0.0034464063646294537
Epoch 18 test_loss 0.0031368749897371824
Epoch 20 train_acc 0.734375
Epoch 20 test_acc 0.6917431767057677
Epoch 20 train_loss 0.0034454527557537763
Epoch 20 test_loss 0.003363367490148118
Epoch 22 train_acc 0.671875
Epoch 22 test_acc 0.7229382538269926
Epoch 22 train_loss 0.003444858143091548
Epoch 22 test_loss 0.003254974437443727
Epoch 24 train_acc 0.703125
Epoch 24 test_acc 0.7514299513883609
Epoch 24 train_loss 0.003444201508544531
Epoch 24 test_loss 0.0031422660971916283
