Hello everyone. I’m trying to fine-tune a pre-trained ResNet-18 using the ArcFace loss. My dataset contains about 1,300,000 photos of 10,171 people, and one epoch takes about an hour to train. The loss starts at about 40 and decreases to about 20 by the end of the first epoch. Over the next two epochs it decreases to 16–17, and at that point the decrease stops or slows down dramatically. I tried using the StepLR scheduler, multiplying the learning rate by 0.9 every 1,000 batches and no longer stepping it once it falls to 3e-4. This is my first time training a model with the ArcFace loss, and I don’t know all the nuances yet. Could you please tell me what might be wrong and how I can fix the problem? My code follows:
This is my model:
class ArcFaceHead(nn.Module):
    """Additive angular margin (ArcFace-style) classification head.

    Holds one weight vector per class. ``forward`` L2-normalizes both the
    incoming embeddings and the class weights, so their product is the cosine
    of the angle ``theta`` between each embedding and each class centre. For
    the ground-truth class the cosine is replaced by ``cos(theta + margin)``;
    every entry is then multiplied by ``scale``. The result is meant to be
    fed to a softmax cross-entropy loss.

    Args:
        num_classes: Number of classes (rows of the weight matrix).
        device: Ignored. Accepted only so existing call sites that pass it
            keep working; move the module with ``.to(...)`` instead.
        embedding_size: Dimensionality of the input embeddings.
        margin: Angular margin in radians added to the target-class angle.
        scale: Factor applied to all logits before they are returned.
    """

    def __init__(self, num_classes, device=None, embedding_size=16, margin=0.5, scale=30):
        super().__init__()
        self.num_classes = num_classes
        self.embedding_size = embedding_size
        self.margin = margin
        self.scale = scale
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        # cos(theta) <= threshold  <=>  theta + margin >= pi, where
        # cos(theta + margin) stops decreasing; forward() then switches to
        # the linear fallback ``cos(theta) - mm``.
        self.threshold = math.cos(math.pi - margin)
        self.mm = self.sin_m * margin
        self.weight = nn.Parameter(torch.empty(num_classes, embedding_size))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, x, labels=None):
        """Return scaled logits of shape ``(batch, num_classes)``.

        Args:
            x: Embeddings; the last dimension must equal ``embedding_size``.
            labels: Integer class index per sample. When ``None`` (e.g. at
                inference time) no margin is applied and the plain scaled
                cosine similarities are returned.
        """
        cosine = F.linear(F.normalize(x), F.normalize(self.weight))
        if labels is None:
            return cosine * self.scale

        # The margin only affects the ground-truth column, so compute it on
        # that column alone instead of on the full (batch, num_classes) grid.
        target = labels.view(-1, 1).long()
        cos_t = cosine.gather(1, target)
        # Rounding can push |cos| marginally above 1, making 1 - cos^2
        # negative (sqrt -> NaN), and sqrt has an infinite derivative at 0.
        # Clamping to a small positive floor keeps values and grads finite.
        sin_t = torch.sqrt(torch.clamp(1.0 - cos_t.pow(2), min=1e-7))
        phi = cos_t * self.cos_m - sin_t * self.sin_m  # cos(theta + margin)
        phi = torch.where(cos_t > self.threshold, phi, cos_t - self.mm)

        # Out-of-place scatter: write the margin-adjusted value into the
        # target column and leave every other cosine untouched.
        logits = cosine.scatter(1, target, phi)
        return logits * self.scale
class ResNet18ArcFace(nn.Module):
    """ResNet-18 backbone whose final layer feeds an ``ArcFaceHead``.

    The backbone comes from ``models.resnet18(pretrained=True)``. Its ``fc``
    layer is swapped for ``Linear(512, embedding_size)`` followed by
    ``BatchNorm1d(embedding_size)``, and the resulting embedding is passed,
    together with the labels, to the ArcFace head.

    Args:
        num_classes: Number of classes handed to ``ArcFaceHead``.
        embedding_size: Width of the embedding produced by the backbone.
    """

    def __init__(self, num_classes, embedding_size=16):
        super().__init__()
        self.embedding_size = embedding_size

        # Build the backbone and swap its final layer before registering it.
        backbone = models.resnet18(pretrained=True)
        projection = nn.Linear(512, embedding_size)
        embedding_norm = nn.BatchNorm1d(embedding_size)
        backbone.fc = nn.Sequential(projection, embedding_norm)
        self.backbone = backbone

        self.head = ArcFaceHead(num_classes, embedding_size=embedding_size)

    def forward(self, x, labels):
        """Embed ``x`` with the backbone and return the head's output."""
        return self.head(self.backbone(x), labels)
My train function:
def train(model, criterion, device, train_loader, optimizer, epoch, best_loss, scheduler, saved=0):
    """Run one training epoch, checkpointing every ``LOG_INTERVAL`` batches.

    For each batch: zero the gradients, move the batch to ``device``, run
    ``model(data, labels)``, compute ``criterion(output, labels)``,
    back-propagate and step the optimizer. Every ``LOG_INTERVAL`` batches the
    scheduler is stepped (only while the current learning rate is above 3e-4)
    and a checkpoint with the model/optimizer state and the latest batch loss
    is written, overwriting the previous one.

    Args:
        model: Module called as ``model(data, labels)``.
        criterion: Loss callable taking ``(output, labels)``.
        device: Device the batches are moved to.
        train_loader: Iterable of ``(data, labels)`` batches; its ``dataset``
            length sizes the progress bar.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used for display and stored in the checkpoint.
        best_loss: Not read by this function; kept so existing callers that
            pass it continue to work.
        scheduler: LR scheduler stepped at most once per ``LOG_INTERVAL``.
        saved: Starting value of the checkpoint counter shown in the
            progress bar.

    Returns:
        None.
    """
    checkpoint_path = "/content/drive/My Drive/model_checkpoint/note_arc_checkpoint_loss.pth"
    model.train()
    # Context manager guarantees the bar is closed even if a batch raises;
    # with leave=False an unclosed bar would otherwise linger between epochs.
    with tqdm(total=len(train_loader.dataset), dynamic_ncols=True, leave=False) as progress_bar:
        for batch_idx, (data, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            data, labels = data.to(device), labels.to(device)
            output = model(data, labels)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            l_rate = get_lr(optimizer)
            progress_bar.set_description(
                f'Epoch: {epoch} Loss: {loss.item():.6f} LR: {l_rate:.7f} Saved: {saved}'
            )

            if (batch_idx + 1) % LOG_INTERVAL == 0:
                saved += 1
                # Stop decaying once the learning rate has reached the floor.
                if l_rate > 3e-4:
                    scheduler.step()
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    # Detached so the checkpoint holds only the value, not a
                    # tensor still attached to the autograd graph.
                    'loss': loss.detach(),
                }, checkpoint_path)

            progress_bar.update(len(data))
And my “settings”:
EPOCHS = 50
# NOTE(review): 16 dimensions is very small for separating thousands of
# identities; face-recognition setups commonly use a few hundred. Confirm
# this is intentional.
FEATURE_SIZE = 16
NUM_OF_CLASSES = len(train_dataset.classes)
# NOTE(review): 1e-2 is high for Adam when fine-tuning a pre-trained
# backbone; consider something in the 1e-4 to 1e-3 range.
LR = 0.01  # initial learning rate
LR_STEP = 1
# Adam's ``weight_decay`` is an L2 penalty coefficient (wd * param is added to
# every gradient), not a multiplicative decay ratio. The previous value of
# 0.95 made the penalty dominate the loss gradient and pulled all weights
# toward zero, which stalls training; use a small coefficient instead.
W_DECAY = 5e-4
# Pass FEATURE_SIZE explicitly so the constant actually controls the model.
model = ResNet18ArcFace(NUM_OF_CLASSES, embedding_size=FEATURE_SIZE)
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=W_DECAY)
# Multiplies the learning rate by 0.9 on every ``scheduler.step()`` call.
scheduler = StepLR(optimizer, step_size=LR_STEP, gamma=0.9)
Read more here: Source link