ModernBERT vs BERT for text classification

#64
by Joseph2805 - opened

Hello,
I tried comparing the BERT and ModernBERT models on a classification task, and I consistently find ModernBERT less performant than BERT. I'm wondering if it might just be an issue with the model initialization during fine-tuning. If anyone has any thoughts on this, I'd love to hear them!
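
For reference, here is the kind of initialization check I have in mind (a minimal sketch using transformers' set_seed; fine_tune_and_evaluate is a placeholder for the training loop further down, not a real function):

import torch
from transformers import AutoModelForSequenceClassification, set_seed

# Hypothetical sketch: repeat fine-tuning under a few seeds to see whether
# the BERT/ModernBERT gap is just random-initialization noise in the newly
# added classification head.
for seed in (0, 1, 2):
    set_seed(seed)  # seeds the Python, NumPy and torch RNGs
    model = AutoModelForSequenceClassification.from_pretrained(
        "answerdotai/ModernBERT-base",
        problem_type="multi_label_classification",
        num_labels=4,  # placeholder; use len(label_unique)
    )
    # fine_tune_and_evaluate(model)  # placeholder: run the training loop and
    # compare test F1 across seeds; large variance would point to the init.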

Thanks in advance 🙂


🚀 The BERT code:

import torch
from datasets import Dataset
from torch.utils.data import DataLoader, Dataset as TorchDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MODEL = "bert-base-uncased"  # assumed checkpoint; the exact BERT variant isn't named in the post
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def oneHot(labels, label_unique):
    # Binary indicator vector: 1 if the label is present in this example, else 0.
    return [1 if label in labels else 0 for label in label_unique]
    
def preprocess_data(new_df, label_unique):
    new_df["one_hot_labels"] = new_df["labels"].apply(lambda x: oneHot(x, label_unique))

    encoding = tokenizer(new_df["text"].tolist(),
                         truncation=True,
                         max_length=150,
                         padding="longest",
                         return_tensors="pt")

    encoding["labels"] = torch.stack([torch.tensor(o, dtype=torch.float32) for o in new_df["one_hot_labels"]])
    return encoding



encoding = preprocess_data(new_df, label_unique)

dataset = Dataset.from_dict({
    "input_ids": encoding["input_ids"],
    "attention_mask": encoding["attention_mask"],
    "labels": [label.tolist() for label in encoding["labels"]]  
})

dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

class MultiLabelDataset(TorchDataset):  # torch Dataset, aliased to avoid clashing with datasets.Dataset
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = {key: self.dataset[idx][key].clone().detach() for key in ["input_ids", "attention_mask"]}
        item["labels"] = self.dataset[idx]["labels"].clone().detach().to(torch.float32)  # Multi-label, donc float
        return item

train_dataset = MultiLabelDataset(dataset["train"])
test_dataset = MultiLabelDataset(dataset["test"])

BATCH_SIZE = 16 

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    problem_type="multi_label_classification",
    num_labels=len(label_unique),
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3,
).to(device)

🚀 The ModernBERT code:



MODEL = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL)


def oneHot(labels, label_unique):
    return [1 if label in labels else 0 for label in label_unique]

class ModernBERTDataset(TorchDataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx], truncation=True, max_length=150, padding=False, return_tensors="pt")
        return {
            "input_ids": encoding["input_ids"].squeeze(0),  
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float32),
        }

def preprocess_data(new_df, label_unique):
    texts = new_df["text"].tolist()
    labels = [oneHot(label_list, label_unique) for label_list in new_df["labels"]]
    return ModernBERTDataset(texts, labels)

train_df, test_df = train_test_split(new_df, test_size=0.2, random_state=42)
train_dataset = preprocess_data(train_df, label_unique)  
test_dataset = preprocess_data(test_df, label_unique) 


data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

BATCH_SIZE = 16

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=data_collator)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator)


model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    problem_type="multi_label_classification",
    num_labels=len(label_unique),
    # ModernBERT's config uses different dropout parameter names, so BERT's
    # hidden_dropout_prob / attention_probs_dropout_prob kwargs are left out
    # and dropout is raised manually on every Dropout module below.
).to(device)

# Match the 0.3 dropout used in the BERT run.
for module in model.modules():
    if isinstance(module, torch.nn.Dropout):
        module.p = 0.3

🚀 The common code:


import numpy as np
import torch.nn as nn
import torch.optim as optim
import torchmetrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
num_labels = len(label_unique)
accuracy_metric = torchmetrics.Accuracy(task="multilabel", num_labels=num_labels).to(device)
f1_metric = torchmetrics.F1Score(task="multilabel", num_labels=num_labels, average="macro").to(device)
precision_metric = torchmetrics.Precision(task="multilabel", num_labels=num_labels, average="macro").to(device)
recall_metric = torchmetrics.Recall(task="multilabel", num_labels=num_labels, average="macro").to(device)

def test():
    y_true = []
    y_pred = []
    model.eval()
    total_loss = 0
    with torch.no_grad(): 
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = criterion(logits, labels)
            total_loss += loss.item()

            preds = torch.sigmoid(logits) > 0.5 


            y_true.append(labels.cpu().numpy())
            y_pred.append(preds.cpu().numpy())

    y_true = np.concatenate(y_true, axis=0)
    y_pred = np.concatenate(y_pred, axis=0)

    test_acc = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="macro", zero_division=0)
    # Normalize by the number of test batches (the original divided by len(train_loader) by mistake).
    print(f"Loss: {total_loss / len(test_loader):.4f} - Acc: {test_acc:.4f} - F1: {f1:.4f} - Precision: {precision:.4f} - Recall: {recall:.4f}")
    return test_acc, precision, recall, f1

EPOCHS = 100
history = {"loss": [], "accuracy": [], "f1": [], "precision": [], "recall": []}
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    accuracy_metric.reset()
    f1_metric.reset()
    precision_metric.reset()
    recall_metric.reset()

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()


        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits


        loss = criterion(logits, labels)
        loss.backward() 
        optimizer.step()  

        total_loss += loss.item()

        preds = torch.sigmoid(logits) > 0.5  


        accuracy_metric.update(preds, labels.int())
        f1_metric.update(preds, labels.int())
        precision_metric.update(preds, labels.int())
        recall_metric.update(preds, labels.int())

    train_acc = accuracy_metric.compute().item()
    train_f1 = f1_metric.compute().item()
    train_precision = precision_metric.compute().item()
    train_recall = recall_metric.compute().item()
    history["loss"].append(total_loss / len(train_loader))
    history["accuracy"].append(train_acc)
    history["f1"].append(train_f1)
    history["precision"].append(train_precision)
    history["recall"].append(train_recall)
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {total_loss / len(train_loader):.4f} - Acc: {train_acc:.4f} - F1: {train_f1:.4f} - Precision: {train_precision:.4f} - Recall: {train_recall:.4f}")
    print("_"*20,"> Test sur les donnΓ©es :")
    acc, prec, rec, f1 =test()
    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

🚀 BERT's results on train:

[image attachment]

🚀 BERT's results on test:

[image attachment]

🚀 ModernBERT's results on train:

[image attachment]

🚀 ModernBERT's results on test:

[image attachment]

Thank you for your attention and your help!

I've been playing with the original BERT rather than ModernBERT, so I might not know what I'm talking about, but I think you might have more luck if you froze your model and focused on an output head and the last few layers. You can lose a lot of information if you overfit the model. Let the transfer learning in the pretrained model do its work, and keep the number of epochs low, since you only need to tune the last few layers.
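
Something like this rough sketch is what I mean (base_model is the generic transformers accessor for the pretrained encoder; which extra layers to unfreeze, if any, would differ between BERT and ModernBERT):

import torch

# model is the AutoModelForSequenceClassification from your post.
# Freeze the pretrained encoder and train only the new classification head.
for param in model.base_model.parameters():
    param.requires_grad = False

# Give the optimizer only the parameters that remain trainable.
optimizer = torch.optim.AdamW(
    (p for p in model.parameters() if p.requires_grad), lr=2e-5
)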

What's interesting is how slowly your model trains under ModernBERT. How big is your training set?
