ModernBERT vs BERT for text classification

#64
by Joseph2805 - opened

Hello,
I tried comparing the BERT and ModernBERT models on a classification task, and I consistently find ModernBERT less performant than BERT. I'm wondering if it might just be an issue with the model initialization during fine-tuning. If anyone has any thoughts on this, I'd love to hear them!
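
For reference, here is the kind of initialization check I have in mind (a minimal sketch using transformers' set_seed; fine_tune_and_evaluate is a placeholder for the training loop further down, not a real function):

import torch
from transformers import AutoModelForSequenceClassification, set_seed

# Hypothetical sketch: repeat fine-tuning under a few seeds to see whether
# the BERT/ModernBERT gap is just random-initialization noise in the newly
# added classification head.
for seed in (0, 1, 2):
    set_seed(seed)  # seeds the Python, NumPy and torch RNGs
    model = AutoModelForSequenceClassification.from_pretrained(
        "answerdotai/ModernBERT-base",
        problem_type="multi_label_classification",
        num_labels=4,  # placeholder; use len(label_unique)
    )
    # fine_tune_and_evaluate(model)  # placeholder: run the training loop and
    # compare test F1 across seeds; large variance would point to the init.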

Thanks in advance 🙂


🚀 The BERT code:

import torch
from datasets import Dataset
from torch.utils.data import DataLoader, Dataset as TorchDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MODEL = "bert-base-uncased"  # assumed checkpoint; the exact BERT variant isn't named in the post
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def oneHot(labels, label_unique):
    # Binary indicator vector: 1 if the label is present in this example, else 0.
    return [1 if label in labels else 0 for label in label_unique]
    
def preprocess_data(new_df, label_unique):
    new_df["one_hot_labels"] = new_df["labels"].apply(lambda x: oneHot(x, label_unique))

    encoding = tokenizer(new_df["text"].tolist(),
                         truncation=True,
                         max_length=150,
                         padding="longest",
                         return_tensors="pt")

    encoding["labels"] = torch.stack([torch.tensor(o, dtype=torch.float32) for o in new_df["one_hot_labels"]])
    return encoding



encoding = preprocess_data(new_df, label_unique)

dataset = Dataset.from_dict({
    "input_ids": encoding["input_ids"],
    "attention_mask": encoding["attention_mask"],
    "labels": [label.tolist() for label in encoding["labels"]]  
})

dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

class MultiLabelDataset(TorchDataset):  # torch Dataset, aliased to avoid clashing with datasets.Dataset
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = {key: self.dataset[idx][key].clone().detach() for key in ["input_ids", "attention_mask"]}
        item["labels"] = self.dataset[idx]["labels"].clone().detach().to(torch.float32)  # Multi-label, donc float
        return item

train_dataset = MultiLabelDataset(dataset["train"])
test_dataset = MultiLabelDataset(dataset["test"])

BATCH_SIZE = 16 

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    problem_type="multi_label_classification",
    num_labels=len(label_unique),
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3,
).to(device)

🚀 The ModernBERT code:



MODEL = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL)


def oneHot(labels, label_unique):
    return [1 if label in labels else 0 for label in label_unique]

class ModernBERTDataset(TorchDataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx], truncation=True, max_length=150, padding=False, return_tensors="pt")
        return {
            "input_ids": encoding["input_ids"].squeeze(0),  
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float32),
        }

def preprocess_data(new_df, label_unique):
    texts = new_df["text"].tolist()
    labels = [oneHot(label_list, label_unique) for label_list in new_df["labels"]]
    return ModernBERTDataset(texts, labels)

train_df, test_df = train_test_split(new_df, test_size=0.2, random_state=42)
train_dataset = preprocess_data(train_df, label_unique)  
test_dataset = preprocess_data(test_df, label_unique) 


data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

BATCH_SIZE = 16

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=data_collator)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator)


model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    problem_type="multi_label_classification",
    num_labels=len(label_unique),
    # ModernBERT's config uses different dropout parameter names, so BERT's
    # hidden_dropout_prob / attention_probs_dropout_prob kwargs are left out
    # and dropout is raised manually on every Dropout module below.
).to(device)

# Match the 0.3 dropout used in the BERT run.
for module in model.modules():
    if isinstance(module, torch.nn.Dropout):
        module.p = 0.3

🚀 The common code:


import numpy as np
import torch.nn as nn
import torch.optim as optim
import torchmetrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
num_labels = len(label_unique)
accuracy_metric = torchmetrics.Accuracy(task="multilabel", num_labels=num_labels).to(device)
f1_metric = torchmetrics.F1Score(task="multilabel", num_labels=num_labels, average="macro").to(device)
precision_metric = torchmetrics.Precision(task="multilabel", num_labels=num_labels, average="macro").to(device)
recall_metric = torchmetrics.Recall(task="multilabel", num_labels=num_labels, average="macro").to(device)

def test():
    y_true = []
    y_pred = []
    model.eval()
    total_loss = 0
    with torch.no_grad(): 
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = criterion(logits, labels)
            total_loss += loss.item()

            preds = torch.sigmoid(logits) > 0.5 


            y_true.append(labels.cpu().numpy())
            y_pred.append(preds.cpu().numpy())

    y_true = np.concatenate(y_true, axis=0)
    y_pred = np.concatenate(y_pred, axis=0)

    test_acc = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="macro", zero_division=0)
    # Normalize by the number of test batches (the original divided by len(train_loader) by mistake).
    print(f"Loss: {total_loss / len(test_loader):.4f} - Acc: {test_acc:.4f} - F1: {f1:.4f} - Precision: {precision:.4f} - Recall: {recall:.4f}")
    return test_acc, precision, recall, f1

EPOCHS = 100
history = {"loss": [], "accuracy": [], "f1": [], "precision": [], "recall": []}
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    accuracy_metric.reset()
    f1_metric.reset()
    precision_metric.reset()
    recall_metric.reset()

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()


        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits


        loss = criterion(logits, labels)
        loss.backward() 
        optimizer.step()  

        total_loss += loss.item()

        preds = torch.sigmoid(logits) > 0.5  


        accuracy_metric.update(preds, labels.int())
        f1_metric.update(preds, labels.int())
        precision_metric.update(preds, labels.int())
        recall_metric.update(preds, labels.int())

    train_acc = accuracy_metric.compute().item()
    train_f1 = f1_metric.compute().item()
    train_precision = precision_metric.compute().item()
    train_recall = recall_metric.compute().item()
    history["loss"].append(total_loss / len(train_loader))
    history["accuracy"].append(train_acc)
    history["f1"].append(train_f1)
    history["precision"].append(train_precision)
    history["recall"].append(train_recall)
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {total_loss / len(train_loader):.4f} - Acc: {train_acc:.4f} - F1: {train_f1:.4f} - Precision: {train_precision:.4f} - Recall: {train_recall:.4f}")
    print("_"*20,"> Test sur les donnΓ©es :")
    acc, prec, rec, f1 =test()
    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)

🚀 BERT's results on train:

[image attachment]

🚀 BERT's results on test:

[image attachment]

🚀 ModernBERT's results on train:

[image attachment]

🚀 ModernBERT's results on test:

[image attachment]

Thank you for your attention and your help!

I've been playing with the original BERT rather than ModernBERT, so I might not know what I'm talking about, but I think you might have more luck if you froze your model and focused on an output head and the last few layers. You can lose a lot of information if you overfit the model. Let the transfer learning in the pretrained model do its work, and keep the number of epochs low, since you only need to tune the last few layers.
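
Something like this rough sketch is what I mean (base_model is the generic transformers accessor for the pretrained encoder; which extra layers to unfreeze, if any, would differ between BERT and ModernBERT):

import torch

# model is the AutoModelForSequenceClassification from your post.
# Freeze the pretrained encoder and train only the new classification head.
for param in model.base_model.parameters():
    param.requires_grad = False

# Give the optimizer only the parameters that remain trainable.
optimizer = torch.optim.AdamW(
    (p for p in model.parameters() if p.requires_grad), lr=2e-5
)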

What's interesting is how slowly your model trains under ModernBERT. How big is your training set?
