ModernBERT vs BERT for text classification
Hello,
I tried comparing the BERT and ModernBERT models on a classification task, and I've reached the point where I find ModernBERT less performant than BERT. I'm wondering if it might just be an issue with the model initialization during fine-tuning. If anyone has any thoughts on this, I'd love to hear them!
Thanks in advance!
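(On the initialization point: the classification head is randomly initialized every time from_pretrained is called, so fixing the seed before loading at least makes runs comparable. A minimal sketch using transformers' set_seed helper:)
from transformers import set_seed

# Seeds Python, NumPy and PyTorch, so the freshly initialized
# classifier head gets the same random weights on every run
set_seed(42)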
The BERT code:
import torch
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assumptions: the exact BERT checkpoint is not shown in the original post;
# new_df (a DataFrame with "text" and "labels" columns) and label_unique
# (the list of distinct labels) are defined earlier in the notebook.
MODEL = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
def oneHot(labels, label_unique):
    one_hot = []
    for label in label_unique:
        if label in labels:
            one_hot.append(1)
        else:
            one_hot.append(0)
    return one_hot

def preprocess_data(new_df, label_unique):
    new_df["one_hot_labels"] = new_df["labels"].apply(lambda x: oneHot(x, label_unique))
    encoding = tokenizer(new_df["text"].tolist(),
                         truncation=True,
                         max_length=150,
                         padding="longest",
                         return_tensors="pt")
    encoding["labels"] = torch.stack([torch.tensor(o, dtype=torch.float32) for o in new_df["one_hot_labels"]])
    return encoding
encoding = preprocess_data(new_df, label_unique)
dataset = Dataset.from_dict({
    "input_ids": encoding["input_ids"],
    "attention_mask": encoding["attention_mask"],
    "labels": [label.tolist() for label in encoding["labels"]],
})
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
class MultiLabelDataset(torch.utils.data.Dataset):  # PyTorch Dataset, not datasets.Dataset
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = {key: self.dataset[idx][key].clone().detach() for key in ["input_ids", "attention_mask"]}
        item["labels"] = self.dataset[idx]["labels"].clone().detach().to(torch.float32)  # multi-label, so float
        return item
train_dataset = MultiLabelDataset(dataset["train"])
test_dataset = MultiLabelDataset(dataset["test"])
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    problem_type="multi_label_classification",
    num_labels=len(label_unique),
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3,
).to(device)
The ModernBERT code:
MODEL = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
def oneHot(labels, label_unique):
    return [1 if label in labels else 0 for label in label_unique]

class ModernBERTDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize per sample without padding; the collator pads per batch
        encoding = tokenizer(self.texts[idx], truncation=True, max_length=150,
                             padding=False, return_tensors="pt")
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float32),
        }

def preprocess_data(new_df, label_unique):
    texts = new_df["text"].tolist()
    labels = [oneHot(label_list, label_unique) for label_list in new_df["labels"]]
    return ModernBERTDataset(texts, labels)
train_df, test_df = train_test_split(new_df, test_size=0.2, random_state=42)
train_dataset = preprocess_data(train_df, label_unique)
test_dataset = preprocess_data(test_df, label_unique)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=data_collator)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    problem_type="multi_label_classification",
    num_labels=len(label_unique),
    # hidden_dropout_prob=0.3,           # BERT-specific config names,
    # attention_probs_dropout_prob=0.3,  # not used by ModernBERT
).to(device)

# ModernBERT's config does not take the BERT dropout kwargs above,
# so raise the probability of every nn.Dropout module by hand instead
for module in model.modules():
    if isinstance(module, torch.nn.Dropout):
        module.p = 0.3
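An alternative to patching the modules: ModernBERT's config seems to expose its own dropout settings under different names (attention_dropout, mlp_dropout, embedding_dropout, classifier_dropout); I'm going from memory here, so verify the names against your transformers version:
# Assumed ModernBERT config names, to be checked in your transformers version
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    problem_type="multi_label_classification",
    num_labels=len(label_unique),
    attention_dropout=0.3,   # dropout inside the attention blocks
    mlp_dropout=0.3,         # dropout inside the MLP blocks
    classifier_dropout=0.3,  # dropout before the classification head
).to(device)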
The common code:
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torchmetrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
num_labels = len(label_unique)
accuracy_metric = torchmetrics.Accuracy(task="multilabel", num_labels=num_labels).to(device)
f1_metric = torchmetrics.F1Score(task="multilabel", num_labels=num_labels, average="macro").to(device)
precision_metric = torchmetrics.Precision(task="multilabel", num_labels=num_labels, average="macro").to(device)
recall_metric = torchmetrics.Recall(task="multilabel", num_labels=num_labels, average="macro").to(device)
def test():
    y_true = []
    y_pred = []
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)
            total_loss += loss.item()
            preds = torch.sigmoid(logits) > 0.5
            y_true.append(labels.cpu().numpy())
            y_pred.append(preds.cpu().numpy())
    y_true = np.concatenate(y_true, axis=0)
    y_pred = np.concatenate(y_pred, axis=0)
    test_acc = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="macro", zero_division=0)
    print(f"Loss: {total_loss / len(test_loader):.4f} - Acc: {test_acc:.4f} - F1: {f1:.4f} - Precision: {precision:.4f} - Recall: {recall:.4f}")
    return test_acc, precision, recall, f1
EPOCHS = 100
history = {"loss": [], "accuracy": [], "f1": [], "precision": [], "recall": []}
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    accuracy_metric.reset()
    f1_metric.reset()
    precision_metric.reset()
    recall_metric.reset()
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        preds = torch.sigmoid(logits) > 0.5
        accuracy_metric.update(preds, labels.int())
        f1_metric.update(preds, labels.int())
        precision_metric.update(preds, labels.int())
        recall_metric.update(preds, labels.int())
    train_acc = accuracy_metric.compute().item()
    train_f1 = f1_metric.compute().item()
    train_precision = precision_metric.compute().item()
    train_recall = recall_metric.compute().item()
    history["loss"].append(total_loss / len(train_loader))
    history["accuracy"].append(train_acc)
    history["f1"].append(train_f1)
    history["precision"].append(train_precision)
    history["recall"].append(train_recall)
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {total_loss / len(train_loader):.4f} - Acc: {train_acc:.4f} - F1: {train_f1:.4f} - Precision: {train_precision:.4f} - Recall: {train_recall:.4f}")
    print("_" * 20, "> Test on the data:")
    acc, prec, rec, f1 = test()
    accuracy_list.append(acc)
    precision_list.append(prec)
    recall_list.append(rec)
    f1_list.append(f1)
BERT's results on train:
BERT's results on test:
ModernBERT's results on train:
ModernBERT's results on test:
Thank you for your attention and your help!
I've been playing with the original BERT rather than ModernBERT, so I might not know what I'm talking about, but I think you might have more luck if you froze your BERT model and just focused on the output head and the last few layers. You can lose a lot of information if you overfit the model. Let the transfer learning in the original model do its work, and keep the number of epochs low, since you only need to tune the last few layers. Roughly like the sketch below.
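A minimal sketch of that idea (it assumes the AutoModelForSequenceClassification from your BERT setup; the model.bert.encoder.layer path is BERT-specific, and unfreezing the last two layers is just one possible choice):
import torch.optim as optim

# Freeze the entire pre-trained encoder
for param in model.base_model.parameters():
    param.requires_grad = False

# Optionally unfreeze the last two encoder layers
# (attribute path for BERT; other models name their layers differently)
for layer in model.bert.encoder.layer[-2:]:
    for param in layer.parameters():
        param.requires_grad = True

# The freshly initialized classifier head stays trainable; give the
# optimizer only the parameters that still require gradients
optimizer = optim.AdamW((p for p in model.parameters() if p.requires_grad), lr=2e-5)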
What's interesting is how slowly your model trains under ModernBERT. How big is your training set?