In [1]:
import os

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from datasets import load_dataset
from dotenv import load_dotenv
from huggingface_hub import HfFolder, login
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

load_dotenv()

True

In [2]:
login(token=os.getenv("HUGGINGFACE_API_KEY"))

### Dataset prep

In [3]:
raw_dataset = load_dataset("Forecast-ing/email-clickthrough")

In [4]:
raw_dataset = raw_dataset.rename_column("label", "labels")

In [5]:
raw_dataset["train"].to_pandas()["text"].str.len().max()

3292

In [6]:
(raw_dataset["train"].to_pandas()["text"].str.len() > 2048).mean()

0.2427007299270073

In [7]:
raw_dataset["train"].to_pandas()["labels"].describe()

count    548.000000
mean       2.879635
std        2.423870
min        0.450000
25%        1.510000
50%        2.025000
75%        3.267500
max       25.370000
Name: labels, dtype: float64

In [8]:
raw_dataset = raw_dataset["train"].train_test_split(test_size=0.1, seed=1)

In [9]:
print(f"Train dataset size: {len(raw_dataset['train'])}")
print(f"Test dataset size: {len(raw_dataset['test'])}")

Train dataset size: 493
Test dataset size: 55


### Catboost Benchmark

In [10]:
catboost_train = raw_dataset["train"].to_pandas()
catboost_test = raw_dataset["test"].to_pandas()

In [11]:
text_columns = ["text"]
label = "labels"

In [12]:
train_pool = Pool(
    data=catboost_train[text_columns],
    label=catboost_train[label],
    text_features=text_columns,
)
test_pool = Pool(
    data=catboost_test[text_columns],
    label=catboost_test[label],
    text_features=text_columns,
)

In [13]:
model = CatBoostRegressor(loss_function="RMSE", verbose=100)

model.fit(train_pool, eval_set=test_pool)

Learning rate set to 0.045569
0:	learn: 2.4332854	test: 1.8670741	best: 1.8670741 (0)	total: 60.5ms	remaining: 1m
100:	learn: 1.4972558	test: 1.6247590	best: 1.6048404 (59)	total: 2.5s	remaining: 22.2s
200:	learn: 1.1104040	test: 1.6015944	best: 1.5975296 (197)	total: 4.91s	remaining: 19.5s
300:	learn: 0.8568033	test: 1.6102309	best: 1.5975296 (197)	total: 7.33s	remaining: 17s
400:	learn: 0.7096792	test: 1.6090190	best: 1.5975296 (197)	total: 9.72s	remaining: 14.5s
500:	learn: 0.6056532	test: 1.6083240	best: 1.5975296 (197)	total: 12.1s	remaining: 12s
600:	learn: 0.5298016	test: 1.6175366	best: 1.5975296 (197)	total: 14.5s	remaining: 9.64s
700:	learn: 0.4701467	test: 1.6262668	best: 1.5975296 (197)	total: 16.9s	remaining: 7.23s
800:	learn: 0.4233732	test: 1.6199203	best: 1.5975296 (197)	total: 19.4s	remaining: 4.81s
900:	learn: 0.3837074	test: 1.6104091	best: 1.5975296 (197)	total: 21.8s	remaining: 2.39s
999:	learn: 0.3501113	test: 1.6131207	best: 1.5975296 (197)	total: 24.2s	remaining

<catboost.core.CatBoostRegressor at 0x7fb1061c5bb0>

In [14]:
y_pred = model.predict(test_pool)
y_val = catboost_test[label]

In [15]:
def smape(y_true, y_pred):
    return 100 * np.mean(
        2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))
    )


def calculate_metrics(y_val, y_pred):
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    smape_value = smape(y_val, y_pred)
    return {
        "mse": mse,
        "rmse": rmse,
        "mae": mae,
        "r2": r2,
        "smape": smape_value,
    }

In [16]:
catboost_metrics = calculate_metrics(y_val, y_pred)

In [17]:
catboost_metrics

{'mse': 2.552100633998035,
 'rmse': 1.5975295408843102,
 'mae': 1.1439370629666958,
 'r2': 0.30127932054387174,
 'smape': 37.63064694052479}

### Fine Tuning Modern Bert

In [18]:
model_id = "answerdotai/ModernBERT-base"

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.model_max_length = 2048

def tokenize(batch):
    return tokenizer(
        batch["text"], padding="max_length", truncation=True, return_tensors="pt"
    )

In [19]:
tokenized_dataset = raw_dataset.map(tokenize, batched=True, remove_columns=["text"])

In [20]:
def model_init(trial):
    model = AutoModelForSequenceClassification.from_pretrained(
        model_id, num_labels=1, ignore_mismatched_sizes=True, problem_type="regression"
    )
    return model

In [21]:
def gen_training_args(additional_args={}):
    default_args = {
        "output_dir": "./modernBERT-content-regression",
        "per_device_eval_batch_size": 4,
        "per_device_train_batch_size": 4,
        "num_train_epochs": 5,
        "bf16": True,  # bfloat16 training
        "optim": "adamw_torch_fused",  # improved optimizer
        "logging_strategy": "steps",
        "logging_steps": 1,
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "save_total_limit": 1,
        "metric_for_best_model": "rmse",
        "greater_is_better": False,
        "report_to": "tensorboard",
        "push_to_hub": True,
        "hub_private_repo": True,
        "hub_strategy": "every_save",
        "hub_token": HfFolder.get_token(),
    }
    training_args = TrainingArguments(**default_args, **additional_args)
    return training_args

In [22]:
def compute_metrics_for_regression(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.reshape(-1, 1)
    results =  calculate_metrics(labels, predictions)
    return results


In [23]:
hp_trainer = Trainer(
    model=None,
    args=gen_training_args(),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_for_regression,
    model_init=model_init,
)

  hp_trainer = Trainer(
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 5e-7, 5e-5, log=True),
    }

In [25]:
best_trial = hp_trainer.hyperparameter_search(
    direction="minimize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=5,
    compute_objective=lambda x: x['eval_rmse'],
)

[I 2025-01-09 12:16:25,726] A new study created in memory with name: no-name-2f3f9073-d130-4bb1-9447-7262f2b7bd75
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,0.238,4.573008,4.573008,2.138459,1.32454,-0.25201,54.242009
2,3.7685,4.093452,4.093452,2.023228,1.458057,-0.120716,53.77084
3,27.661,3.361875,3.361874,1.833541,1.12667,0.079577,52.641284
4,0.0923,2.759459,2.759459,1.661162,1.040074,0.244508,53.009331
5,0.0203,2.73325,2.73325,1.653254,1.078653,0.251684,54.187167


[I 2025-01-09 12:19:57,000] Trial 0 finished with value: 1.6532543369745685 and parameters: {'learning_rate': 1.9437267223645173e-05}. Best is trial 0 with value: 1.6532543369745685.
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,0.0335,3.730757,3.730757,1.931517,1.167591,-0.021416,46.438679
2,3.0211,3.532418,3.53242,1.879473,1.171051,0.032885,48.273236
3,32.4544,3.670944,3.670944,1.915971,1.159171,-0.005041,48.529482
4,0.0743,3.690546,3.690546,1.921079,1.179955,-0.010407,49.107727
5,0.0988,3.677439,3.677439,1.917665,1.188619,-0.006819,49.251461


[I 2025-01-09 12:23:31,566] Trial 1 finished with value: 1.91766510403085 and parameters: {'learning_rate': 1.5810058165067856e-06}. Best is trial 0 with value: 1.6532543369745685.
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,0.3115,4.09059,4.09059,2.022521,1.229977,-0.119932,50.514507
2,2.6528,4.852318,4.852319,2.202798,1.465739,-0.32848,54.715651
3,24.6264,3.33161,3.33161,1.82527,1.143937,0.087863,51.89842
4,0.2896,2.353773,2.353773,1.534201,1.079125,0.355578,55.779856
5,0.0014,2.629261,2.629261,1.6215,1.166006,0.280154,57.977718


[I 2025-01-09 12:27:05,020] Trial 2 finished with value: 1.6214995462309338 and parameters: {'learning_rate': 2.479942619764035e-05}. Best is trial 2 with value: 1.6214995462309338.
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,0.008,3.590378,3.590379,1.894829,1.149898,0.017017,46.445611
2,2.704,3.476464,3.476464,1.864528,1.125,0.048205,47.319812
3,32.0993,3.543669,3.543668,1.882463,1.123369,0.029805,47.717217
4,0.0582,3.590872,3.590872,1.89496,1.142273,0.016882,48.410091
5,0.0846,3.600572,3.600573,1.897517,1.145824,0.014226,48.548377


[I 2025-01-09 12:30:33,965] Trial 3 finished with value: 1.8975174797770824 and parameters: {'learning_rate': 1.1750268648920993e-06}. Best is trial 2 with value: 1.6214995462309338.
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,0.0856,3.761341,3.761341,1.939418,1.156432,-0.02979,46.601269
2,2.9134,3.756832,3.756831,1.938255,1.238454,-0.028555,49.874967
3,32.2766,3.654472,3.654473,1.911668,1.135091,-0.000531,48.73234
4,0.083,3.665871,3.665871,1.914646,1.162767,-0.003652,49.43971
5,0.0558,3.610057,3.610057,1.900015,1.183222,0.011629,49.474382


[I 2025-01-09 12:34:05,271] Trial 4 finished with value: 1.9000149676084739 and parameters: {'learning_rate': 2.308984942228097e-06}. Best is trial 2 with value: 1.6214995462309338.


In [26]:
best_trial

BestRun(run_id='2', objective=1.6214995462309338, hyperparameters={'learning_rate': 2.479942619764035e-05}, run_summary=None)

### Fit and upload the best Model
We re-fit the model with the best hyperparameters in accordaince with this [forum post](https://discuss.huggingface.co/t/how-to-save-the-best-trials-model-using-trainer-hyperparameter-search/8783/4)

In [27]:
best_trainer = Trainer(
    model=model_init(None),
    args=gen_training_args({**best_trial.hyperparameters}),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics_for_regression,
)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
best_trainer.train() 

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,0.1152,4.084211,4.084211,2.020943,1.219903,-0.118186,49.023473
2,1.239,3.803578,3.803578,1.950276,1.289222,-0.041354,52.775413
3,27.8256,3.245966,3.245967,1.801657,1.102216,0.111311,51.74703
4,0.0001,2.413429,2.413429,1.553521,1.081085,0.339245,52.221513
5,0.1666,2.462405,2.462406,1.569205,1.182182,0.325836,56.61447


TrainOutput(global_step=620, training_loss=4.329616037725622, metrics={'train_runtime': 205.4329, 'train_samples_per_second': 11.999, 'train_steps_per_second': 3.018, 'total_flos': 3359849068769280.0, 'train_loss': 4.329616037725622, 'epoch': 5.0})

In [29]:
best_trainer.evaluate()

{'eval_loss': 2.4624054431915283,
 'eval_mse': 2.4624056816101074,
 'eval_rmse': 1.5692054300218654,
 'eval_mae': 1.182181715965271,
 'eval_r2': 0.325836181640625,
 'eval_smape': 56.61447048187256,
 'eval_runtime': 1.3489,
 'eval_samples_per_second': 40.774,
 'eval_steps_per_second': 10.379,
 'epoch': 5.0}

In [30]:
tokenizer.save_pretrained("modernBERT-content-regression")
best_trainer.create_model_card()
best_trainer.push_to_hub()

events.out.tfevents.1736455080.bazzite:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Forecast-ing/modernBERT-content-regression/commit/16f1dc87782b2735f8fef84a5b10807b6cbe5565', commit_message='End of training', commit_description='', oid='16f1dc87782b2735f8fef84a5b10807b6cbe5565', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Forecast-ing/modernBERT-content-regression', endpoint='https://huggingface.co', repo_type='model', repo_id='Forecast-ing/modernBERT-content-regression'), pr_revision=None, pr_num=None)