diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..53199983ebc33906dd07db26b461142803b111fa
--- /dev/null
+++ b/README.md
@@ -0,0 +1,73 @@
+---
+library_name: transformers
+license: apache-2.0
+base_model: answerdotai/ModernBERT-base
+tags:
+- generated_from_trainer
+metrics:
+- accuracy
+- precision
+- recall
+- f1
+model-index:
+- name: answerdotai-ModernBERT-base-finetuned
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+# answerdotai-ModernBERT-base-finetuned
+
+This model is a fine-tuned version of [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base) on an unspecified dataset (the dataset name was not recorded by the Trainer).
+It achieves the following results on the evaluation set:
+- Loss: 0.0116
+- Accuracy: 0.9976
+- Precision: 0.9977
+- Recall: 0.9976
+- F1: 0.9976
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 4.244005797262286e-05
+- train_batch_size: 32
+- eval_batch_size: 32
+- seed: 42
+- optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9,0.999) and epsilon=1e-08; no additional optimizer arguments
+- lr_scheduler_type: linear
+- num_epochs: 7
+
+### Training results
+
+| Training Loss | Epoch | Step  | Validation Loss | Accuracy | Precision | Recall | F1     |
+|:-------------:|:-----:|:-----:|:---------------:|:--------:|:---------:|:------:|:------:|
+| 0.0175        | 1.0   | 1506  | 0.0195          | 0.9971   | 0.9971    | 0.9971 | 0.9971 |
+| 0.0134        | 2.0   | 3012  | 0.0153          | 0.9970   | 0.9970    | 0.9970 | 0.9970 |
+| 0.0           | 3.0   | 4518  | 0.0228          | 0.9976   | 0.9976    | 0.9976 | 0.9976 |
+| 0.0           | 4.0   | 6024  | 0.0270          | 0.9976   | 0.9976    | 0.9976 | 0.9976 |
+| 0.0           | 5.0   | 7530  | 0.0272          | 0.9976   | 0.9976    | 0.9976 | 0.9976 |
+| 0.0           | 6.0   | 9036  | 0.0279          | 0.9975   | 0.9975    | 0.9975 | 0.9975 |
+| 0.0           | 7.0   | 10542 | 0.0283          | 0.9975   | 0.9975    | 0.9975 | 0.9975 |
+
+
+### Framework versions
+
+- Transformers 4.48.0.dev0
+- Pytorch 2.5.1+cu124
+- Datasets 3.2.0
+- Tokenizers 0.21.0
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00
--- /dev/null
+++ b/config.json
@@ -0,0 +1,47 @@
+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}
diff --git a/model.safetensors b/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6db42373842b3d2170c94e43770efb05eece8d27
--- /dev/null
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd2c8555404b25095196f950baad8216db0404ff16448d62a6d453105d7bd0c7
+size 598439784
diff --git a/training_args.bin b/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..53db8c652951b05fb4cc2463b5ac012b3537cf3b
--- /dev/null
+++ b/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33b0c987e99ad21c3b9517dc831f21fd66bcbcd55d62a62f0a28008a0e8674e2
+size 5432
diff --git a/trial-0/checkpoint-1506/config.json b/trial-0/checkpoint-1506/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00
--- /dev/null
+++ b/trial-0/checkpoint-1506/config.json
@@ -0,0 +1,47 @@
+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}
diff --git a/trial-0/checkpoint-1506/model.safetensors b/trial-0/checkpoint-1506/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..38ac238ee34c059b26319be2afc84b0906a866bb
--- /dev/null
+++ b/trial-0/checkpoint-1506/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68eefa4a9be7b2db68618e1cb44c2cdf2163fb53cc3380fc52767266b121ddd2
+size 598439784
diff --git a/trial-0/checkpoint-1506/optimizer.pt b/trial-0/checkpoint-1506/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6047f5e3f57300ba9e2600b46a7177595090dc1e
--- /dev/null
+++ b/trial-0/checkpoint-1506/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08a1a4cc69805f73befa2723d41c1d97c0a2f799125f15e25de8295d6c23580c
+size 1196967418
diff --git a/trial-0/checkpoint-1506/rng_state.pth b/trial-0/checkpoint-1506/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..cf3d91c5392ca6b7d7e0880933b7830a896d7c9e
--- /dev/null
+++ b/trial-0/checkpoint-1506/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:568428d80a25211a390c359ca51b0b20b38ca0607fbc196f106c9841c02d3e59
+size 14244
diff --git a/trial-0/checkpoint-1506/scheduler.pt b/trial-0/checkpoint-1506/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a4cea32b21906cb3dbe285f9886b2ec4db548048
--- /dev/null
+++ b/trial-0/checkpoint-1506/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5bddebb63f2196cebff07c6da8f9e668e8379463981f8be40fb7e151e6c09ff
+size 1064
diff --git a/trial-0/checkpoint-1506/trainer_state.json b/trial-0/checkpoint-1506/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c4400a234fad0c2009013341eb9a53199e2d87b9
--- /dev/null
+++ b/trial-0/checkpoint-1506/trainer_state.json
@@ -0,0 +1,255 @@
+{
+  "best_metric": 0.02135350927710533,
+  "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-0/checkpoint-1506",
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1506,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.033200531208499334,
+      "grad_norm": 11.822611808776855,
+      "learning_rate": 4.4935320035267014e-05,
+      "loss": 0.295,
+      "step": 50
+    },
+    {
+      "epoch": 0.06640106241699867,
+      "grad_norm": 0.11557121574878693,
+      "learning_rate": 4.463495024893502e-05,
+      "loss": 0.0808,
+      "step": 100
+    },
+    {
+      "epoch": 0.099601593625498,
+      "grad_norm": 0.01743650808930397,
+      "learning_rate": 4.433458046260302e-05,
+      "loss": 0.052,
+      "step": 150
+    },
+    {
+      "epoch": 0.13280212483399734,
+      "grad_norm": 4.474731922149658,
+      "learning_rate": 4.4034210676271024e-05,
+      "loss": 0.0491,
+      "step": 200
+    },
+    {
+      "epoch": 0.16600265604249667,
+      "grad_norm": 4.205756664276123,
+      "learning_rate": 4.373384088993902e-05,
+      "loss": 0.0344,
+      "step": 250
+    },
+    {
+      "epoch": 0.199203187250996,
+      "grad_norm": 4.239188194274902,
+      "learning_rate": 4.343347110360703e-05,
+      "loss": 0.0295,
+      "step": 300
+    },
+    {
+      "epoch": 0.23240371845949534,
+      "grad_norm": 0.19662700593471527,
+      "learning_rate": 4.3133101317275027e-05,
+      "loss": 0.0342,
+      "step": 350
+    },
+    {
+      "epoch": 0.2656042496679947,
+      "grad_norm": 0.008393031544983387,
+      "learning_rate": 4.2832731530943025e-05,
+      "loss": 0.0245,
+      "step": 400
+    },
+    {
+      "epoch": 0.29880478087649404,
+      "grad_norm": 0.06995929777622223,
+      "learning_rate": 4.253236174461103e-05,
+      "loss": 0.0281,
+      "step": 450
+    },
+    {
+      "epoch": 0.33200531208499334,
+      "grad_norm": 0.010315222665667534,
+      "learning_rate": 4.223199195827902e-05,
+      "loss": 0.0188,
+      "step": 500
+    },
+    {
+      "epoch": 0.3652058432934927,
+      "grad_norm": 3.1021769046783447,
+      "learning_rate": 4.193162217194703e-05,
+      "loss": 0.018,
+      "step": 550
+    },
+    {
+      "epoch": 0.398406374501992,
+      "grad_norm": 0.00041495164623484015,
+      "learning_rate": 4.1631252385615027e-05,
+      "loss": 0.0053,
+      "step": 600
+    },
+    {
+      "epoch": 0.4316069057104914,
+      "grad_norm": 0.19596342742443085,
+      "learning_rate": 4.133088259928303e-05,
+      "loss": 0.0178,
+      "step": 650
+    },
+    {
+      "epoch": 0.4648074369189907,
+      "grad_norm": 0.0566418319940567,
+      "learning_rate": 4.103051281295103e-05,
+      "loss": 0.0101,
+      "step": 700
+    },
+    {
+      "epoch": 0.49800796812749004,
+      "grad_norm": 0.005816417746245861,
+      "learning_rate": 4.0730143026619036e-05,
+      "loss": 0.0166,
+      "step": 750
+    },
+    {
+      "epoch": 0.5312084993359893,
+      "grad_norm": 2.2474324703216553,
+      "learning_rate": 4.0429773240287035e-05,
+      "loss": 0.0156,
+      "step": 800
+    },
+    {
+      "epoch": 0.5644090305444888,
+      "grad_norm": 0.06311876326799393,
+      "learning_rate": 4.0129403453955033e-05,
+      "loss": 0.0166,
+      "step": 850
+    },
+    {
+      "epoch": 0.5976095617529881,
+      "grad_norm": 0.012764506973326206,
+      "learning_rate": 3.982903366762304e-05,
+      "loss": 0.0175,
+      "step": 900
+    },
+    {
+      "epoch": 0.6308100929614874,
+      "grad_norm": 0.00253055221401155,
+      "learning_rate": 3.952866388129104e-05,
+      "loss": 0.0047,
+      "step": 950
+    },
+    {
+      "epoch": 0.6640106241699867,
+      "grad_norm": 0.03604559600353241,
+      "learning_rate": 3.922829409495904e-05,
+      "loss": 0.016,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6972111553784861,
+      "grad_norm": 0.006498202681541443,
+      "learning_rate": 3.892792430862704e-05,
+      "loss": 0.0055,
+      "step": 1050
+    },
+    {
+      "epoch": 0.7304116865869854,
+      "grad_norm": 0.11296769976615906,
+      "learning_rate": 3.862755452229504e-05,
+      "loss": 0.0122,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7636122177954847,
+      "grad_norm": 0.0005851402529515326,
+      "learning_rate": 3.8327184735963046e-05,
+      "loss": 0.01,
+      "step": 1150
+    },
+    {
+      "epoch": 0.796812749003984,
+      "grad_norm": 0.018440622836351395,
+      "learning_rate": 3.8026814949631044e-05,
+      "loss": 0.0064,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8300132802124834,
+      "grad_norm": 0.0023099363315850496,
+      "learning_rate": 3.772644516329905e-05,
+      "loss": 0.0011,
+      "step": 1250
+    },
+    {
+      "epoch": 0.8632138114209827,
+      "grad_norm": 0.07595626264810562,
+      "learning_rate": 3.742607537696705e-05,
+      "loss": 0.0156,
+      "step": 1300
+    },
+    {
+      "epoch": 0.896414342629482,
+      "grad_norm": 0.0008996099350042641,
+      "learning_rate": 3.7125705590635054e-05,
+      "loss": 0.0103,
+      "step": 1350
+    },
+    {
+      "epoch": 0.9296148738379814,
+      "grad_norm": 3.656134504126385e-05,
+      "learning_rate": 3.682533580430305e-05,
+      "loss": 0.0027,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9628154050464808,
+      "grad_norm": 0.2666904032230377,
+      "learning_rate": 3.652496601797105e-05,
+      "loss": 0.0152,
+      "step": 1450
+    },
+    {
+      "epoch": 0.9960159362549801,
+      "grad_norm": 0.011590929701924324,
+      "learning_rate": 3.622459623163905e-05,
+      "loss": 0.0115,
+      "step": 1500
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.9963024809160306,
+      "eval_f1": 0.9962997469825083,
+      "eval_loss": 0.02135350927710533,
+      "eval_precision": 0.9962971957079396,
+      "eval_recall": 0.9963024809160306,
+      "eval_runtime": 34.0647,
+      "eval_samples_per_second": 246.12,
+      "eval_steps_per_second": 7.691,
+      "step": 1506
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 7530,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.641430544259072e+16,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/trial-0/checkpoint-1506/training_args.bin b/trial-0/checkpoint-1506/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0fc19de9741f9cf3edab1a1fa1574f04d82d4230
--- /dev/null
+++ b/trial-0/checkpoint-1506/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f87e0989b8aabc63686d8b1c4f4f6463501f9b534fd10b5dda472e02e5c6d200
+size 5368
diff --git a/trial-1/checkpoint-6022/config.json b/trial-1/checkpoint-6022/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00
--- /dev/null
+++ b/trial-1/checkpoint-6022/config.json
@@ -0,0 +1,47 @@
+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}
diff --git a/trial-1/checkpoint-6022/model.safetensors b/trial-1/checkpoint-6022/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..80faa709e3d36df7440067820aa5d5abd8ad496c
--- /dev/null
+++ b/trial-1/checkpoint-6022/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9376e02caf20a3536db5adaec49e89c8583378974c975bdfa4e4fa72bb7ed87c
+size 598439784
diff --git a/trial-1/checkpoint-6022/optimizer.pt b/trial-1/checkpoint-6022/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a59f32c3ea7e53f052a8ef9cb24760a539fd4a95
--- /dev/null
+++ b/trial-1/checkpoint-6022/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f989a18c3b9f0cb969ade19c78b7d7d4405053c69000081f12d16f8076c4691
+size 1196967418
diff --git a/trial-1/checkpoint-6022/rng_state.pth b/trial-1/checkpoint-6022/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b387de0c48181ec5812538ddf1fc60cfda1a89c1
--- /dev/null
+++ b/trial-1/checkpoint-6022/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
+size 14244
diff --git a/trial-1/checkpoint-6022/scheduler.pt b/trial-1/checkpoint-6022/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..857465863c7713d2a96855a82d291ced6f6cc956
--- /dev/null
+++ b/trial-1/checkpoint-6022/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04bd594b0cd8e46cee28cfc34b0ba6a02854df28789c81eb4c180d9356f4de00
+size 1064
diff --git a/trial-1/checkpoint-6022/trainer_state.json b/trial-1/checkpoint-6022/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f02d164201316343f5d90f30868ba4645b8db0aa
--- /dev/null
+++ b/trial-1/checkpoint-6022/trainer_state.json
@@ -0,0 +1,897 @@
+{
+  "best_metric": 0.0445549376308918,
+  "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-1/checkpoint-6022",
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 6022,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016605778811026237,
+      "grad_norm": 15.757351875305176,
+      "learning_rate": 2.4306427769118723e-06,
+      "loss": 0.6703,
+      "step": 50
+    },
+    {
+      "epoch": 0.033211557622052475,
+      "grad_norm": 14.056926727294922,
+      "learning_rate": 2.425586942863882e-06,
+      "loss": 0.4736,
+      "step": 100
+    },
+    {
+      "epoch": 0.04981733643307871,
+      "grad_norm": 15.678231239318848,
+      "learning_rate": 2.4205311088158915e-06,
+      "loss": 0.338,
+      "step": 150
+    },
+    {
+      "epoch": 0.06642311524410495,
+      "grad_norm": 4.84220552444458,
+      "learning_rate": 2.4154752747679013e-06,
+      "loss": 0.2931,
+      "step": 200
+    },
+    {
+      "epoch": 0.08302889405513118,
+      "grad_norm": 5.182389736175537,
+      "learning_rate": 2.4104194407199107e-06,
+      "loss": 0.251,
+      "step": 250
+    },
+    {
+      "epoch": 0.09963467286615742,
+      "grad_norm": 1.5187151432037354,
+      "learning_rate": 2.4053636066719205e-06,
+      "loss": 0.2133,
+      "step": 300
+    },
+    {
+      "epoch": 0.11624045167718366,
+      "grad_norm": 16.253589630126953,
+      "learning_rate": 2.40030777262393e-06,
+      "loss": 0.1518,
+      "step": 350
+    },
+    {
+      "epoch": 0.1328462304882099,
+      "grad_norm": 6.757865905761719,
+      "learning_rate": 2.3952519385759397e-06,
+      "loss": 0.1508,
+      "step": 400
+    },
+    {
+      "epoch": 0.14945200929923613,
+      "grad_norm": 2.119438886642456,
+      "learning_rate": 2.390196104527949e-06,
+      "loss": 0.1175,
+      "step": 450
+    },
+    {
+      "epoch": 0.16605778811026237,
+      "grad_norm": 15.932334899902344,
+      "learning_rate": 2.3851402704799585e-06,
+      "loss": 0.1401,
+      "step": 500
+    },
+    {
+      "epoch": 0.1826635669212886,
+      "grad_norm": 22.459735870361328,
+      "learning_rate": 2.3800844364319683e-06,
+      "loss": 0.1384,
+      "step": 550
+    },
+    {
+      "epoch": 0.19926934573231483,
+      "grad_norm": 10.65778923034668,
+      "learning_rate": 2.3750286023839777e-06,
+      "loss": 0.1179,
+      "step": 600
+    },
+    {
+      "epoch": 0.2158751245433411,
+      "grad_norm": 6.71965217590332,
+      "learning_rate": 2.3699727683359876e-06,
+      "loss": 0.0782,
+      "step": 650
+    },
+    {
+      "epoch": 0.23248090335436733,
+      "grad_norm": 3.6098344326019287,
+      "learning_rate": 2.364916934287997e-06,
+      "loss": 0.138,
+      "step": 700
+    },
+    {
+      "epoch": 0.24908668216539356,
+      "grad_norm": 2.3249447345733643,
+      "learning_rate": 2.3598611002400068e-06,
+      "loss": 0.1087,
+      "step": 750
+    },
+    {
+      "epoch": 0.2656924609764198,
+      "grad_norm": 15.047837257385254,
+      "learning_rate": 2.354805266192016e-06,
+      "loss": 0.0868,
+      "step": 800
+    },
+    {
+      "epoch": 0.282298239787446,
+      "grad_norm": 6.7322773933410645,
+      "learning_rate": 2.349749432144026e-06,
+      "loss": 0.0954,
+      "step": 850
+    },
+    {
+      "epoch": 0.29890401859847227,
+      "grad_norm": 12.954623222351074,
+      "learning_rate": 2.3446935980960354e-06,
+      "loss": 0.0689,
+      "step": 900
+    },
+    {
+      "epoch": 0.3155097974094985,
+      "grad_norm": 1.4312756061553955,
+      "learning_rate": 2.3396377640480448e-06,
+      "loss": 0.0908,
+      "step": 950
+    },
+    {
+      "epoch": 0.33211557622052473,
+      "grad_norm": 0.21316280961036682,
+      "learning_rate": 2.3345819300000546e-06,
+      "loss": 0.0766,
+      "step": 1000
+    },
+    {
+      "epoch": 0.348721355031551,
+      "grad_norm": 13.642809867858887,
+      "learning_rate": 2.329526095952064e-06,
+      "loss": 0.0533,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3653271338425772,
+      "grad_norm": 14.525202751159668,
+      "learning_rate": 2.324470261904074e-06,
+      "loss": 0.0745,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38193291265360346,
+      "grad_norm": 0.5210687518119812,
+      "learning_rate": 2.319414427856083e-06,
+      "loss": 0.0618,
+      "step": 1150
+    },
+    {
+      "epoch": 0.39853869146462967,
+      "grad_norm": 0.07292640954256058,
+      "learning_rate": 2.314358593808093e-06,
+      "loss": 0.0307,
+      "step": 1200
+    },
+    {
+      "epoch": 0.41514447027565593,
+      "grad_norm": 0.08236780017614365,
+      "learning_rate": 2.309302759760103e-06,
+      "loss": 0.0321,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4317502490866822,
+      "grad_norm": 28.97471809387207,
+      "learning_rate": 2.304246925712112e-06,
+      "loss": 0.0748,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4483560278977084,
+      "grad_norm": 0.4781515896320343,
+      "learning_rate": 2.2991910916641216e-06,
+      "loss": 0.0733,
+      "step": 1350
+    },
+    {
+      "epoch": 0.46496180670873466,
+      "grad_norm": 3.214794397354126,
+      "learning_rate": 2.2941352576161314e-06,
+      "loss": 0.0149,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48156758551976087,
+      "grad_norm": 0.3289443850517273,
+      "learning_rate": 2.289079423568141e-06,
+      "loss": 0.0401,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4981733643307871,
+      "grad_norm": 0.12368986010551453,
+      "learning_rate": 2.28402358952015e-06,
+      "loss": 0.0334,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5147791431418134,
+      "grad_norm": 0.08283340185880661,
+      "learning_rate": 2.27896775547216e-06,
+      "loss": 0.0331,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5313849219528396,
+      "grad_norm": 2.650063991546631,
+      "learning_rate": 2.2739119214241694e-06,
+      "loss": 0.0496,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5479907007638658,
+      "grad_norm": 3.296297311782837,
+      "learning_rate": 2.2688560873761792e-06,
+      "loss": 0.0365,
+      "step": 1650
+    },
+    {
+      "epoch": 0.564596479574892,
+      "grad_norm": 0.032304324209690094,
+      "learning_rate": 2.263800253328189e-06,
+      "loss": 0.005,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5812022583859183,
+      "grad_norm": 0.003552216337993741,
+      "learning_rate": 2.2587444192801985e-06,
+      "loss": 0.0183,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5978080371969445,
+      "grad_norm": 0.0315885953605175,
+      "learning_rate": 2.253688585232208e-06,
+      "loss": 0.0184,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6144138160079707,
+      "grad_norm": 0.004702410195022821,
+      "learning_rate": 2.2486327511842177e-06,
+      "loss": 0.0346,
+      "step": 1850
+    },
+    {
+      "epoch": 0.631019594818997,
+      "grad_norm": 0.07862639427185059,
+      "learning_rate": 2.243576917136227e-06,
+      "loss": 0.0296,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6476253736300233,
+      "grad_norm": 0.3578585982322693,
+      "learning_rate": 2.2385210830882364e-06,
+      "loss": 0.0266,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6642311524410495,
+      "grad_norm": 0.045335959643125534,
+      "learning_rate": 2.2334652490402463e-06,
+      "loss": 0.032,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6808369312520757,
+      "grad_norm": 1.6869137287139893,
+      "learning_rate": 2.2284094149922557e-06,
+      "loss": 0.0297,
+      "step": 2050
+    },
+    {
+      "epoch": 0.697442710063102,
+      "grad_norm": 0.6017621755599976,
+      "learning_rate": 2.2233535809442655e-06,
+      "loss": 0.0119,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7140484888741282,
+      "grad_norm": 0.13145552575588226,
+      "learning_rate": 2.2182977468962753e-06,
+      "loss": 0.0157,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7306542676851544,
+      "grad_norm": 0.00971242692321539,
+      "learning_rate": 2.2132419128482847e-06,
+      "loss": 0.0099,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7472600464961807,
+      "grad_norm": 0.5801131725311279,
+      "learning_rate": 2.208186078800294e-06,
+      "loss": 0.0235,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7638658253072069,
+      "grad_norm": 0.008363746106624603,
+      "learning_rate": 2.203130244752304e-06,
+      "loss": 0.0275,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7804716041182331,
+      "grad_norm": 0.23013177514076233,
+      "learning_rate": 2.1980744107043133e-06,
+      "loss": 0.0022,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7970773829292593,
+      "grad_norm": 0.044313572347164154,
+      "learning_rate": 2.1930185766563227e-06,
+      "loss": 0.0185,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8136831617402857,
+      "grad_norm": 0.008519169874489307,
+      "learning_rate": 2.1879627426083325e-06,
+      "loss": 0.0023,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8302889405513119,
+      "grad_norm": 0.0008576350519433618,
+      "learning_rate": 2.182906908560342e-06,
+      "loss": 0.0062,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8468947193623381,
+      "grad_norm": 0.56068354845047,
+      "learning_rate": 2.1778510745123517e-06,
+      "loss": 0.0106,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8635004981733644,
+      "grad_norm": 33.770652770996094,
+      "learning_rate": 2.1727952404643615e-06,
+      "loss": 0.0298,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8801062769843906,
+      "grad_norm": 0.0006891911034472287,
+      "learning_rate": 2.167739406416371e-06,
+      "loss": 0.0046,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8967120557954168,
+      "grad_norm": 0.000691475928761065,
+      "learning_rate": 2.1626835723683803e-06,
+      "loss": 0.0014,
+      "step": 2700
+    },
+    {
+      "epoch": 0.913317834606443,
+      "grad_norm": 0.022216275334358215,
+      "learning_rate": 2.15762773832039e-06,
+      "loss": 0.0152,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9299236134174693,
+      "grad_norm": 0.0004267705953679979,
+      "learning_rate": 2.1525719042723995e-06,
+      "loss": 0.0117,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9465293922284955,
+      "grad_norm": 0.016712836921215057,
+      "learning_rate": 2.147516070224409e-06,
+      "loss": 0.0009,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9631351710395217,
+      "grad_norm": 23.74860382080078,
+      "learning_rate": 2.1424602361764187e-06,
+      "loss": 0.0233,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9797409498505479,
+      "grad_norm": 0.0039037028327584267,
+      "learning_rate": 2.137404402128428e-06,
+      "loss": 0.0193,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9963467286615743,
+      "grad_norm": 0.0023961260449141264,
+      "learning_rate": 2.132348568080438e-06,
+      "loss": 0.0068,
+      "step": 3000
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.9921278625954199,
+      "eval_f1": 0.9921278625954199,
+      "eval_loss": 0.046909503638744354,
+      "eval_precision": 0.9921278625954199,
+      "eval_recall": 0.9921278625954199,
+      "eval_runtime": 36.762,
+      "eval_samples_per_second": 228.061,
+      "eval_steps_per_second": 14.254,
+      "step": 3011
+    },
+    {
+      "epoch": 1.0129525074726005,
+      "grad_norm": 0.0033601378090679646,
+      "learning_rate": 2.1272927340324478e-06,
+      "loss": 0.0005,
+      "step": 3050
+    },
+    {
+      "epoch": 1.0295582862836268,
+      "grad_norm": 0.038166940212249756,
+      "learning_rate": 2.122236899984457e-06,
+      "loss": 0.0002,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0461640650946529,
+      "grad_norm": 0.0003456630220171064,
+      "learning_rate": 2.1171810659364666e-06,
+      "loss": 0.0139,
+      "step": 3150
+    },
+    {
+      "epoch": 1.0627698439056792,
+      "grad_norm": 0.004587268922477961,
+      "learning_rate": 2.1121252318884764e-06,
+      "loss": 0.0001,
+      "step": 3200
+    },
+    {
+      "epoch": 1.0793756227167055,
+      "grad_norm": 0.08502045273780823,
+      "learning_rate": 2.1070693978404858e-06,
+      "loss": 0.0216,
+      "step": 3250
+    },
+    {
+      "epoch": 1.0959814015277316,
+      "grad_norm": 0.10945820808410645,
+      "learning_rate": 2.102013563792495e-06,
+      "loss": 0.0256,
+      "step": 3300
+    },
+    {
+      "epoch": 1.112587180338758,
+      "grad_norm": 0.03236968442797661,
+      "learning_rate": 2.096957729744505e-06,
+      "loss": 0.005,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1291929591497842,
+      "grad_norm": 0.007731316145509481,
+      "learning_rate": 2.0919018956965144e-06,
+      "loss": 0.0101,
+      "step": 3400
+    },
+    {
+      "epoch": 1.1457987379608103,
+      "grad_norm": 0.00674546230584383,
+      "learning_rate": 2.086846061648524e-06,
+      "loss": 0.0051,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1624045167718366,
+      "grad_norm": 0.004380326252430677,
+      "learning_rate": 2.081790227600534e-06,
+      "loss": 0.0039,
+      "step": 3500
+    },
+    {
+      "epoch": 1.1790102955828627,
+      "grad_norm": 0.031456008553504944,
+      "learning_rate": 2.0767343935525434e-06,
+      "loss": 0.0001,
+      "step": 3550
+    },
+    {
+      "epoch": 1.195616074393889,
+      "grad_norm": 0.017602458596229553,
+      "learning_rate": 2.071678559504553e-06,
+      "loss": 0.006,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2122218532049154,
+      "grad_norm": 0.009589639492332935,
+      "learning_rate": 2.0666227254565626e-06,
+      "loss": 0.001,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2288276320159415,
+      "grad_norm": 0.003254746785387397,
+      "learning_rate": 2.061566891408572e-06,
+      "loss": 0.0,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2454334108269678,
+      "grad_norm": 0.0011986729223281145,
+      "learning_rate": 2.056511057360582e-06,
+      "loss": 0.0126,
+      "step": 3750
+    },
+    {
+      "epoch": 1.2620391896379939,
+      "grad_norm": 0.006293583195656538,
+      "learning_rate": 2.0514552233125912e-06,
+      "loss": 0.0006,
+      "step": 3800
+    },
+    {
+      "epoch": 1.2786449684490202,
+      "grad_norm": 0.11370380967855453,
+      "learning_rate": 2.0463993892646006e-06,
+      "loss": 0.0252,
+      "step": 3850
+    },
+    {
+      "epoch": 1.2952507472600465,
+      "grad_norm": 0.0018469190690666437,
+      "learning_rate": 2.0413435552166104e-06,
+      "loss": 0.0004,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3118565260710726,
+      "grad_norm": 0.0002411604655208066,
+      "learning_rate": 2.0362877211686202e-06,
+      "loss": 0.003,
+      "step": 3950
+    },
+    {
+      "epoch": 1.328462304882099,
+      "grad_norm": 4.065009852638468e-05,
+      "learning_rate": 2.0312318871206296e-06,
+      "loss": 0.0165,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3450680836931252,
+      "grad_norm": 0.005062599666416645,
+      "learning_rate": 2.0261760530726395e-06,
+      "loss": 0.0028,
+      "step": 4050
+    },
+    {
+      "epoch": 1.3616738625041513,
+      "grad_norm": 0.017400013282895088,
+      "learning_rate": 2.021120219024649e-06,
+      "loss": 0.001,
+      "step": 4100
+    },
+    {
+      "epoch": 1.3782796413151777,
+      "grad_norm": 0.05683843046426773,
+      "learning_rate": 2.0160643849766582e-06,
+      "loss": 0.0124,
+      "step": 4150
+    },
+    {
+      "epoch": 1.394885420126204,
+      "grad_norm": 0.0027029893826693296,
+      "learning_rate": 2.011008550928668e-06,
+      "loss": 0.0003,
+      "step": 4200
+    },
+    {
+      "epoch": 1.41149119893723,
+      "grad_norm": 0.002034110017120838,
+      "learning_rate": 2.0059527168806775e-06,
+      "loss": 0.0073,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4280969777482564,
+      "grad_norm": 0.001398180378600955,
+      "learning_rate": 2.000896882832687e-06,
+      "loss": 0.0044,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4447027565592827,
+      "grad_norm": 0.00037716259248554707,
+      "learning_rate": 1.9958410487846967e-06,
+      "loss": 0.0228,
+      "step": 4350
+    },
+    {
+      "epoch": 1.4613085353703088,
+      "grad_norm": 0.015627387911081314,
+      "learning_rate": 1.9907852147367065e-06,
+      "loss": 0.0114,
+      "step": 4400
+    },
+    {
+      "epoch": 1.4779143141813351,
+      "grad_norm": 0.008964600041508675,
+      "learning_rate": 1.985729380688716e-06,
+      "loss": 0.0032,
+      "step": 4450
+    },
+    {
+      "epoch": 1.4945200929923614,
+      "grad_norm": 0.003252738853916526,
+      "learning_rate": 1.9806735466407257e-06,
+      "loss": 0.0082,
+      "step": 4500
+    },
+    {
+      "epoch": 1.5111258718033875,
+      "grad_norm": 0.00012037971464451402,
+      "learning_rate": 1.975617712592735e-06,
+      "loss": 0.0001,
+      "step": 4550
+    },
+    {
+      "epoch": 1.5277316506144138,
+      "grad_norm": 0.010974590666592121,
+      "learning_rate": 1.9705618785447445e-06,
+      "loss": 0.0,
+      "step": 4600
+    },
+    {
+      "epoch": 1.5443374294254402,
+      "grad_norm": 0.08398176729679108,
+      "learning_rate": 1.9655060444967543e-06,
+      "loss": 0.0002,
+      "step": 4650
+    },
+    {
+      "epoch": 1.5609432082364663,
+      "grad_norm": 0.03629281371831894,
+      "learning_rate": 1.9604502104487637e-06,
+      "loss": 0.006,
+      "step": 4700
+    },
+    {
+      "epoch": 1.5775489870474926,
+      "grad_norm": 0.00034110501292161644,
+      "learning_rate": 1.955394376400773e-06,
+      "loss": 0.0003,
+      "step": 4750
+    },
+    {
+      "epoch": 1.594154765858519,
+      "grad_norm": 0.0027959852013736963,
+      "learning_rate": 1.950338542352783e-06,
+      "loss": 0.0,
+      "step": 4800
+    },
+    {
+      "epoch": 1.610760544669545,
+      "grad_norm": 0.0001677741383900866,
+      "learning_rate": 1.9452827083047927e-06,
+      "loss": 0.0023,
+      "step": 4850
+    },
+    {
+      "epoch": 1.627366323480571,
+      "grad_norm": 0.055583104491233826,
+      "learning_rate": 1.940226874256802e-06,
+      "loss": 0.0225,
+      "step": 4900
+    },
+    {
+      "epoch": 1.6439721022915976,
+      "grad_norm": 8.664117194712162e-05,
+      "learning_rate": 1.935171040208812e-06,
+      "loss": 0.0009,
+      "step": 4950
+    },
+    {
+      "epoch": 1.6605778811026237,
+      "grad_norm": 0.0017323939828202128,
+      "learning_rate": 1.9301152061608213e-06,
+      "loss": 0.008,
+      "step": 5000
+    },
+    {
+      "epoch": 1.6771836599136498,
+      "grad_norm": 0.0034425491467118263,
+      "learning_rate": 1.9250593721128307e-06,
+      "loss": 0.0,
+      "step": 5050
+    },
+    {
+      "epoch": 1.6937894387246761,
+      "grad_norm": 6.076216959627345e-05,
+      "learning_rate": 1.9200035380648405e-06,
+      "loss": 0.0041,
+      "step": 5100
+    },
+    {
+      "epoch": 1.7103952175357025,
+      "grad_norm": 0.0018082900205627084,
+      "learning_rate": 1.91494770401685e-06,
+      "loss": 0.0017,
+      "step": 5150
+    },
+    {
+      "epoch": 1.7270009963467285,
+      "grad_norm": 0.008552160114049911,
+      "learning_rate": 1.9098918699688593e-06,
+      "loss": 0.0137,
+      "step": 5200
+    },
+    {
+      "epoch": 1.7436067751577549,
+      "grad_norm": 0.08908296376466751,
+      "learning_rate": 1.9048360359208694e-06,
+      "loss": 0.0092,
+      "step": 5250
+    },
+    {
+      "epoch": 1.7602125539687812,
+      "grad_norm": 0.002973488997668028,
+      "learning_rate": 1.8997802018728788e-06,
+      "loss": 0.0002,
+      "step": 5300
+    },
+    {
+      "epoch": 1.7768183327798073,
+      "grad_norm": 0.005116044543683529,
+      "learning_rate": 1.8947243678248884e-06,
+      "loss": 0.0079,
+      "step": 5350
+    },
+    {
+      "epoch": 1.7934241115908336,
+      "grad_norm": 0.002092874376103282,
+      "learning_rate": 1.889668533776898e-06,
+      "loss": 0.0,
+      "step": 5400
+    },
+    {
+      "epoch": 1.81002989040186,
+      "grad_norm": 0.0070649790577590466,
+      "learning_rate": 1.8846126997289076e-06,
+      "loss": 0.0,
+      "step": 5450
+    },
+    {
+      "epoch": 1.826635669212886,
+      "grad_norm": 0.001974167302250862,
+      "learning_rate": 1.879556865680917e-06,
+      "loss": 0.016,
+      "step": 5500
+    },
+    {
+      "epoch": 1.8432414480239123,
+      "grad_norm": 0.0012006360339000821,
+      "learning_rate": 1.8745010316329268e-06,
+      "loss": 0.0,
+      "step": 5550
+    },
+    {
+      "epoch": 1.8598472268349386,
+      "grad_norm": 0.006318301893770695,
+      "learning_rate": 1.8694451975849362e-06,
+      "loss": 0.0,
+      "step": 5600
+    },
+    {
+      "epoch": 1.8764530056459647,
+      "grad_norm": 0.0020722977351397276,
+      "learning_rate": 1.8643893635369458e-06,
+      "loss": 0.0104,
+      "step": 5650
+    },
+    {
+      "epoch": 1.893058784456991,
+      "grad_norm": 0.0874456912279129,
+      "learning_rate": 1.8593335294889556e-06,
+      "loss": 0.0023,
+      "step": 5700
+    },
+    {
+      "epoch": 1.9096645632680174,
+      "grad_norm": 0.00042386740096844733,
+      "learning_rate": 1.854277695440965e-06,
+      "loss": 0.0105,
+      "step": 5750
+    },
+    {
+      "epoch": 1.9262703420790435,
+      "grad_norm": 0.05140538513660431,
+      "learning_rate": 1.8492218613929746e-06,
+      "loss": 0.0008,
+      "step": 5800
+    },
+    {
+      "epoch": 1.9428761208900698,
+      "grad_norm": 0.00046465068589895964,
+      "learning_rate": 1.8441660273449842e-06,
+      "loss": 0.0176,
+      "step": 5850
+    },
+    {
+      "epoch": 1.959481899701096,
+      "grad_norm": 0.001875279936939478,
+      "learning_rate": 1.8391101932969938e-06,
+      "loss": 0.0002,
+      "step": 5900
+    },
+    {
+      "epoch": 1.9760876785121222,
+      "grad_norm": 0.0012590339174494147,
+      "learning_rate": 1.8340543592490032e-06,
+      "loss": 0.001,
+      "step": 5950
+    },
+    {
+      "epoch": 1.9926934573231485,
+      "grad_norm": 25.133811950683594,
+      "learning_rate": 1.828998525201013e-06,
+      "loss": 0.0229,
+      "step": 6000
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.995706106870229,
+      "eval_f1": 0.9956269879098661,
+      "eval_loss": 0.0445549376308918,
+      "eval_precision": 0.9956596696711074,
+      "eval_recall": 0.995706106870229,
+      "eval_runtime": 38.3077,
+      "eval_samples_per_second": 218.859,
+      "eval_steps_per_second": 13.679,
+      "step": 6022
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 24088,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.282861088518144e+16,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/trial-1/checkpoint-6022/training_args.bin b/trial-1/checkpoint-6022/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4087ec56e47476361d22dfe17ea11d79a64f155b
--- /dev/null
+++ b/trial-1/checkpoint-6022/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:161830f01fe4451cf2afb08516c24e569c5b229b44b735c51814ae17b5494e10
+size 5368
diff --git a/trial-2/checkpoint-6022/config.json b/trial-2/checkpoint-6022/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00
--- /dev/null
+++ b/trial-2/checkpoint-6022/config.json
@@ -0,0 +1,47 @@
+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}
diff --git a/trial-2/checkpoint-6022/model.safetensors b/trial-2/checkpoint-6022/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..34f8425ab92253d1fda1671277bdad1214b2fbc3
--- /dev/null
+++ b/trial-2/checkpoint-6022/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33d8242e8a21a76a0ad8b21949fe7bd68e94de5ce2da543a151336909fcb8e83
+size 598439784
diff --git a/trial-2/checkpoint-6022/optimizer.pt b/trial-2/checkpoint-6022/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8c5acf7f402f99bbcf6d7ad4b4a890b5bb0c4d5d
--- /dev/null
+++ b/trial-2/checkpoint-6022/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c89405c1def95fb7d1e0ff7deac188ca136134ebd620d1451c9f0d4ed557d77a
+size 1196967418
diff --git a/trial-2/checkpoint-6022/rng_state.pth b/trial-2/checkpoint-6022/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b387de0c48181ec5812538ddf1fc60cfda1a89c1
--- /dev/null
+++ b/trial-2/checkpoint-6022/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
+size 14244
diff --git a/trial-2/checkpoint-6022/scheduler.pt b/trial-2/checkpoint-6022/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..346ec0135f9cbb01269c18402c5bb87704740a2f
--- /dev/null
+++ b/trial-2/checkpoint-6022/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:daebe5b6f96508652ee77aa623e80e4943a4ab7b8acffe2720aa77d58c2624f9
+size 1064
diff --git a/trial-2/checkpoint-6022/trainer_state.json b/trial-2/checkpoint-6022/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..22b3bdb366ae8c61b15e647467848d653a68713f
--- /dev/null
+++ b/trial-2/checkpoint-6022/trainer_state.json
@@ -0,0 +1,897 @@
+{
+  "best_metric": 0.031979888677597046,
+  "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-2/checkpoint-6022",
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 6022,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016605778811026237,
+      "grad_norm": 21.788597106933594,
+      "learning_rate": 5.429575351871404e-06,
+      "loss": 0.5789,
+      "step": 50
+    },
+    {
+      "epoch": 0.033211557622052475,
+      "grad_norm": 20.038349151611328,
+      "learning_rate": 5.416664391316233e-06,
+      "loss": 0.37,
+      "step": 100
+    },
+    {
+      "epoch": 0.04981733643307871,
+      "grad_norm": 23.927526473999023,
+      "learning_rate": 5.403753430761063e-06,
+      "loss": 0.25,
+      "step": 150
+    },
+    {
+      "epoch": 0.06642311524410495,
+      "grad_norm": 4.1712799072265625,
+      "learning_rate": 5.390842470205893e-06,
+      "loss": 0.1921,
+      "step": 200
+    },
+    {
+      "epoch": 0.08302889405513118,
+      "grad_norm": 6.138601303100586,
+      "learning_rate": 5.3779315096507225e-06,
+      "loss": 0.1365,
+      "step": 250
+    },
+    {
+      "epoch": 0.09963467286615742,
+      "grad_norm": 0.9431160092353821,
+      "learning_rate": 5.3650205490955514e-06,
+      "loss": 0.1473,
+      "step": 300
+    },
+    {
+      "epoch": 0.11624045167718366,
+      "grad_norm": 25.303245544433594,
+      "learning_rate": 5.352109588540381e-06,
+      "loss": 0.0875,
+      "step": 350
+    },
+    {
+      "epoch": 0.1328462304882099,
+      "grad_norm": 14.83379077911377,
+      "learning_rate": 5.33919862798521e-06,
+      "loss": 0.111,
+      "step": 400
+    },
+    {
+      "epoch": 0.14945200929923613,
+      "grad_norm": 0.2346535325050354,
+      "learning_rate": 5.32628766743004e-06,
+      "loss": 0.0722,
+      "step": 450
+    },
+    {
+      "epoch": 0.16605778811026237,
+      "grad_norm": 19.045169830322266,
+      "learning_rate": 5.31337670687487e-06,
+      "loss": 0.1236,
+      "step": 500
+    },
+    {
+      "epoch": 0.1826635669212886,
+      "grad_norm": 10.871609687805176,
+      "learning_rate": 5.300465746319699e-06,
+      "loss": 0.1018,
+      "step": 550
+    },
+    {
+      "epoch": 0.19926934573231483,
+      "grad_norm": 8.278830528259277,
+      "learning_rate": 5.287554785764528e-06,
+      "loss": 0.0608,
+      "step": 600
+    },
+    {
+      "epoch": 0.2158751245433411,
+      "grad_norm": 3.4486818313598633,
+      "learning_rate": 5.274643825209358e-06,
+      "loss": 0.0684,
+      "step": 650
+    },
+    {
+      "epoch": 0.23248090335436733,
+      "grad_norm": 9.789453506469727,
+      "learning_rate": 5.261732864654187e-06,
+      "loss": 0.0826,
+      "step": 700
+    },
+    {
+      "epoch": 0.24908668216539356,
+      "grad_norm": 0.013454285450279713,
+      "learning_rate": 5.248821904099017e-06,
+      "loss": 0.0672,
+      "step": 750
+    },
+    {
+      "epoch": 0.2656924609764198,
+      "grad_norm": 0.8878294825553894,
+      "learning_rate": 5.2359109435438465e-06,
+      "loss": 0.0472,
+      "step": 800
+    },
+    {
+      "epoch": 0.282298239787446,
+      "grad_norm": 15.41006088256836,
+      "learning_rate": 5.222999982988676e-06,
+      "loss": 0.0616,
+      "step": 850
+    },
+    {
+      "epoch": 0.29890401859847227,
+      "grad_norm": 0.04324938729405403,
+      "learning_rate": 5.210089022433506e-06,
+      "loss": 0.0215,
+      "step": 900
+    },
+    {
+      "epoch": 0.3155097974094985,
+      "grad_norm": 0.011849366128444672,
+      "learning_rate": 5.197178061878335e-06,
+      "loss": 0.0398,
+      "step": 950
+    },
+    {
+      "epoch": 0.33211557622052473,
+      "grad_norm": 0.0020897299982607365,
+      "learning_rate": 5.184267101323165e-06,
+      "loss": 0.0294,
+      "step": 1000
+    },
+    {
+      "epoch": 0.348721355031551,
+      "grad_norm": 0.00038467388367280364,
+      "learning_rate": 5.171356140767994e-06,
+      "loss": 0.0328,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3653271338425772,
+      "grad_norm": 0.0022064056247472763,
+      "learning_rate": 5.158445180212823e-06,
+      "loss": 0.0216,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38193291265360346,
+      "grad_norm": 0.012603014707565308,
+      "learning_rate": 5.145534219657653e-06,
+      "loss": 0.0293,
+      "step": 1150
+    },
+    {
+      "epoch": 0.39853869146462967,
+      "grad_norm": 0.002970542525872588,
+      "learning_rate": 5.132623259102483e-06,
+      "loss": 0.0133,
+      "step": 1200
+    },
+    {
+      "epoch": 0.41514447027565593,
+      "grad_norm": 0.09289965778589249,
+      "learning_rate": 5.119712298547312e-06,
+      "loss": 0.0189,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4317502490866822,
+      "grad_norm": 0.030116688460111618,
+      "learning_rate": 5.106801337992142e-06,
+      "loss": 0.0266,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4483560278977084,
+      "grad_norm": 23.291847229003906,
+      "learning_rate": 5.0938903774369705e-06,
+      "loss": 0.0378,
+      "step": 1350
+    },
+    {
+      "epoch": 0.46496180670873466,
+      "grad_norm": 0.00580954784527421,
+      "learning_rate": 5.0809794168818e-06,
+      "loss": 0.0002,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48156758551976087,
+      "grad_norm": 0.0036250711418688297,
+      "learning_rate": 5.06806845632663e-06,
+      "loss": 0.0297,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4981733643307871,
+      "grad_norm": 0.0013630707981064916,
+      "learning_rate": 5.05515749577146e-06,
+      "loss": 0.0114,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5147791431418134,
+      "grad_norm": 0.025447094812989235,
+      "learning_rate": 5.042246535216289e-06,
+      "loss": 0.0019,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5313849219528396,
+      "grad_norm": 18.81841468811035,
+      "learning_rate": 5.0293355746611185e-06,
+      "loss": 0.0286,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5479907007638658,
+      "grad_norm": 0.0033424277789890766,
+      "learning_rate": 5.016424614105948e-06,
+      "loss": 0.0393,
+      "step": 1650
+    },
+    {
+      "epoch": 0.564596479574892,
+      "grad_norm": 0.039123374968767166,
+      "learning_rate": 5.003513653550777e-06,
+      "loss": 0.0186,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5812022583859183,
+      "grad_norm": 0.0005275913863442838,
+      "learning_rate": 4.990602692995607e-06,
+      "loss": 0.0003,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5978080371969445,
+      "grad_norm": 0.005070064682513475,
+      "learning_rate": 4.977691732440437e-06,
+      "loss": 0.01,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6144138160079707,
+      "grad_norm": 0.003932475112378597,
+      "learning_rate": 4.9647807718852664e-06,
+      "loss": 0.0222,
+      "step": 1850
+    },
+    {
+      "epoch": 0.631019594818997,
+      "grad_norm": 0.6544032692909241,
+      "learning_rate": 4.951869811330095e-06,
+      "loss": 0.0138,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6476253736300233,
+      "grad_norm": 0.008768323808908463,
+      "learning_rate": 4.938958850774925e-06,
+      "loss": 0.0056,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6642311524410495,
+      "grad_norm": 0.0021180976182222366,
+      "learning_rate": 4.926047890219754e-06,
+      "loss": 0.0049,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6808369312520757,
+      "grad_norm": 0.002039346843957901,
+      "learning_rate": 4.913136929664584e-06,
+      "loss": 0.0142,
+      "step": 2050
+    },
+    {
+      "epoch": 0.697442710063102,
+      "grad_norm": 0.012900142930448055,
+      "learning_rate": 4.9002259691094136e-06,
+      "loss": 0.0105,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7140484888741282,
+      "grad_norm": 0.0022153747268021107,
+      "learning_rate": 4.887315008554243e-06,
+      "loss": 0.0142,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7306542676851544,
+      "grad_norm": 0.001426122267730534,
+      "learning_rate": 4.874404047999072e-06,
+      "loss": 0.0068,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7472600464961807,
+      "grad_norm": 0.0008603449095971882,
+      "learning_rate": 4.861493087443902e-06,
+      "loss": 0.0119,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7638658253072069,
+      "grad_norm": 0.0006780526018701494,
+      "learning_rate": 4.848582126888731e-06,
+      "loss": 0.0108,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7804716041182331,
+      "grad_norm": 0.014527379535138607,
+      "learning_rate": 4.835671166333561e-06,
+      "loss": 0.0002,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7970773829292593,
+      "grad_norm": 0.00022624376288149506,
+      "learning_rate": 4.8227602057783904e-06,
+      "loss": 0.0092,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8136831617402857,
+      "grad_norm": 0.0044932495802640915,
+      "learning_rate": 4.80984924522322e-06,
+      "loss": 0.0001,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8302889405513119,
+      "grad_norm": 0.0009355309884995222,
+      "learning_rate": 4.79693828466805e-06,
+      "loss": 0.0002,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8468947193623381,
+      "grad_norm": 0.12550997734069824,
+      "learning_rate": 4.784027324112879e-06,
+      "loss": 0.0024,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8635004981733644,
+      "grad_norm": 0.02399071305990219,
+      "learning_rate": 4.771116363557709e-06,
+      "loss": 0.0099,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8801062769843906,
+      "grad_norm": 0.008470265194773674,
+      "learning_rate": 4.7582054030025375e-06,
+      "loss": 0.0157,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8967120557954168,
+      "grad_norm": 3.967735028709285e-05,
+      "learning_rate": 4.745294442447367e-06,
+      "loss": 0.0013,
+      "step": 2700
+    },
+    {
+      "epoch": 0.913317834606443,
+      "grad_norm": 0.0005532742943614721,
+      "learning_rate": 4.732383481892197e-06,
+      "loss": 0.0025,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9299236134174693,
+      "grad_norm": 9.227233022102155e-06,
+      "learning_rate": 4.719472521337027e-06,
+      "loss": 0.0028,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9465293922284955,
+      "grad_norm": 0.280258446931839,
+      "learning_rate": 4.706561560781856e-06,
+      "loss": 0.0004,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9631351710395217,
+      "grad_norm": 27.427757263183594,
+      "learning_rate": 4.6936506002266855e-06,
+      "loss": 0.0127,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9797409498505479,
+      "grad_norm": 176.85423278808594,
+      "learning_rate": 4.680739639671514e-06,
+      "loss": 0.0298,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9963467286615743,
+      "grad_norm": 0.00011263355554547161,
+      "learning_rate": 4.667828679116344e-06,
+      "loss": 0.001,
+      "step": 3000
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.9963024809160306,
+      "eval_f1": 0.9962431632227496,
+      "eval_loss": 0.04071500524878502,
+      "eval_precision": 0.9962693439313673,
+      "eval_recall": 0.9963024809160306,
+      "eval_runtime": 38.0003,
+      "eval_samples_per_second": 220.63,
+      "eval_steps_per_second": 13.789,
+      "step": 3011
+    },
+    {
+      "epoch": 1.0129525074726005,
+      "grad_norm": 0.05092976614832878,
+      "learning_rate": 4.654917718561174e-06,
+      "loss": 0.018,
+      "step": 3050
+    },
+    {
+      "epoch": 1.0295582862836268,
+      "grad_norm": 3.4633874747669324e-05,
+      "learning_rate": 4.642006758006004e-06,
+      "loss": 0.0,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0461640650946529,
+      "grad_norm": 8.058391540544108e-05,
+      "learning_rate": 4.629095797450833e-06,
+      "loss": 0.0,
+      "step": 3150
+    },
+    {
+      "epoch": 1.0627698439056792,
+      "grad_norm": 0.00043129033292643726,
+      "learning_rate": 4.616184836895662e-06,
+      "loss": 0.0,
+      "step": 3200
+    },
+    {
+      "epoch": 1.0793756227167055,
+      "grad_norm": 0.012417804449796677,
+      "learning_rate": 4.603273876340492e-06,
+      "loss": 0.0204,
+      "step": 3250
+    },
+    {
+      "epoch": 1.0959814015277316,
+      "grad_norm": 0.07707448303699493,
+      "learning_rate": 4.590362915785321e-06,
+      "loss": 0.0089,
+      "step": 3300
+    },
+    {
+      "epoch": 1.112587180338758,
+      "grad_norm": 0.0019856118597090244,
+      "learning_rate": 4.577451955230151e-06,
+      "loss": 0.0003,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1291929591497842,
+      "grad_norm": 0.0003844090970233083,
+      "learning_rate": 4.564540994674981e-06,
+      "loss": 0.0,
+      "step": 3400
+    },
+    {
+      "epoch": 1.1457987379608103,
+      "grad_norm": 0.004796341527253389,
+      "learning_rate": 4.55163003411981e-06,
+      "loss": 0.0054,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1624045167718366,
+      "grad_norm": 0.0021394495852291584,
+      "learning_rate": 4.538719073564639e-06,
+      "loss": 0.0001,
+      "step": 3500
+    },
+    {
+      "epoch": 1.1790102955828627,
+      "grad_norm": 0.00016287445032503456,
+      "learning_rate": 4.525808113009469e-06,
+      "loss": 0.0017,
+      "step": 3550
+    },
+    {
+      "epoch": 1.195616074393889,
+      "grad_norm": 0.005753168836236,
+      "learning_rate": 4.512897152454298e-06,
+      "loss": 0.0132,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2122218532049154,
+      "grad_norm": 0.00012519631127361208,
+      "learning_rate": 4.499986191899128e-06,
+      "loss": 0.0,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2288276320159415,
+      "grad_norm": 0.0009526669164188206,
+      "learning_rate": 4.487075231343957e-06,
+      "loss": 0.0083,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2454334108269678,
+      "grad_norm": 6.90124070388265e-05,
+      "learning_rate": 4.474164270788787e-06,
+      "loss": 0.0114,
+      "step": 3750
+    },
+    {
+      "epoch": 1.2620391896379939,
+      "grad_norm": 0.0029422417283058167,
+      "learning_rate": 4.461253310233616e-06,
+      "loss": 0.0001,
+      "step": 3800
+    },
+    {
+      "epoch": 1.2786449684490202,
+      "grad_norm": 1.6564589738845825,
+      "learning_rate": 4.448342349678446e-06,
+      "loss": 0.0065,
+      "step": 3850
+    },
+    {
+      "epoch": 1.2952507472600465,
+      "grad_norm": 4.6906425268389285e-05,
+      "learning_rate": 4.435431389123275e-06,
+      "loss": 0.0,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3118565260710726,
+      "grad_norm": 1.4456440112553537e-05,
+      "learning_rate": 4.4225204285681046e-06,
+      "loss": 0.0,
+      "step": 3950
+    },
+    {
+      "epoch": 1.328462304882099,
+      "grad_norm": 4.6707005822099745e-05,
+      "learning_rate": 4.409609468012934e-06,
+      "loss": 0.0227,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3450680836931252,
+      "grad_norm": 4.7155015636235476e-05,
+      "learning_rate": 4.396698507457763e-06,
+      "loss": 0.0002,
+      "step": 4050
+    },
+    {
+      "epoch": 1.3616738625041513,
+      "grad_norm": 0.01696430891752243,
+      "learning_rate": 4.383787546902593e-06,
+      "loss": 0.0188,
+      "step": 4100
+    },
+    {
+      "epoch": 1.3782796413151777,
+      "grad_norm": 0.0008329456904903054,
+      "learning_rate": 4.370876586347423e-06,
+      "loss": 0.0178,
+      "step": 4150
+    },
+    {
+      "epoch": 1.394885420126204,
+      "grad_norm": 9.179511835100129e-05,
+      "learning_rate": 4.3579656257922525e-06,
+      "loss": 0.0,
+      "step": 4200
+    },
+    {
+      "epoch": 1.41149119893723,
+      "grad_norm": 2.924172622442711e-05,
+      "learning_rate": 4.3450546652370814e-06,
+      "loss": 0.0013,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4280969777482564,
+      "grad_norm": 0.015076125971972942,
+      "learning_rate": 4.332143704681911e-06,
+      "loss": 0.0104,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4447027565592827,
+      "grad_norm": 5.385762415244244e-05,
+      "learning_rate": 4.31923274412674e-06,
+      "loss": 0.014,
+      "step": 4350
+    },
+    {
+      "epoch": 1.4613085353703088,
+      "grad_norm": 0.0007110639126040041,
+      "learning_rate": 4.30632178357157e-06,
+      "loss": 0.0126,
+      "step": 4400
+    },
+    {
+      "epoch": 1.4779143141813351,
+      "grad_norm": 0.00014339391782414168,
+      "learning_rate": 4.2934108230164e-06,
+      "loss": 0.0003,
+      "step": 4450
+    },
+    {
+      "epoch": 1.4945200929923614,
+      "grad_norm": 0.0006024091853760183,
+      "learning_rate": 4.280499862461229e-06,
+      "loss": 0.0118,
+      "step": 4500
+    },
+    {
+      "epoch": 1.5111258718033875,
+      "grad_norm": 0.0002353072923142463,
+      "learning_rate": 4.267588901906058e-06,
+      "loss": 0.0086,
+      "step": 4550
+    },
+    {
+      "epoch": 1.5277316506144138,
+      "grad_norm": 0.0008946498855948448,
+      "learning_rate": 4.254677941350888e-06,
+      "loss": 0.0,
+      "step": 4600
+    },
+    {
+      "epoch": 1.5443374294254402,
+      "grad_norm": 7.315174298128113e-05,
+      "learning_rate": 4.241766980795717e-06,
+      "loss": 0.0003,
+      "step": 4650
+    },
+    {
+      "epoch": 1.5609432082364663,
+      "grad_norm": 9.232313459506258e-05,
+      "learning_rate": 4.228856020240547e-06,
+      "loss": 0.0001,
+      "step": 4700
+    },
+    {
+      "epoch": 1.5775489870474926,
+      "grad_norm": 1.4020029084349517e-05,
+      "learning_rate": 4.2159450596853765e-06,
+      "loss": 0.0,
+      "step": 4750
+    },
+    {
+      "epoch": 1.594154765858519,
+      "grad_norm": 4.0607475966680795e-05,
+      "learning_rate": 4.203034099130206e-06,
+      "loss": 0.0,
+      "step": 4800
+    },
+    {
+      "epoch": 1.610760544669545,
+      "grad_norm": 4.69290571345482e-05,
+      "learning_rate": 4.190123138575036e-06,
+      "loss": 0.0177,
+      "step": 4850
+    },
+    {
+      "epoch": 1.627366323480571,
+      "grad_norm": 0.14096687734127045,
+      "learning_rate": 4.177212178019865e-06,
+      "loss": 0.0115,
+      "step": 4900
+    },
+    {
+      "epoch": 1.6439721022915976,
+      "grad_norm": 0.00020342542848084122,
+      "learning_rate": 4.164301217464695e-06,
+      "loss": 0.0001,
+      "step": 4950
+    },
+    {
+      "epoch": 1.6605778811026237,
+      "grad_norm": 0.0002786288969218731,
+      "learning_rate": 4.151390256909524e-06,
+      "loss": 0.0,
+      "step": 5000
+    },
+    {
+      "epoch": 1.6771836599136498,
+      "grad_norm": 2.8438846129574813e-05,
+      "learning_rate": 4.138479296354353e-06,
+      "loss": 0.0032,
+      "step": 5050
+    },
+    {
+      "epoch": 1.6937894387246761,
+      "grad_norm": 5.944320037087891e-06,
+      "learning_rate": 4.125568335799183e-06,
+      "loss": 0.0001,
+      "step": 5100
+    },
+    {
+      "epoch": 1.7103952175357025,
+      "grad_norm": 0.005958211608231068,
+      "learning_rate": 4.112657375244013e-06,
+      "loss": 0.0,
+      "step": 5150
+    },
+    {
+      "epoch": 1.7270009963467285,
+      "grad_norm": 0.002004456939175725,
+      "learning_rate": 4.099746414688842e-06,
+      "loss": 0.0106,
+      "step": 5200
+    },
+    {
+      "epoch": 1.7436067751577549,
+      "grad_norm": 0.0008562383009120822,
+      "learning_rate": 4.086835454133672e-06,
+      "loss": 0.0081,
+      "step": 5250
+    },
+    {
+      "epoch": 1.7602125539687812,
+      "grad_norm": 0.03570560738444328,
+      "learning_rate": 4.0739244935785005e-06,
+      "loss": 0.025,
+      "step": 5300
+    },
+    {
+      "epoch": 1.7768183327798073,
+      "grad_norm": 0.001486024702899158,
+      "learning_rate": 4.06101353302333e-06,
+      "loss": 0.0145,
+      "step": 5350
+    },
+    {
+      "epoch": 1.7934241115908336,
+      "grad_norm": 0.0015331929316744208,
+      "learning_rate": 4.04810257246816e-06,
+      "loss": 0.0001,
+      "step": 5400
+    },
+    {
+      "epoch": 1.81002989040186,
+      "grad_norm": 0.004162834957242012,
+      "learning_rate": 4.03519161191299e-06,
+      "loss": 0.0005,
+      "step": 5450
+    },
+    {
+      "epoch": 1.826635669212886,
+      "grad_norm": 0.0003064811462536454,
+      "learning_rate": 4.022280651357819e-06,
+      "loss": 0.0,
+      "step": 5500
+    },
+    {
+      "epoch": 1.8432414480239123,
+      "grad_norm": 0.000830256671179086,
+      "learning_rate": 4.0093696908026485e-06,
+      "loss": 0.0034,
+      "step": 5550
+    },
+    {
+      "epoch": 1.8598472268349386,
+      "grad_norm": 0.001540405093692243,
+      "learning_rate": 3.996458730247478e-06,
+      "loss": 0.0,
+      "step": 5600
+    },
+    {
+      "epoch": 1.8764530056459647,
+      "grad_norm": 0.011221639811992645,
+      "learning_rate": 3.983547769692307e-06,
+      "loss": 0.0116,
+      "step": 5650
+    },
+    {
+      "epoch": 1.893058784456991,
+      "grad_norm": 0.0031693174969404936,
+      "learning_rate": 3.970636809137137e-06,
+      "loss": 0.0061,
+      "step": 5700
+    },
+    {
+      "epoch": 1.9096645632680174,
+      "grad_norm": 7.828649540897459e-05,
+      "learning_rate": 3.957725848581967e-06,
+      "loss": 0.0,
+      "step": 5750
+    },
+    {
+      "epoch": 1.9262703420790435,
+      "grad_norm": 0.00892726145684719,
+      "learning_rate": 3.9448148880267964e-06,
+      "loss": 0.0003,
+      "step": 5800
+    },
+    {
+      "epoch": 1.9428761208900698,
+      "grad_norm": 0.0033830904867500067,
+      "learning_rate": 3.931903927471625e-06,
+      "loss": 0.0007,
+      "step": 5850
+    },
+    {
+      "epoch": 1.959481899701096,
+      "grad_norm": 0.017441514879465103,
+      "learning_rate": 3.918992966916455e-06,
+      "loss": 0.0109,
+      "step": 5900
+    },
+    {
+      "epoch": 1.9760876785121222,
+      "grad_norm": 0.006790176033973694,
+      "learning_rate": 3.906082006361284e-06,
+      "loss": 0.0101,
+      "step": 5950
+    },
+    {
+      "epoch": 1.9926934573231485,
+      "grad_norm": 0.0004248483164701611,
+      "learning_rate": 3.893171045806114e-06,
+      "loss": 0.0103,
+      "step": 6000
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.9959446564885496,
+      "eval_f1": 0.9958827988724177,
+      "eval_loss": 0.031979888677597046,
+      "eval_precision": 0.9958978797187497,
+      "eval_recall": 0.9959446564885496,
+      "eval_runtime": 37.4063,
+      "eval_samples_per_second": 224.134,
+      "eval_steps_per_second": 14.008,
+      "step": 6022
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 21077,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 7,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.282861088518144e+16,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/trial-2/checkpoint-6022/training_args.bin b/trial-2/checkpoint-6022/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0df248ab0a69e9e5a85d1cc73b799d697b96402c
--- /dev/null
+++ b/trial-2/checkpoint-6022/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9657a8731817c986f017540c64090098467c35e79328bfa7cab093c33da6a8e9
+size 5368
diff --git a/trial-3/checkpoint-1506/config.json b/trial-3/checkpoint-1506/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00
--- /dev/null
+++ b/trial-3/checkpoint-1506/config.json
@@ -0,0 +1,47 @@
+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}
diff --git a/trial-3/checkpoint-1506/model.safetensors b/trial-3/checkpoint-1506/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..215b630a963742d9d4bbfed6eb6e55d3b754920c
--- /dev/null
+++ b/trial-3/checkpoint-1506/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:577af3b8b0a6d7db7f2ff1054a5c4c43704103dd0ed797800f9d9582a3237033
+size 598439784
diff --git a/trial-3/checkpoint-1506/optimizer.pt b/trial-3/checkpoint-1506/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2a6762b6abd952285de010b6b0370cd57b63be85
--- /dev/null
+++ b/trial-3/checkpoint-1506/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:309810681fe0458054a9e76c6bfbb6fc2862ae83f89b084906874442e8913f57
+size 1196967418
diff --git a/trial-3/checkpoint-1506/rng_state.pth b/trial-3/checkpoint-1506/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..cf3d91c5392ca6b7d7e0880933b7830a896d7c9e
--- /dev/null
+++ b/trial-3/checkpoint-1506/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:568428d80a25211a390c359ca51b0b20b38ca0607fbc196f106c9841c02d3e59
+size 14244
diff --git a/trial-3/checkpoint-1506/scheduler.pt b/trial-3/checkpoint-1506/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f131c33e989612c23ac3cf1568fa31b1782a8ae8
--- /dev/null
+++ b/trial-3/checkpoint-1506/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:77511df67542c270c7a8ed9a3ae9f0a88d6822756582e31cb89e7ee9b503abfb
+size 1064
diff --git a/trial-3/checkpoint-1506/trainer_state.json b/trial-3/checkpoint-1506/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..394b21d4c91185d069ba4c10fe805b462822da99
--- /dev/null
+++ b/trial-3/checkpoint-1506/trainer_state.json
@@ -0,0 +1,255 @@
+{
+  "best_metric": 0.03509189188480377,
+  "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-3/checkpoint-1506",
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1506,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.033200531208499334,
+      "grad_norm": 6.976862907409668,
+      "learning_rate": 2.8972663455552343e-06,
+      "loss": 0.5378,
+      "step": 50
+    },
+    {
+      "epoch": 0.06640106241699867,
+      "grad_norm": 3.674832344055176,
+      "learning_rate": 2.8648439379281615e-06,
+      "loss": 0.3375,
+      "step": 100
+    },
+    {
+      "epoch": 0.099601593625498,
+      "grad_norm": 2.678229570388794,
+      "learning_rate": 2.8324215303010886e-06,
+      "loss": 0.2213,
+      "step": 150
+    },
+    {
+      "epoch": 0.13280212483399734,
+      "grad_norm": 6.4370551109313965,
+      "learning_rate": 2.7999991226740153e-06,
+      "loss": 0.1558,
+      "step": 200
+    },
+    {
+      "epoch": 0.16600265604249667,
+      "grad_norm": 6.4544525146484375,
+      "learning_rate": 2.767576715046943e-06,
+      "loss": 0.1457,
+      "step": 250
+    },
+    {
+      "epoch": 0.199203187250996,
+      "grad_norm": 2.4753177165985107,
+      "learning_rate": 2.7351543074198696e-06,
+      "loss": 0.1349,
+      "step": 300
+    },
+    {
+      "epoch": 0.23240371845949534,
+      "grad_norm": 3.116945743560791,
+      "learning_rate": 2.7027318997927968e-06,
+      "loss": 0.1144,
+      "step": 350
+    },
+    {
+      "epoch": 0.2656042496679947,
+      "grad_norm": 10.000889778137207,
+      "learning_rate": 2.670309492165724e-06,
+      "loss": 0.0942,
+      "step": 400
+    },
+    {
+      "epoch": 0.29880478087649404,
+      "grad_norm": 0.3915446996688843,
+      "learning_rate": 2.637887084538651e-06,
+      "loss": 0.0841,
+      "step": 450
+    },
+    {
+      "epoch": 0.33200531208499334,
+      "grad_norm": 0.7093335390090942,
+      "learning_rate": 2.605464676911578e-06,
+      "loss": 0.0815,
+      "step": 500
+    },
+    {
+      "epoch": 0.3652058432934927,
+      "grad_norm": 5.660763263702393,
+      "learning_rate": 2.5730422692845053e-06,
+      "loss": 0.058,
+      "step": 550
+    },
+    {
+      "epoch": 0.398406374501992,
+      "grad_norm": 9.372917175292969,
+      "learning_rate": 2.5406198616574325e-06,
+      "loss": 0.0521,
+      "step": 600
+    },
+    {
+      "epoch": 0.4316069057104914,
+      "grad_norm": 6.086747169494629,
+      "learning_rate": 2.5081974540303596e-06,
+      "loss": 0.0671,
+      "step": 650
+    },
+    {
+      "epoch": 0.4648074369189907,
+      "grad_norm": 5.661391735076904,
+      "learning_rate": 2.4757750464032863e-06,
+      "loss": 0.0354,
+      "step": 700
+    },
+    {
+      "epoch": 0.49800796812749004,
+      "grad_norm": 1.4707638025283813,
+      "learning_rate": 2.443352638776214e-06,
+      "loss": 0.0386,
+      "step": 750
+    },
+    {
+      "epoch": 0.5312084993359893,
+      "grad_norm": 7.550576686859131,
+      "learning_rate": 2.4109302311491406e-06,
+      "loss": 0.0363,
+      "step": 800
+    },
+    {
+      "epoch": 0.5644090305444888,
+      "grad_norm": 11.072442054748535,
+      "learning_rate": 2.3785078235220678e-06,
+      "loss": 0.0254,
+      "step": 850
+    },
+    {
+      "epoch": 0.5976095617529881,
+      "grad_norm": 0.3040500581264496,
+      "learning_rate": 2.346085415894995e-06,
+      "loss": 0.018,
+      "step": 900
+    },
+    {
+      "epoch": 0.6308100929614874,
+      "grad_norm": 11.503410339355469,
+      "learning_rate": 2.313663008267922e-06,
+      "loss": 0.0302,
+      "step": 950
+    },
+    {
+      "epoch": 0.6640106241699867,
+      "grad_norm": 0.7599239945411682,
+      "learning_rate": 2.281240600640849e-06,
+      "loss": 0.0267,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6972111553784861,
+      "grad_norm": 0.21025581657886505,
+      "learning_rate": 2.2488181930137764e-06,
+      "loss": 0.0211,
+      "step": 1050
+    },
+    {
+      "epoch": 0.7304116865869854,
+      "grad_norm": 11.052717208862305,
+      "learning_rate": 2.2163957853867035e-06,
+      "loss": 0.0112,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7636122177954847,
+      "grad_norm": 0.0778539627790451,
+      "learning_rate": 2.1839733777596302e-06,
+      "loss": 0.0212,
+      "step": 1150
+    },
+    {
+      "epoch": 0.796812749003984,
+      "grad_norm": 0.050592467188835144,
+      "learning_rate": 2.151550970132558e-06,
+      "loss": 0.0082,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8300132802124834,
+      "grad_norm": 0.04680703952908516,
+      "learning_rate": 2.1191285625054845e-06,
+      "loss": 0.008,
+      "step": 1250
+    },
+    {
+      "epoch": 0.8632138114209827,
+      "grad_norm": 127.69743347167969,
+      "learning_rate": 2.0867061548784117e-06,
+      "loss": 0.0192,
+      "step": 1300
+    },
+    {
+      "epoch": 0.896414342629482,
+      "grad_norm": 0.013791153207421303,
+      "learning_rate": 2.0542837472513392e-06,
+      "loss": 0.0063,
+      "step": 1350
+    },
+    {
+      "epoch": 0.9296148738379814,
+      "grad_norm": 0.011688283644616604,
+      "learning_rate": 2.021861339624266e-06,
+      "loss": 0.0068,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9628154050464808,
+      "grad_norm": 14.885448455810547,
+      "learning_rate": 1.989438931997193e-06,
+      "loss": 0.004,
+      "step": 1450
+    },
+    {
+      "epoch": 0.9960159362549801,
+      "grad_norm": 0.38216766715049744,
+      "learning_rate": 1.9570165243701202e-06,
+      "loss": 0.0069,
+      "step": 1500
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.992604961832061,
+      "eval_f1": 0.9926480803352735,
+      "eval_loss": 0.03509189188480377,
+      "eval_precision": 0.9927020529431649,
+      "eval_recall": 0.992604961832061,
+      "eval_runtime": 31.6693,
+      "eval_samples_per_second": 264.736,
+      "eval_steps_per_second": 8.273,
+      "step": 1506
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 4518,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.641430544259072e+16,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/trial-3/checkpoint-1506/training_args.bin b/trial-3/checkpoint-1506/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..007bd7f3de5345c09391c213b1b1e412ba04ab11
--- /dev/null
+++ b/trial-3/checkpoint-1506/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ed06b7fefd178dad53ae3fef61fd304580c1d532a37d5010e58ca8f39e302fa
+size 5368
diff --git a/trial-4/checkpoint-3011/config.json b/trial-4/checkpoint-3011/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00
--- /dev/null
+++ b/trial-4/checkpoint-3011/config.json
@@ -0,0 +1,47 @@
+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}
diff --git a/trial-4/checkpoint-3011/model.safetensors b/trial-4/checkpoint-3011/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..962e142faf8e034b148e7e4d0c0bbe22787b4c06
--- /dev/null
+++ b/trial-4/checkpoint-3011/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6998cd19c83cb7aad4574fdf2f2d1d911f7f01e8d94fcb558dc40e5561e3d188
+size 598439784
diff --git a/trial-4/checkpoint-3011/optimizer.pt b/trial-4/checkpoint-3011/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d7e34671467adb09e9efd41322ae8421a33e01bd
--- /dev/null
+++ b/trial-4/checkpoint-3011/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0517bca24af0d5ed5988e5100a9e9f6f59df1b0d3e7ca53764baa7878d5d5e3
+size 1196967418
diff --git a/trial-4/checkpoint-3011/rng_state.pth b/trial-4/checkpoint-3011/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..cf3d91c5392ca6b7d7e0880933b7830a896d7c9e
--- /dev/null
+++ b/trial-4/checkpoint-3011/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:568428d80a25211a390c359ca51b0b20b38ca0607fbc196f106c9841c02d3e59
+size 14244
diff --git a/trial-4/checkpoint-3011/scheduler.pt b/trial-4/checkpoint-3011/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..386cae275a4fe4ba708aad4344c553b29e37f764
--- /dev/null
+++ b/trial-4/checkpoint-3011/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c0dbc7f9aff9e32282e3dcfb80127104b5c3d0089b59d9cb1b981e6af6f8c41
+size 1064
diff --git a/trial-4/checkpoint-3011/trainer_state.json b/trial-4/checkpoint-3011/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0823a4d65e30d5a08293e438db6f325203ea94f0
--- /dev/null
+++ b/trial-4/checkpoint-3011/trainer_state.json
@@ -0,0 +1,465 @@
+{
+  "best_metric": 0.02325253002345562,
+  "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-4/checkpoint-3011",
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3011,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016605778811026237,
+      "grad_norm": 7.4845476150512695,
+      "learning_rate": 1.3209406688296726e-05,
+      "loss": 0.427,
+      "step": 50
+    },
+    {
+      "epoch": 0.033211557622052475,
+      "grad_norm": 8.739913940429688,
+      "learning_rate": 1.3184989137392264e-05,
+      "loss": 0.2079,
+      "step": 100
+    },
+    {
+      "epoch": 0.04981733643307871,
+      "grad_norm": 10.918631553649902,
+      "learning_rate": 1.31605715864878e-05,
+      "loss": 0.1374,
+      "step": 150
+    },
+    {
+      "epoch": 0.06642311524410495,
+      "grad_norm": 0.09207049757242203,
+      "learning_rate": 1.3136154035583336e-05,
+      "loss": 0.0971,
+      "step": 200
+    },
+    {
+      "epoch": 0.08302889405513118,
+      "grad_norm": 0.1270512193441391,
+      "learning_rate": 1.3111736484678873e-05,
+      "loss": 0.0431,
+      "step": 250
+    },
+    {
+      "epoch": 0.09963467286615742,
+      "grad_norm": 0.01078485231846571,
+      "learning_rate": 1.3087318933774408e-05,
+      "loss": 0.0679,
+      "step": 300
+    },
+    {
+      "epoch": 0.11624045167718366,
+      "grad_norm": 0.16803160309791565,
+      "learning_rate": 1.3062901382869945e-05,
+      "loss": 0.0364,
+      "step": 350
+    },
+    {
+      "epoch": 0.1328462304882099,
+      "grad_norm": 0.2863476872444153,
+      "learning_rate": 1.303848383196548e-05,
+      "loss": 0.0802,
+      "step": 400
+    },
+    {
+      "epoch": 0.14945200929923613,
+      "grad_norm": 0.018498318269848824,
+      "learning_rate": 1.3014066281061019e-05,
+      "loss": 0.0324,
+      "step": 450
+    },
+    {
+      "epoch": 0.16605778811026237,
+      "grad_norm": 12.099262237548828,
+      "learning_rate": 1.2989648730156554e-05,
+      "loss": 0.0567,
+      "step": 500
+    },
+    {
+      "epoch": 0.1826635669212886,
+      "grad_norm": 0.04201498255133629,
+      "learning_rate": 1.296523117925209e-05,
+      "loss": 0.0265,
+      "step": 550
+    },
+    {
+      "epoch": 0.19926934573231483,
+      "grad_norm": 13.225788116455078,
+      "learning_rate": 1.2940813628347628e-05,
+      "loss": 0.027,
+      "step": 600
+    },
+    {
+      "epoch": 0.2158751245433411,
+      "grad_norm": 2.1863136291503906,
+      "learning_rate": 1.2916396077443163e-05,
+      "loss": 0.0325,
+      "step": 650
+    },
+    {
+      "epoch": 0.23248090335436733,
+      "grad_norm": 0.0031948979012668133,
+      "learning_rate": 1.28919785265387e-05,
+      "loss": 0.0378,
+      "step": 700
+    },
+    {
+      "epoch": 0.24908668216539356,
+      "grad_norm": 0.0001850352855399251,
+      "learning_rate": 1.2867560975634237e-05,
+      "loss": 0.0242,
+      "step": 750
+    },
+    {
+      "epoch": 0.2656924609764198,
+      "grad_norm": 0.0007033672300167382,
+      "learning_rate": 1.2843143424729772e-05,
+      "loss": 0.0306,
+      "step": 800
+    },
+    {
+      "epoch": 0.282298239787446,
+      "grad_norm": 13.938993453979492,
+      "learning_rate": 1.2818725873825309e-05,
+      "loss": 0.0458,
+      "step": 850
+    },
+    {
+      "epoch": 0.29890401859847227,
+      "grad_norm": 0.02099405601620674,
+      "learning_rate": 1.2794308322920844e-05,
+      "loss": 0.0306,
+      "step": 900
+    },
+    {
+      "epoch": 0.3155097974094985,
+      "grad_norm": 0.024268606677651405,
+      "learning_rate": 1.2769890772016383e-05,
+      "loss": 0.0142,
+      "step": 950
+    },
+    {
+      "epoch": 0.33211557622052473,
+      "grad_norm": 0.004759958013892174,
+      "learning_rate": 1.2745473221111918e-05,
+      "loss": 0.0141,
+      "step": 1000
+    },
+    {
+      "epoch": 0.348721355031551,
+      "grad_norm": 0.0019629066810011864,
+      "learning_rate": 1.2721055670207453e-05,
+      "loss": 0.0345,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3653271338425772,
+      "grad_norm": 0.00019358922145329416,
+      "learning_rate": 1.2696638119302992e-05,
+      "loss": 0.0089,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38193291265360346,
+      "grad_norm": 0.0028237327933311462,
+      "learning_rate": 1.2672220568398527e-05,
+      "loss": 0.0239,
+      "step": 1150
+    },
+    {
+      "epoch": 0.39853869146462967,
+      "grad_norm": 0.00010467255196999758,
+      "learning_rate": 1.2647803017494064e-05,
+      "loss": 0.0094,
+      "step": 1200
+    },
+    {
+      "epoch": 0.41514447027565593,
+      "grad_norm": 0.05774892866611481,
+      "learning_rate": 1.26233854665896e-05,
+      "loss": 0.0246,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4317502490866822,
+      "grad_norm": 0.024394717067480087,
+      "learning_rate": 1.2598967915685136e-05,
+      "loss": 0.0328,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4483560278977084,
+      "grad_norm": 2.231964349746704,
+      "learning_rate": 1.2574550364780673e-05,
+      "loss": 0.0204,
+      "step": 1350
+    },
+    {
+      "epoch": 0.46496180670873466,
+      "grad_norm": 0.0014322358183562756,
+      "learning_rate": 1.2550132813876208e-05,
+      "loss": 0.0001,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48156758551976087,
+      "grad_norm": 0.001744006876833737,
+      "learning_rate": 1.2525715262971747e-05,
+      "loss": 0.0392,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4981733643307871,
+      "grad_norm": 0.027050139382481575,
+      "learning_rate": 1.2501297712067282e-05,
+      "loss": 0.0151,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5147791431418134,
+      "grad_norm": 0.0001924823591252789,
+      "learning_rate": 1.2476880161162817e-05,
+      "loss": 0.0036,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5313849219528396,
+      "grad_norm": 4.767300128936768,
+      "learning_rate": 1.2452462610258356e-05,
+      "loss": 0.0148,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5479907007638658,
+      "grad_norm": 0.0022574588656425476,
+      "learning_rate": 1.242804505935389e-05,
+      "loss": 0.0384,
+      "step": 1650
+    },
+    {
+      "epoch": 0.564596479574892,
+      "grad_norm": 0.12995891273021698,
+      "learning_rate": 1.2403627508449428e-05,
+      "loss": 0.018,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5812022583859183,
+      "grad_norm": 0.0005374422180466354,
+      "learning_rate": 1.2379209957544964e-05,
+      "loss": 0.0039,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5978080371969445,
+      "grad_norm": 0.004592420998960733,
+      "learning_rate": 1.23547924066405e-05,
+      "loss": 0.0136,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6144138160079707,
+      "grad_norm": 0.0008812470478005707,
+      "learning_rate": 1.2330374855736037e-05,
+      "loss": 0.0167,
+      "step": 1850
+    },
+    {
+      "epoch": 0.631019594818997,
+      "grad_norm": 28.337797164916992,
+      "learning_rate": 1.2305957304831572e-05,
+      "loss": 0.0098,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6476253736300233,
+      "grad_norm": 0.0003208396374247968,
+      "learning_rate": 1.228153975392711e-05,
+      "loss": 0.0083,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6642311524410495,
+      "grad_norm": 0.004917904268950224,
+      "learning_rate": 1.2257122203022646e-05,
+      "loss": 0.012,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6808369312520757,
+      "grad_norm": 0.0006444657919928432,
+      "learning_rate": 1.2232704652118182e-05,
+      "loss": 0.0006,
+      "step": 2050
+    },
+    {
+      "epoch": 0.697442710063102,
+      "grad_norm": 0.00020880017837043852,
+      "learning_rate": 1.220828710121372e-05,
+      "loss": 0.0169,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7140484888741282,
+      "grad_norm": 0.009818737395107746,
+      "learning_rate": 1.2183869550309254e-05,
+      "loss": 0.0143,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7306542676851544,
+      "grad_norm": 0.0009041284793056548,
+      "learning_rate": 1.2159451999404791e-05,
+      "loss": 0.0026,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7472600464961807,
+      "grad_norm": 2.3109569549560547,
+      "learning_rate": 1.2135034448500328e-05,
+      "loss": 0.0062,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7638658253072069,
+      "grad_norm": 9.242107807949651e-06,
+      "learning_rate": 1.2110616897595863e-05,
+      "loss": 0.0029,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7804716041182331,
+      "grad_norm": 0.00020709235104732215,
+      "learning_rate": 1.20861993466914e-05,
+      "loss": 0.0,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7970773829292593,
+      "grad_norm": 0.0008476360817439854,
+      "learning_rate": 1.2061781795786937e-05,
+      "loss": 0.019,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8136831617402857,
+      "grad_norm": 0.0002165739715564996,
+      "learning_rate": 1.2037364244882474e-05,
+      "loss": 0.0,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8302889405513119,
+      "grad_norm": 0.029956847429275513,
+      "learning_rate": 1.201294669397801e-05,
+      "loss": 0.0012,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8468947193623381,
+      "grad_norm": 0.0002400112134637311,
+      "learning_rate": 1.1988529143073546e-05,
+      "loss": 0.0191,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8635004981733644,
+      "grad_norm": 0.0070993551053106785,
+      "learning_rate": 1.1964111592169083e-05,
+      "loss": 0.0155,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8801062769843906,
+      "grad_norm": 5.127764234202914e-05,
+      "learning_rate": 1.1939694041264618e-05,
+      "loss": 0.0185,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8967120557954168,
+      "grad_norm": 0.056577421724796295,
+      "learning_rate": 1.1915276490360155e-05,
+      "loss": 0.0063,
+      "step": 2700
+    },
+    {
+      "epoch": 0.913317834606443,
+      "grad_norm": 4.399678437039256e-05,
+      "learning_rate": 1.1890858939455692e-05,
+      "loss": 0.012,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9299236134174693,
+      "grad_norm": 6.6589759626367595e-06,
+      "learning_rate": 1.1866441388551227e-05,
+      "loss": 0.0001,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9465293922284955,
+      "grad_norm": 0.009270718321204185,
+      "learning_rate": 1.1842023837646764e-05,
+      "loss": 0.0001,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9631351710395217,
+      "grad_norm": 6.743930339813232,
+      "learning_rate": 1.1817606286742301e-05,
+      "loss": 0.0019,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9797409498505479,
+      "grad_norm": 10.679564476013184,
+      "learning_rate": 1.1793188735837838e-05,
+      "loss": 0.0258,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9963467286615743,
+      "grad_norm": 0.0007653234642930329,
+      "learning_rate": 1.1768771184933373e-05,
+      "loss": 0.0018,
+      "step": 3000
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.997256679389313,
+      "eval_f1": 0.9972464717374746,
+      "eval_loss": 0.02325253002345562,
+      "eval_precision": 0.997240941740882,
+      "eval_recall": 0.997256679389313,
+      "eval_runtime": 36.6991,
+      "eval_samples_per_second": 228.453,
+      "eval_steps_per_second": 14.278,
+      "step": 3011
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 27099,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.641430544259072e+16,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/trial-4/checkpoint-3011/training_args.bin b/trial-4/checkpoint-3011/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c3d749965ab6105cf60004e03a98b462c58a9dda
--- /dev/null
+++ b/trial-4/checkpoint-3011/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89fb66224a4a1dbc68c030610c33a1d3f64ca676b2064b388b8e2a7385785f5d
+size 5368
diff --git a/trial-5/checkpoint-3012/config.json b/trial-5/checkpoint-3012/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00
--- /dev/null
+++ b/trial-5/checkpoint-3012/config.json
@@ -0,0 +1,47 @@
+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}
diff --git a/trial-5/checkpoint-3012/model.safetensors b/trial-5/checkpoint-3012/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cd374aa8cef0f5a071b2d854957020e3677a43b8
--- /dev/null
+++ b/trial-5/checkpoint-3012/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49ba330b843aca1a1d0454785b900ed96671619efb6df36ea614d0870f5ef2aa
+size 598439784
diff --git a/trial-5/checkpoint-3012/optimizer.pt b/trial-5/checkpoint-3012/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..eb5b62653bc37e0704cf26972fd6718323668a1d
--- /dev/null
+++ b/trial-5/checkpoint-3012/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60fb304abd0c5b9d4e6de61faca1856b99e71865a5c592f8acaa47567b9139d9
+size 1196967418
diff --git a/trial-5/checkpoint-3012/rng_state.pth b/trial-5/checkpoint-3012/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b387de0c48181ec5812538ddf1fc60cfda1a89c1
--- /dev/null
+++ b/trial-5/checkpoint-3012/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
+size 14244
diff --git a/trial-5/checkpoint-3012/scheduler.pt b/trial-5/checkpoint-3012/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..478a2e191ea69f4a9f4e75e30b6dd1e8c7827fa8
--- /dev/null
+++ b/trial-5/checkpoint-3012/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c69ec29ae0867d661613f53dea74fb003b51f72db6450102f05c6dfa235171f
+size 1064
diff --git a/trial-5/checkpoint-3012/trainer_state.json b/trial-5/checkpoint-3012/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c471c3d8a2ad97774aac1e6eac580c42f92cf28b
--- /dev/null
+++ b/trial-5/checkpoint-3012/trainer_state.json
@@ -0,0 +1,477 @@
+{
+  "best_metric": 0.0418265163898468,
+  "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-5/checkpoint-3012",
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 3012,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.033200531208499334,
+      "grad_norm": 6.311530113220215,
+      "learning_rate": 1.279094112727349e-06,
+      "loss": 0.7104,
+      "step": 50
+    },
+    {
+      "epoch": 0.06640106241699867,
+      "grad_norm": 17.497058868408203,
+      "learning_rate": 1.2748333062225943e-06,
+      "loss": 0.5729,
+      "step": 100
+    },
+    {
+      "epoch": 0.099601593625498,
+      "grad_norm": 7.590151309967041,
+      "learning_rate": 1.2705724997178397e-06,
+      "loss": 0.4714,
+      "step": 150
+    },
+    {
+      "epoch": 0.13280212483399734,
+      "grad_norm": 6.96728515625,
+      "learning_rate": 1.2663116932130851e-06,
+      "loss": 0.3881,
+      "step": 200
+    },
+    {
+      "epoch": 0.16600265604249667,
+      "grad_norm": 4.9838714599609375,
+      "learning_rate": 1.2620508867083303e-06,
+      "loss": 0.3194,
+      "step": 250
+    },
+    {
+      "epoch": 0.199203187250996,
+      "grad_norm": 6.317371368408203,
+      "learning_rate": 1.2577900802035758e-06,
+      "loss": 0.2976,
+      "step": 300
+    },
+    {
+      "epoch": 0.23240371845949534,
+      "grad_norm": 15.331583023071289,
+      "learning_rate": 1.2535292736988212e-06,
+      "loss": 0.2392,
+      "step": 350
+    },
+    {
+      "epoch": 0.2656042496679947,
+      "grad_norm": 15.493165016174316,
+      "learning_rate": 1.2492684671940664e-06,
+      "loss": 0.2337,
+      "step": 400
+    },
+    {
+      "epoch": 0.29880478087649404,
+      "grad_norm": 3.7081472873687744,
+      "learning_rate": 1.2450076606893118e-06,
+      "loss": 0.2037,
+      "step": 450
+    },
+    {
+      "epoch": 0.33200531208499334,
+      "grad_norm": 4.029483318328857,
+      "learning_rate": 1.240746854184557e-06,
+      "loss": 0.2054,
+      "step": 500
+    },
+    {
+      "epoch": 0.3652058432934927,
+      "grad_norm": 4.573270797729492,
+      "learning_rate": 1.2364860476798024e-06,
+      "loss": 0.1555,
+      "step": 550
+    },
+    {
+      "epoch": 0.398406374501992,
+      "grad_norm": 15.748998641967773,
+      "learning_rate": 1.2322252411750478e-06,
+      "loss": 0.1486,
+      "step": 600
+    },
+    {
+      "epoch": 0.4316069057104914,
+      "grad_norm": 12.240307807922363,
+      "learning_rate": 1.227964434670293e-06,
+      "loss": 0.1552,
+      "step": 650
+    },
+    {
+      "epoch": 0.4648074369189907,
+      "grad_norm": 17.192546844482422,
+      "learning_rate": 1.2237036281655385e-06,
+      "loss": 0.1234,
+      "step": 700
+    },
+    {
+      "epoch": 0.49800796812749004,
+      "grad_norm": 11.04953670501709,
+      "learning_rate": 1.2194428216607839e-06,
+      "loss": 0.1212,
+      "step": 750
+    },
+    {
+      "epoch": 0.5312084993359893,
+      "grad_norm": 4.883615016937256,
+      "learning_rate": 1.215182015156029e-06,
+      "loss": 0.1059,
+      "step": 800
+    },
+    {
+      "epoch": 0.5644090305444888,
+      "grad_norm": 4.633565425872803,
+      "learning_rate": 1.2109212086512745e-06,
+      "loss": 0.0788,
+      "step": 850
+    },
+    {
+      "epoch": 0.5976095617529881,
+      "grad_norm": 2.6228833198547363,
+      "learning_rate": 1.20666040214652e-06,
+      "loss": 0.087,
+      "step": 900
+    },
+    {
+      "epoch": 0.6308100929614874,
+      "grad_norm": 6.4782915115356445,
+      "learning_rate": 1.2023995956417651e-06,
+      "loss": 0.0802,
+      "step": 950
+    },
+    {
+      "epoch": 0.6640106241699867,
+      "grad_norm": 5.229304313659668,
+      "learning_rate": 1.1981387891370103e-06,
+      "loss": 0.077,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6972111553784861,
+      "grad_norm": 6.034313201904297,
+      "learning_rate": 1.1938779826322558e-06,
+      "loss": 0.0703,
+      "step": 1050
+    },
+    {
+      "epoch": 0.7304116865869854,
+      "grad_norm": 9.29736614227295,
+      "learning_rate": 1.1896171761275012e-06,
+      "loss": 0.066,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7636122177954847,
+      "grad_norm": 0.6172637343406677,
+      "learning_rate": 1.1853563696227464e-06,
+      "loss": 0.0692,
+      "step": 1150
+    },
+    {
+      "epoch": 0.796812749003984,
+      "grad_norm": 1.642548680305481,
+      "learning_rate": 1.1810955631179918e-06,
+      "loss": 0.0437,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8300132802124834,
+      "grad_norm": 3.888737916946411,
+      "learning_rate": 1.176834756613237e-06,
+      "loss": 0.0474,
+      "step": 1250
+    },
+    {
+      "epoch": 0.8632138114209827,
+      "grad_norm": 14.787779808044434,
+      "learning_rate": 1.1725739501084824e-06,
+      "loss": 0.0501,
+      "step": 1300
+    },
+    {
+      "epoch": 0.896414342629482,
+      "grad_norm": 0.8571153283119202,
+      "learning_rate": 1.1683131436037278e-06,
+      "loss": 0.0439,
+      "step": 1350
+    },
+    {
+      "epoch": 0.9296148738379814,
+      "grad_norm": 0.6915457248687744,
+      "learning_rate": 1.164052337098973e-06,
+      "loss": 0.0455,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9628154050464808,
+      "grad_norm": 8.8081636428833,
+      "learning_rate": 1.1597915305942185e-06,
+      "loss": 0.0347,
+      "step": 1450
+    },
+    {
+      "epoch": 0.9960159362549801,
+      "grad_norm": 8.551522254943848,
+      "learning_rate": 1.1555307240894639e-06,
+      "loss": 0.0346,
+      "step": 1500
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.982824427480916,
+      "eval_f1": 0.9838970307302017,
+      "eval_loss": 0.05475565418601036,
+      "eval_precision": 0.986134299459291,
+      "eval_recall": 0.982824427480916,
+      "eval_runtime": 31.8933,
+      "eval_samples_per_second": 262.877,
+      "eval_steps_per_second": 8.215,
+      "step": 1506
+    },
+    {
+      "epoch": 1.0292164674634794,
+      "grad_norm": 13.078969955444336,
+      "learning_rate": 1.151269917584709e-06,
+      "loss": 0.0379,
+      "step": 1550
+    },
+    {
+      "epoch": 1.0624169986719787,
+      "grad_norm": 1.906078815460205,
+      "learning_rate": 1.1470091110799545e-06,
+      "loss": 0.0338,
+      "step": 1600
+    },
+    {
+      "epoch": 1.095617529880478,
+      "grad_norm": 0.4020080864429474,
+      "learning_rate": 1.1427483045752e-06,
+      "loss": 0.0298,
+      "step": 1650
+    },
+    {
+      "epoch": 1.1288180610889773,
+      "grad_norm": 2.647258758544922,
+      "learning_rate": 1.1384874980704451e-06,
+      "loss": 0.023,
+      "step": 1700
+    },
+    {
+      "epoch": 1.1620185922974768,
+      "grad_norm": 2.046747922897339,
+      "learning_rate": 1.1342266915656906e-06,
+      "loss": 0.0253,
+      "step": 1750
+    },
+    {
+      "epoch": 1.1952191235059761,
+      "grad_norm": 13.14510726928711,
+      "learning_rate": 1.129965885060936e-06,
+      "loss": 0.0268,
+      "step": 1800
+    },
+    {
+      "epoch": 1.2284196547144755,
+      "grad_norm": 0.12764006853103638,
+      "learning_rate": 1.1257050785561812e-06,
+      "loss": 0.0099,
+      "step": 1850
+    },
+    {
+      "epoch": 1.2616201859229748,
+      "grad_norm": 1.6261545419692993,
+      "learning_rate": 1.1214442720514266e-06,
+      "loss": 0.0252,
+      "step": 1900
+    },
+    {
+      "epoch": 1.294820717131474,
+      "grad_norm": 5.552518844604492,
+      "learning_rate": 1.117183465546672e-06,
+      "loss": 0.036,
+      "step": 1950
+    },
+    {
+      "epoch": 1.3280212483399734,
+      "grad_norm": 24.064516067504883,
+      "learning_rate": 1.1129226590419172e-06,
+      "loss": 0.0169,
+      "step": 2000
+    },
+    {
+      "epoch": 1.361221779548473,
+      "grad_norm": 0.00925782322883606,
+      "learning_rate": 1.1086618525371626e-06,
+      "loss": 0.0184,
+      "step": 2050
+    },
+    {
+      "epoch": 1.3944223107569722,
+      "grad_norm": 16.54283905029297,
+      "learning_rate": 1.1044010460324078e-06,
+      "loss": 0.0139,
+      "step": 2100
+    },
+    {
+      "epoch": 1.4276228419654715,
+      "grad_norm": 0.24406713247299194,
+      "learning_rate": 1.1001402395276533e-06,
+      "loss": 0.0126,
+      "step": 2150
+    },
+    {
+      "epoch": 1.4608233731739708,
+      "grad_norm": 0.02731563337147236,
+      "learning_rate": 1.0958794330228987e-06,
+      "loss": 0.0198,
+      "step": 2200
+    },
+    {
+      "epoch": 1.4940239043824701,
+      "grad_norm": 17.53055191040039,
+      "learning_rate": 1.0916186265181439e-06,
+      "loss": 0.0303,
+      "step": 2250
+    },
+    {
+      "epoch": 1.5272244355909694,
+      "grad_norm": 0.07282107323408127,
+      "learning_rate": 1.0873578200133893e-06,
+      "loss": 0.0016,
+      "step": 2300
+    },
+    {
+      "epoch": 1.5604249667994687,
+      "grad_norm": 20.794416427612305,
+      "learning_rate": 1.0830970135086347e-06,
+      "loss": 0.0225,
+      "step": 2350
+    },
+    {
+      "epoch": 1.593625498007968,
+      "grad_norm": 0.052418053150177,
+      "learning_rate": 1.07883620700388e-06,
+      "loss": 0.0076,
+      "step": 2400
+    },
+    {
+      "epoch": 1.6268260292164674,
+      "grad_norm": 0.21063362061977386,
+      "learning_rate": 1.0745754004991254e-06,
+      "loss": 0.0159,
+      "step": 2450
+    },
+    {
+      "epoch": 1.6600265604249667,
+      "grad_norm": 10.455537796020508,
+      "learning_rate": 1.0703145939943708e-06,
+      "loss": 0.0105,
+      "step": 2500
+    },
+    {
+      "epoch": 1.6932270916334662,
+      "grad_norm": 6.205326557159424,
+      "learning_rate": 1.066053787489616e-06,
+      "loss": 0.0081,
+      "step": 2550
+    },
+    {
+      "epoch": 1.7264276228419655,
+      "grad_norm": 6.523694038391113,
+      "learning_rate": 1.0617929809848614e-06,
+      "loss": 0.0159,
+      "step": 2600
+    },
+    {
+      "epoch": 1.7596281540504648,
+      "grad_norm": 0.010043232701718807,
+      "learning_rate": 1.0575321744801068e-06,
+      "loss": 0.0113,
+      "step": 2650
+    },
+    {
+      "epoch": 1.792828685258964,
+      "grad_norm": 0.00458578672260046,
+      "learning_rate": 1.053271367975352e-06,
+      "loss": 0.0086,
+      "step": 2700
+    },
+    {
+      "epoch": 1.8260292164674636,
+      "grad_norm": 0.10986531525850296,
+      "learning_rate": 1.0490105614705974e-06,
+      "loss": 0.008,
+      "step": 2750
+    },
+    {
+      "epoch": 1.859229747675963,
+      "grad_norm": 0.12284637242555618,
+      "learning_rate": 1.0447497549658429e-06,
+      "loss": 0.0052,
+      "step": 2800
+    },
+    {
+      "epoch": 1.8924302788844622,
+      "grad_norm": 0.14606119692325592,
+      "learning_rate": 1.040488948461088e-06,
+      "loss": 0.0176,
+      "step": 2850
+    },
+    {
+      "epoch": 1.9256308100929616,
+      "grad_norm": 0.020491423085331917,
+      "learning_rate": 1.0362281419563333e-06,
+      "loss": 0.0102,
+      "step": 2900
+    },
+    {
+      "epoch": 1.9588313413014609,
+      "grad_norm": 0.05764462426304817,
+      "learning_rate": 1.0319673354515787e-06,
+      "loss": 0.0044,
+      "step": 2950
+    },
+    {
+      "epoch": 1.9920318725099602,
+      "grad_norm": 0.7329011559486389,
+      "learning_rate": 1.027706528946824e-06,
+      "loss": 0.0139,
+      "step": 3000
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.9924856870229007,
+      "eval_f1": 0.9924235722235019,
+      "eval_loss": 0.0418265163898468,
+      "eval_precision": 0.9923830636545329,
+      "eval_recall": 0.9924856870229007,
+      "eval_runtime": 31.6222,
+      "eval_samples_per_second": 265.131,
+      "eval_steps_per_second": 8.285,
+      "step": 3012
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 15060,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.282861088518144e+16,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/trial-5/checkpoint-3012/training_args.bin b/trial-5/checkpoint-3012/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4692f33409907d8d382ead52b8f9423cf80dd960
--- /dev/null
+++ b/trial-5/checkpoint-3012/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b5a07ff58876babfad1d92462cc9e7062c8f5b0af8d8ba9142ab6f5e8880cf2
+size 5368
diff --git a/trial-6/checkpoint-6022/config.json b/trial-6/checkpoint-6022/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00
--- /dev/null
+++ b/trial-6/checkpoint-6022/config.json
@@ -0,0 +1,47 @@
+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}
diff --git a/trial-6/checkpoint-6022/model.safetensors b/trial-6/checkpoint-6022/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f3124a5206ff69a1d2328df7fd8330a3b17025d0
--- /dev/null
+++ b/trial-6/checkpoint-6022/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a60e2fc558ad0e5a9a4825234c28006f4c14c02aab969b5ebf7cb43d8f890d9e
+size 598439784
diff --git a/trial-6/checkpoint-6022/optimizer.pt b/trial-6/checkpoint-6022/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..df6eee38753d5beb9baf27e2f30a8ffe07b42512
--- /dev/null
+++ b/trial-6/checkpoint-6022/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ccfa5cc878422afe6f38c7ea21cef7e9f532ec15d2d9169693197daa8b04fb0
+size 1196967418
diff --git a/trial-6/checkpoint-6022/rng_state.pth b/trial-6/checkpoint-6022/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b387de0c48181ec5812538ddf1fc60cfda1a89c1
--- /dev/null
+++ b/trial-6/checkpoint-6022/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
+size 14244
diff --git a/trial-6/checkpoint-6022/scheduler.pt b/trial-6/checkpoint-6022/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2e8e492dc07ae1b174c07babfc899bd22becedc0
--- /dev/null
+++ b/trial-6/checkpoint-6022/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f20e2229466448de860622239acb9999c7ec64084a1decca7269c9cb3644988
+size 1064
diff --git a/trial-6/checkpoint-6022/trainer_state.json b/trial-6/checkpoint-6022/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..693b4b5a4df0a852da03bc416238e936836f6edb
--- /dev/null
+++ b/trial-6/checkpoint-6022/trainer_state.json
@@ -0,0 +1,897 @@
+{
+  "best_metric": 0.03524520993232727,
+  "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-6/checkpoint-6022",
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 6022,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016605778811026237,
+      "grad_norm": 7.985777378082275,
+      "learning_rate": 3.663949947127632e-06,
+      "loss": 0.6449,
+      "step": 50
+    },
+    {
+      "epoch": 0.033211557622052475,
+      "grad_norm": 16.946643829345703,
+      "learning_rate": 3.64867585196702e-06,
+      "loss": 0.4098,
+      "step": 100
+    },
+    {
+      "epoch": 0.04981733643307871,
+      "grad_norm": 15.02230167388916,
+      "learning_rate": 3.633401756806408e-06,
+      "loss": 0.2997,
+      "step": 150
+    },
+    {
+      "epoch": 0.06642311524410495,
+      "grad_norm": 2.651068925857544,
+      "learning_rate": 3.6181276616457957e-06,
+      "loss": 0.2322,
+      "step": 200
+    },
+    {
+      "epoch": 0.08302889405513118,
+      "grad_norm": 71.46488189697266,
+      "learning_rate": 3.602853566485183e-06,
+      "loss": 0.1922,
+      "step": 250
+    },
+    {
+      "epoch": 0.09963467286615742,
+      "grad_norm": 2.4328176975250244,
+      "learning_rate": 3.5875794713245715e-06,
+      "loss": 0.1731,
+      "step": 300
+    },
+    {
+      "epoch": 0.11624045167718366,
+      "grad_norm": 8.744805335998535,
+      "learning_rate": 3.5723053761639594e-06,
+      "loss": 0.1264,
+      "step": 350
+    },
+    {
+      "epoch": 0.1328462304882099,
+      "grad_norm": 10.860421180725098,
+      "learning_rate": 3.557031281003347e-06,
+      "loss": 0.1423,
+      "step": 400
+    },
+    {
+      "epoch": 0.14945200929923613,
+      "grad_norm": 1.3849588632583618,
+      "learning_rate": 3.5417571858427352e-06,
+      "loss": 0.1,
+      "step": 450
+    },
+    {
+      "epoch": 0.16605778811026237,
+      "grad_norm": 18.67996597290039,
+      "learning_rate": 3.526483090682123e-06,
+      "loss": 0.1297,
+      "step": 500
+    },
+    {
+      "epoch": 0.1826635669212886,
+      "grad_norm": 22.31239128112793,
+      "learning_rate": 3.5112089955215106e-06,
+      "loss": 0.1266,
+      "step": 550
+    },
+    {
+      "epoch": 0.19926934573231483,
+      "grad_norm": 7.551675319671631,
+      "learning_rate": 3.4959349003608985e-06,
+      "loss": 0.0872,
+      "step": 600
+    },
+    {
+      "epoch": 0.2158751245433411,
+      "grad_norm": 0.4732609987258911,
+      "learning_rate": 3.480660805200287e-06,
+      "loss": 0.0735,
+      "step": 650
+    },
+    {
+      "epoch": 0.23248090335436733,
+      "grad_norm": 0.4966350495815277,
+      "learning_rate": 3.4653867100396748e-06,
+      "loss": 0.1583,
+      "step": 700
+    },
+    {
+      "epoch": 0.24908668216539356,
+      "grad_norm": 0.5777727961540222,
+      "learning_rate": 3.4501126148790623e-06,
+      "loss": 0.0954,
+      "step": 750
+    },
+    {
+      "epoch": 0.2656924609764198,
+      "grad_norm": 3.709627389907837,
+      "learning_rate": 3.4348385197184506e-06,
+      "loss": 0.07,
+      "step": 800
+    },
+    {
+      "epoch": 0.282298239787446,
+      "grad_norm": 7.013435363769531,
+      "learning_rate": 3.4195644245578385e-06,
+      "loss": 0.1039,
+      "step": 850
+    },
+    {
+      "epoch": 0.29890401859847227,
+      "grad_norm": 0.41413068771362305,
+      "learning_rate": 3.404290329397226e-06,
+      "loss": 0.0699,
+      "step": 900
+    },
+    {
+      "epoch": 0.3155097974094985,
+      "grad_norm": 0.23823711276054382,
+      "learning_rate": 3.3890162342366143e-06,
+      "loss": 0.0836,
+      "step": 950
+    },
+    {
+      "epoch": 0.33211557622052473,
+      "grad_norm": 0.011693170294165611,
+      "learning_rate": 3.3737421390760022e-06,
+      "loss": 0.0571,
+      "step": 1000
+    },
+    {
+      "epoch": 0.348721355031551,
+      "grad_norm": 0.003961833659559488,
+      "learning_rate": 3.3584680439153897e-06,
+      "loss": 0.0516,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3653271338425772,
+      "grad_norm": 0.007026594132184982,
+      "learning_rate": 3.3431939487547776e-06,
+      "loss": 0.0723,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38193291265360346,
+      "grad_norm": 0.024607744067907333,
+      "learning_rate": 3.327919853594166e-06,
+      "loss": 0.0412,
+      "step": 1150
+    },
+    {
+      "epoch": 0.39853869146462967,
+      "grad_norm": 0.005524761509150267,
+      "learning_rate": 3.3126457584335534e-06,
+      "loss": 0.0246,
+      "step": 1200
+    },
+    {
+      "epoch": 0.41514447027565593,
+      "grad_norm": 0.13576415181159973,
+      "learning_rate": 3.2973716632729413e-06,
+      "loss": 0.0349,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4317502490866822,
+      "grad_norm": 3.3155462741851807,
+      "learning_rate": 3.2820975681123297e-06,
+      "loss": 0.0547,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4483560278977084,
+      "grad_norm": 0.3045770823955536,
+      "learning_rate": 3.266823472951717e-06,
+      "loss": 0.0406,
+      "step": 1350
+    },
+    {
+      "epoch": 0.46496180670873466,
+      "grad_norm": 0.003651317674666643,
+      "learning_rate": 3.251549377791105e-06,
+      "loss": 0.0064,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48156758551976087,
+      "grad_norm": 0.06915970891714096,
+      "learning_rate": 3.236275282630493e-06,
+      "loss": 0.0259,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4981733643307871,
+      "grad_norm": 0.006649952847510576,
+      "learning_rate": 3.221001187469881e-06,
+      "loss": 0.0209,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5147791431418134,
+      "grad_norm": 0.0077498299069702625,
+      "learning_rate": 3.2057270923092688e-06,
+      "loss": 0.0128,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5313849219528396,
+      "grad_norm": 17.972652435302734,
+      "learning_rate": 3.1904529971486567e-06,
+      "loss": 0.0522,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5479907007638658,
+      "grad_norm": 0.02570178173482418,
+      "learning_rate": 3.1751789019880446e-06,
+      "loss": 0.0236,
+      "step": 1650
+    },
+    {
+      "epoch": 0.564596479574892,
+      "grad_norm": 0.01210557110607624,
+      "learning_rate": 3.1599048068274325e-06,
+      "loss": 0.0116,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5812022583859183,
+      "grad_norm": 0.0021121352910995483,
+      "learning_rate": 3.1446307116668204e-06,
+      "loss": 0.004,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5978080371969445,
+      "grad_norm": 0.0030424538999795914,
+      "learning_rate": 3.129356616506208e-06,
+      "loss": 0.0086,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6144138160079707,
+      "grad_norm": 0.01042268518358469,
+      "learning_rate": 3.114082521345596e-06,
+      "loss": 0.0302,
+      "step": 1850
+    },
+    {
+      "epoch": 0.631019594818997,
+      "grad_norm": 1.67741858959198,
+      "learning_rate": 3.098808426184984e-06,
+      "loss": 0.0318,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6476253736300233,
+      "grad_norm": 3.324981689453125,
+      "learning_rate": 3.083534331024372e-06,
+      "loss": 0.0308,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6642311524410495,
+      "grad_norm": 0.05241026356816292,
+      "learning_rate": 3.06826023586376e-06,
+      "loss": 0.0229,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6808369312520757,
+      "grad_norm": 0.09731736034154892,
+      "learning_rate": 3.052986140703148e-06,
+      "loss": 0.027,
+      "step": 2050
+    },
+    {
+      "epoch": 0.697442710063102,
+      "grad_norm": 0.09534373879432678,
+      "learning_rate": 3.0377120455425357e-06,
+      "loss": 0.0278,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7140484888741282,
+      "grad_norm": 0.028926922008395195,
+      "learning_rate": 3.0224379503819232e-06,
+      "loss": 0.0195,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7306542676851544,
+      "grad_norm": 0.003165798494592309,
+      "learning_rate": 3.0071638552213115e-06,
+      "loss": 0.0127,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7472600464961807,
+      "grad_norm": 1.1501398086547852,
+      "learning_rate": 2.9918897600606995e-06,
+      "loss": 0.0228,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7638658253072069,
+      "grad_norm": 0.015178106725215912,
+      "learning_rate": 2.976615664900087e-06,
+      "loss": 0.0211,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7804716041182331,
+      "grad_norm": 0.043032608926296234,
+      "learning_rate": 2.9613415697394753e-06,
+      "loss": 0.0007,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7970773829292593,
+      "grad_norm": 0.0029065206181257963,
+      "learning_rate": 2.946067474578863e-06,
+      "loss": 0.0083,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8136831617402857,
+      "grad_norm": 0.006119361147284508,
+      "learning_rate": 2.9307933794182507e-06,
+      "loss": 0.0005,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8302889405513119,
+      "grad_norm": 0.0004126826534047723,
+      "learning_rate": 2.915519284257639e-06,
+      "loss": 0.002,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8468947193623381,
+      "grad_norm": 0.0028823954053223133,
+      "learning_rate": 2.900245189097027e-06,
+      "loss": 0.0105,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8635004981733644,
+      "grad_norm": 0.03489808738231659,
+      "learning_rate": 2.8849710939364144e-06,
+      "loss": 0.0132,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8801062769843906,
+      "grad_norm": 0.0013511483557522297,
+      "learning_rate": 2.8696969987758023e-06,
+      "loss": 0.0018,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8967120557954168,
+      "grad_norm": 0.00025652506155893207,
+      "learning_rate": 2.8544229036151906e-06,
+      "loss": 0.0059,
+      "step": 2700
+    },
+    {
+      "epoch": 0.913317834606443,
+      "grad_norm": 0.005165331996977329,
+      "learning_rate": 2.839148808454578e-06,
+      "loss": 0.0063,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9299236134174693,
+      "grad_norm": 0.00012050822988385335,
+      "learning_rate": 2.823874713293966e-06,
+      "loss": 0.0009,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9465293922284955,
+      "grad_norm": 0.026708438992500305,
+      "learning_rate": 2.8086006181333543e-06,
+      "loss": 0.0,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9631351710395217,
+      "grad_norm": 35.39009475708008,
+      "learning_rate": 2.793326522972742e-06,
+      "loss": 0.0154,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9797409498505479,
+      "grad_norm": 143.70465087890625,
+      "learning_rate": 2.7780524278121297e-06,
+      "loss": 0.0299,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9963467286615743,
+      "grad_norm": 0.00032583833672106266,
+      "learning_rate": 2.7627783326515176e-06,
+      "loss": 0.0058,
+      "step": 3000
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.9959446564885496,
+      "eval_f1": 0.9958699330259847,
+      "eval_loss": 0.03994645178318024,
+      "eval_precision": 0.9959073754230947,
+      "eval_recall": 0.9959446564885496,
+      "eval_runtime": 36.9535,
+      "eval_samples_per_second": 226.88,
+      "eval_steps_per_second": 14.18,
+      "step": 3011
+    },
+    {
+      "epoch": 1.0129525074726005,
+      "grad_norm": 0.00038893838063813746,
+      "learning_rate": 2.7475042374909055e-06,
+      "loss": 0.0094,
+      "step": 3050
+    },
+    {
+      "epoch": 1.0295582862836268,
+      "grad_norm": 0.0011424238327890635,
+      "learning_rate": 2.7322301423302934e-06,
+      "loss": 0.0001,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0461640650946529,
+      "grad_norm": 0.01706782355904579,
+      "learning_rate": 2.7169560471696814e-06,
+      "loss": 0.0041,
+      "step": 3150
+    },
+    {
+      "epoch": 1.0627698439056792,
+      "grad_norm": 0.00026497532962821424,
+      "learning_rate": 2.7016819520090697e-06,
+      "loss": 0.0001,
+      "step": 3200
+    },
+    {
+      "epoch": 1.0793756227167055,
+      "grad_norm": 0.4866068363189697,
+      "learning_rate": 2.686407856848457e-06,
+      "loss": 0.026,
+      "step": 3250
+    },
+    {
+      "epoch": 1.0959814015277316,
+      "grad_norm": 8.705830987310037e-05,
+      "learning_rate": 2.671133761687845e-06,
+      "loss": 0.0156,
+      "step": 3300
+    },
+    {
+      "epoch": 1.112587180338758,
+      "grad_norm": 0.004105957690626383,
+      "learning_rate": 2.655859666527233e-06,
+      "loss": 0.0028,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1291929591497842,
+      "grad_norm": 0.001343347830697894,
+      "learning_rate": 2.640585571366621e-06,
+      "loss": 0.0115,
+      "step": 3400
+    },
+    {
+      "epoch": 1.1457987379608103,
+      "grad_norm": 8.608686039224267e-05,
+      "learning_rate": 2.625311476206009e-06,
+      "loss": 0.0065,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1624045167718366,
+      "grad_norm": 0.001792514231055975,
+      "learning_rate": 2.6100373810453967e-06,
+      "loss": 0.0078,
+      "step": 3500
+    },
+    {
+      "epoch": 1.1790102955828627,
+      "grad_norm": 0.0008498657844029367,
+      "learning_rate": 2.5947632858847846e-06,
+      "loss": 0.0,
+      "step": 3550
+    },
+    {
+      "epoch": 1.195616074393889,
+      "grad_norm": 0.012572677806019783,
+      "learning_rate": 2.5794891907241725e-06,
+      "loss": 0.0005,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2122218532049154,
+      "grad_norm": 0.0010890236590057611,
+      "learning_rate": 2.5642150955635604e-06,
+      "loss": 0.0,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2288276320159415,
+      "grad_norm": 0.0009271232993341982,
+      "learning_rate": 2.548941000402948e-06,
+      "loss": 0.0,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2454334108269678,
+      "grad_norm": 0.0008255397551693022,
+      "learning_rate": 2.5336669052423362e-06,
+      "loss": 0.0155,
+      "step": 3750
+    },
+    {
+      "epoch": 1.2620391896379939,
+      "grad_norm": 0.0051245614886283875,
+      "learning_rate": 2.518392810081724e-06,
+      "loss": 0.0022,
+      "step": 3800
+    },
+    {
+      "epoch": 1.2786449684490202,
+      "grad_norm": 0.01625339686870575,
+      "learning_rate": 2.5031187149211116e-06,
+      "loss": 0.006,
+      "step": 3850
+    },
+    {
+      "epoch": 1.2952507472600465,
+      "grad_norm": 0.0009482129826210439,
+      "learning_rate": 2.4878446197605e-06,
+      "loss": 0.0001,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3118565260710726,
+      "grad_norm": 0.00012260080256965011,
+      "learning_rate": 2.472570524599888e-06,
+      "loss": 0.0001,
+      "step": 3950
+    },
+    {
+      "epoch": 1.328462304882099,
+      "grad_norm": 0.0005531097413040698,
+      "learning_rate": 2.4572964294392753e-06,
+      "loss": 0.0206,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3450680836931252,
+      "grad_norm": 0.00046819329145364463,
+      "learning_rate": 2.4420223342786637e-06,
+      "loss": 0.0015,
+      "step": 4050
+    },
+    {
+      "epoch": 1.3616738625041513,
+      "grad_norm": 0.0008780017960816622,
+      "learning_rate": 2.4267482391180516e-06,
+      "loss": 0.0004,
+      "step": 4100
+    },
+    {
+      "epoch": 1.3782796413151777,
+      "grad_norm": 0.0001749313232721761,
+      "learning_rate": 2.411474143957439e-06,
+      "loss": 0.0097,
+      "step": 4150
+    },
+    {
+      "epoch": 1.394885420126204,
+      "grad_norm": 0.0004841366899199784,
+      "learning_rate": 2.396200048796827e-06,
+      "loss": 0.0,
+      "step": 4200
+    },
+    {
+      "epoch": 1.41149119893723,
+      "grad_norm": 0.0015521385939791799,
+      "learning_rate": 2.3809259536362153e-06,
+      "loss": 0.0006,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4280969777482564,
+      "grad_norm": 0.0003654654719866812,
+      "learning_rate": 2.3656518584756028e-06,
+      "loss": 0.0007,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4447027565592827,
+      "grad_norm": 0.0011144432937726378,
+      "learning_rate": 2.3503777633149907e-06,
+      "loss": 0.0131,
+      "step": 4350
+    },
+    {
+      "epoch": 1.4613085353703088,
+      "grad_norm": 0.011592933908104897,
+      "learning_rate": 2.335103668154379e-06,
+      "loss": 0.0036,
+      "step": 4400
+    },
+    {
+      "epoch": 1.4779143141813351,
+      "grad_norm": 0.026564495638012886,
+      "learning_rate": 2.319829572993767e-06,
+      "loss": 0.0001,
+      "step": 4450
+    },
+    {
+      "epoch": 1.4945200929923614,
+      "grad_norm": 0.003402173984795809,
+      "learning_rate": 2.3045554778331544e-06,
+      "loss": 0.0052,
+      "step": 4500
+    },
+    {
+      "epoch": 1.5111258718033875,
+      "grad_norm": 0.0031449920497834682,
+      "learning_rate": 2.2892813826725423e-06,
+      "loss": 0.0,
+      "step": 4550
+    },
+    {
+      "epoch": 1.5277316506144138,
+      "grad_norm": 0.000741615192964673,
+      "learning_rate": 2.2740072875119306e-06,
+      "loss": 0.0,
+      "step": 4600
+    },
+    {
+      "epoch": 1.5443374294254402,
+      "grad_norm": 0.00041236402466893196,
+      "learning_rate": 2.258733192351318e-06,
+      "loss": 0.0038,
+      "step": 4650
+    },
+    {
+      "epoch": 1.5609432082364663,
+      "grad_norm": 0.02141823247075081,
+      "learning_rate": 2.243459097190706e-06,
+      "loss": 0.0,
+      "step": 4700
+    },
+    {
+      "epoch": 1.5775489870474926,
+      "grad_norm": 0.00028093060245737433,
+      "learning_rate": 2.2281850020300944e-06,
+      "loss": 0.0,
+      "step": 4750
+    },
+    {
+      "epoch": 1.594154765858519,
+      "grad_norm": 0.0016667908057570457,
+      "learning_rate": 2.212910906869482e-06,
+      "loss": 0.0,
+      "step": 4800
+    },
+    {
+      "epoch": 1.610760544669545,
+      "grad_norm": 0.0005989013588987291,
+      "learning_rate": 2.1976368117088698e-06,
+      "loss": 0.0,
+      "step": 4850
+    },
+    {
+      "epoch": 1.627366323480571,
+      "grad_norm": 0.0007902457728050649,
+      "learning_rate": 2.1823627165482577e-06,
+      "loss": 0.0023,
+      "step": 4900
+    },
+    {
+      "epoch": 1.6439721022915976,
+      "grad_norm": 6.165813829284161e-05,
+      "learning_rate": 2.1670886213876456e-06,
+      "loss": 0.0,
+      "step": 4950
+    },
+    {
+      "epoch": 1.6605778811026237,
+      "grad_norm": 0.0002811133745126426,
+      "learning_rate": 2.1518145262270335e-06,
+      "loss": 0.0,
+      "step": 5000
+    },
+    {
+      "epoch": 1.6771836599136498,
+      "grad_norm": 0.00117580930236727,
+      "learning_rate": 2.1365404310664214e-06,
+      "loss": 0.0,
+      "step": 5050
+    },
+    {
+      "epoch": 1.6937894387246761,
+      "grad_norm": 3.999446926172823e-05,
+      "learning_rate": 2.1212663359058093e-06,
+      "loss": 0.0,
+      "step": 5100
+    },
+    {
+      "epoch": 1.7103952175357025,
+      "grad_norm": 0.0005360537325032055,
+      "learning_rate": 2.105992240745197e-06,
+      "loss": 0.0,
+      "step": 5150
+    },
+    {
+      "epoch": 1.7270009963467285,
+      "grad_norm": 0.8813786506652832,
+      "learning_rate": 2.090718145584585e-06,
+      "loss": 0.0137,
+      "step": 5200
+    },
+    {
+      "epoch": 1.7436067751577549,
+      "grad_norm": 0.046743907034397125,
+      "learning_rate": 2.0754440504239726e-06,
+      "loss": 0.0071,
+      "step": 5250
+    },
+    {
+      "epoch": 1.7602125539687812,
+      "grad_norm": 0.00034072858397848904,
+      "learning_rate": 2.060169955263361e-06,
+      "loss": 0.0119,
+      "step": 5300
+    },
+    {
+      "epoch": 1.7768183327798073,
+      "grad_norm": 0.007919142954051495,
+      "learning_rate": 2.044895860102749e-06,
+      "loss": 0.0142,
+      "step": 5350
+    },
+    {
+      "epoch": 1.7934241115908336,
+      "grad_norm": 0.0014535001246258616,
+      "learning_rate": 2.0296217649421363e-06,
+      "loss": 0.0,
+      "step": 5400
+    },
+    {
+      "epoch": 1.81002989040186,
+      "grad_norm": 0.005964061710983515,
+      "learning_rate": 2.0143476697815246e-06,
+      "loss": 0.0006,
+      "step": 5450
+    },
+    {
+      "epoch": 1.826635669212886,
+      "grad_norm": 0.0018063083989545703,
+      "learning_rate": 1.9990735746209125e-06,
+      "loss": 0.0,
+      "step": 5500
+    },
+    {
+      "epoch": 1.8432414480239123,
+      "grad_norm": 0.00010997989738825709,
+      "learning_rate": 1.9837994794603e-06,
+      "loss": 0.0,
+      "step": 5550
+    },
+    {
+      "epoch": 1.8598472268349386,
+      "grad_norm": 0.004599976819008589,
+      "learning_rate": 1.968525384299688e-06,
+      "loss": 0.0,
+      "step": 5600
+    },
+    {
+      "epoch": 1.8764530056459647,
+      "grad_norm": 0.00086441938765347,
+      "learning_rate": 1.9532512891390763e-06,
+      "loss": 0.0066,
+      "step": 5650
+    },
+    {
+      "epoch": 1.893058784456991,
+      "grad_norm": 0.0016492039430886507,
+      "learning_rate": 1.937977193978464e-06,
+      "loss": 0.0019,
+      "step": 5700
+    },
+    {
+      "epoch": 1.9096645632680174,
+      "grad_norm": 4.4919357605976984e-05,
+      "learning_rate": 1.9227030988178516e-06,
+      "loss": 0.0,
+      "step": 5750
+    },
+    {
+      "epoch": 1.9262703420790435,
+      "grad_norm": 1.0092087984085083,
+      "learning_rate": 1.90742900365724e-06,
+      "loss": 0.0049,
+      "step": 5800
+    },
+    {
+      "epoch": 1.9428761208900698,
+      "grad_norm": 0.0003246455453336239,
+      "learning_rate": 1.8921549084966279e-06,
+      "loss": 0.0002,
+      "step": 5850
+    },
+    {
+      "epoch": 1.959481899701096,
+      "grad_norm": 0.0008708651876077056,
+      "learning_rate": 1.8768808133360154e-06,
+      "loss": 0.0215,
+      "step": 5900
+    },
+    {
+      "epoch": 1.9760876785121222,
+      "grad_norm": 0.003138365224003792,
+      "learning_rate": 1.8616067181754035e-06,
+      "loss": 0.0,
+      "step": 5950
+    },
+    {
+      "epoch": 1.9926934573231485,
+      "grad_norm": 10.618377685546875,
+      "learning_rate": 1.8463326230147916e-06,
+      "loss": 0.0306,
+      "step": 6000
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.9964217557251909,
+      "eval_f1": 0.9963727955130689,
+      "eval_loss": 0.03524520993232727,
+      "eval_precision": 0.9963861752950809,
+      "eval_recall": 0.9964217557251909,
+      "eval_runtime": 38.2329,
+      "eval_samples_per_second": 219.287,
+      "eval_steps_per_second": 13.705,
+      "step": 6022
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 12044,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.282861088518144e+16,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/trial-6/checkpoint-6022/training_args.bin b/trial-6/checkpoint-6022/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8ff64b4b12daf1b9553500a47074a85378615d75
--- /dev/null
+++ b/trial-6/checkpoint-6022/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd2e4fb9a115884edb87825aefbd32c53d93671f7c6430a41871a9ca795015e8
+size 5368
diff --git a/trial-7/checkpoint-6022/config.json b/trial-7/checkpoint-6022/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00
--- /dev/null
+++ b/trial-7/checkpoint-6022/config.json
@@ -0,0 +1,47 @@
+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}
diff --git a/trial-7/checkpoint-6022/model.safetensors b/trial-7/checkpoint-6022/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ba72213ea6cdd9fa00f45ebe5294757054d14ecb
--- /dev/null
+++ b/trial-7/checkpoint-6022/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:999491f80f5e7d65107af9b9c39856993ea382919338a256f506e02282154a73
+size 598439784
diff --git a/trial-7/checkpoint-6022/optimizer.pt b/trial-7/checkpoint-6022/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8556aa85feb6b7c96bcabe93d030e972640a17db
--- /dev/null
+++ b/trial-7/checkpoint-6022/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f58dae64786998ee14f679a5c11f25c6e8e826a9be7e1dc502a4960619f5b73d
+size 1196967418
diff --git a/trial-7/checkpoint-6022/rng_state.pth b/trial-7/checkpoint-6022/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b387de0c48181ec5812538ddf1fc60cfda1a89c1
--- /dev/null
+++ b/trial-7/checkpoint-6022/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
+size 14244
diff --git a/trial-7/checkpoint-6022/scheduler.pt b/trial-7/checkpoint-6022/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..feffa73e78d93c8ae57663715e45d3c3914fb368
--- /dev/null
+++ b/trial-7/checkpoint-6022/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c7a8af5cc58476915589f4479c98012a6f77921d3365ad0d2b9efb681952de5
+size 1064
diff --git a/trial-7/checkpoint-6022/trainer_state.json b/trial-7/checkpoint-6022/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c437b8cf7dba6cc29f8d97e55828db555f44696f
--- /dev/null
+++ b/trial-7/checkpoint-6022/trainer_state.json
@@ -0,0 +1,897 @@
+{
+  "best_metric": 0.021904850378632545,
+  "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-7/checkpoint-6022",
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 6022,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016605778811026237,
+      "grad_norm": 4.708388328552246,
+      "learning_rate": 4.332282504686465e-05,
+      "loss": 0.3911,
+      "step": 50
+    },
+    {
+      "epoch": 0.033211557622052475,
+      "grad_norm": 3.693350315093994,
+      "learning_rate": 4.3142222974800016e-05,
+      "loss": 0.1288,
+      "step": 100
+    },
+    {
+      "epoch": 0.04981733643307871,
+      "grad_norm": 27.80695915222168,
+      "learning_rate": 4.2961620902735386e-05,
+      "loss": 0.1,
+      "step": 150
+    },
+    {
+      "epoch": 0.06642311524410495,
+      "grad_norm": 0.014199809171259403,
+      "learning_rate": 4.278101883067075e-05,
+      "loss": 0.0817,
+      "step": 200
+    },
+    {
+      "epoch": 0.08302889405513118,
+      "grad_norm": 0.024310972541570663,
+      "learning_rate": 4.260041675860611e-05,
+      "loss": 0.0234,
+      "step": 250
+    },
+    {
+      "epoch": 0.09963467286615742,
+      "grad_norm": 0.0038154239300638437,
+      "learning_rate": 4.2419814686541476e-05,
+      "loss": 0.0781,
+      "step": 300
+    },
+    {
+      "epoch": 0.11624045167718366,
+      "grad_norm": 0.002823322080075741,
+      "learning_rate": 4.2239212614476846e-05,
+      "loss": 0.0394,
+      "step": 350
+    },
+    {
+      "epoch": 0.1328462304882099,
+      "grad_norm": 12.188178062438965,
+      "learning_rate": 4.20586105424122e-05,
+      "loss": 0.0786,
+      "step": 400
+    },
+    {
+      "epoch": 0.14945200929923613,
+      "grad_norm": 0.02171366475522518,
+      "learning_rate": 4.187800847034757e-05,
+      "loss": 0.0337,
+      "step": 450
+    },
+    {
+      "epoch": 0.16605778811026237,
+      "grad_norm": 1.8111064434051514,
+      "learning_rate": 4.1697406398282937e-05,
+      "loss": 0.0427,
+      "step": 500
+    },
+    {
+      "epoch": 0.1826635669212886,
+      "grad_norm": 0.8548564910888672,
+      "learning_rate": 4.15168043262183e-05,
+      "loss": 0.0733,
+      "step": 550
+    },
+    {
+      "epoch": 0.19926934573231483,
+      "grad_norm": 0.03964327648282051,
+      "learning_rate": 4.133620225415367e-05,
+      "loss": 0.0411,
+      "step": 600
+    },
+    {
+      "epoch": 0.2158751245433411,
+      "grad_norm": 0.012529165484011173,
+      "learning_rate": 4.1155600182089034e-05,
+      "loss": 0.0453,
+      "step": 650
+    },
+    {
+      "epoch": 0.23248090335436733,
+      "grad_norm": 0.042264923453330994,
+      "learning_rate": 4.09749981100244e-05,
+      "loss": 0.0338,
+      "step": 700
+    },
+    {
+      "epoch": 0.24908668216539356,
+      "grad_norm": 0.004559572786092758,
+      "learning_rate": 4.079439603795976e-05,
+      "loss": 0.0467,
+      "step": 750
+    },
+    {
+      "epoch": 0.2656924609764198,
+      "grad_norm": 0.002454261528328061,
+      "learning_rate": 4.061379396589513e-05,
+      "loss": 0.0228,
+      "step": 800
+    },
+    {
+      "epoch": 0.282298239787446,
+      "grad_norm": 2.4341135025024414,
+      "learning_rate": 4.0433191893830494e-05,
+      "loss": 0.0572,
+      "step": 850
+    },
+    {
+      "epoch": 0.29890401859847227,
+      "grad_norm": 0.15002170205116272,
+      "learning_rate": 4.025258982176586e-05,
+      "loss": 0.0244,
+      "step": 900
+    },
+    {
+      "epoch": 0.3155097974094985,
+      "grad_norm": 0.00873472262173891,
+      "learning_rate": 4.007198774970122e-05,
+      "loss": 0.0207,
+      "step": 950
+    },
+    {
+      "epoch": 0.33211557622052473,
+      "grad_norm": 0.07583663612604141,
+      "learning_rate": 3.989138567763659e-05,
+      "loss": 0.0233,
+      "step": 1000
+    },
+    {
+      "epoch": 0.348721355031551,
+      "grad_norm": 0.002584233647212386,
+      "learning_rate": 3.9710783605571955e-05,
+      "loss": 0.0228,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3653271338425772,
+      "grad_norm": 0.025093793869018555,
+      "learning_rate": 3.953018153350732e-05,
+      "loss": 0.0393,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38193291265360346,
+      "grad_norm": 0.26810237765312195,
+      "learning_rate": 3.934957946144268e-05,
+      "loss": 0.0213,
+      "step": 1150
+    },
+    {
+      "epoch": 0.39853869146462967,
+      "grad_norm": 0.0005972657818347216,
+      "learning_rate": 3.9168977389378045e-05,
+      "loss": 0.0099,
+      "step": 1200
+    },
+    {
+      "epoch": 0.41514447027565593,
+      "grad_norm": 2.7341020107269287,
+      "learning_rate": 3.8988375317313415e-05,
+      "loss": 0.0165,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4317502490866822,
+      "grad_norm": 30.906461715698242,
+      "learning_rate": 3.880777324524878e-05,
+      "loss": 0.0481,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4483560278977084,
+      "grad_norm": 0.07481276988983154,
+      "learning_rate": 3.862717117318414e-05,
+      "loss": 0.0249,
+      "step": 1350
+    },
+    {
+      "epoch": 0.46496180670873466,
+      "grad_norm": 0.04692293331027031,
+      "learning_rate": 3.8446569101119505e-05,
+      "loss": 0.0047,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48156758551976087,
+      "grad_norm": 0.008200657553970814,
+      "learning_rate": 3.8265967029054876e-05,
+      "loss": 0.0244,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4981733643307871,
+      "grad_norm": 0.006684092804789543,
+      "learning_rate": 3.808536495699023e-05,
+      "loss": 0.0285,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5147791431418134,
+      "grad_norm": 0.00020126289746258408,
+      "learning_rate": 3.79047628849256e-05,
+      "loss": 0.0187,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5313849219528396,
+      "grad_norm": 0.5489906668663025,
+      "learning_rate": 3.7724160812860966e-05,
+      "loss": 0.0245,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5479907007638658,
+      "grad_norm": 0.01115335151553154,
+      "learning_rate": 3.754355874079633e-05,
+      "loss": 0.0317,
+      "step": 1650
+    },
+    {
+      "epoch": 0.564596479574892,
+      "grad_norm": 0.0077936286106705666,
+      "learning_rate": 3.73629566687317e-05,
+      "loss": 0.0044,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5812022583859183,
+      "grad_norm": 48.291107177734375,
+      "learning_rate": 3.718235459666706e-05,
+      "loss": 0.0039,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5978080371969445,
+      "grad_norm": 0.005009625572711229,
+      "learning_rate": 3.7001752524602426e-05,
+      "loss": 0.0119,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6144138160079707,
+      "grad_norm": 0.0016993529861792922,
+      "learning_rate": 3.682115045253779e-05,
+      "loss": 0.0111,
+      "step": 1850
+    },
+    {
+      "epoch": 0.631019594818997,
+      "grad_norm": 0.03398797661066055,
+      "learning_rate": 3.664054838047316e-05,
+      "loss": 0.0155,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6476253736300233,
+      "grad_norm": 0.0077589512802660465,
+      "learning_rate": 3.6459946308408524e-05,
+      "loss": 0.0014,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6642311524410495,
+      "grad_norm": 0.0004693228402175009,
+      "learning_rate": 3.627934423634389e-05,
+      "loss": 0.0209,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6808369312520757,
+      "grad_norm": 0.0019584419205784798,
+      "learning_rate": 3.609874216427925e-05,
+      "loss": 0.0015,
+      "step": 2050
+    },
+    {
+      "epoch": 0.697442710063102,
+      "grad_norm": 0.0007614546921104193,
+      "learning_rate": 3.591814009221462e-05,
+      "loss": 0.0201,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7140484888741282,
+      "grad_norm": 2.3300867080688477,
+      "learning_rate": 3.5737538020149984e-05,
+      "loss": 0.0249,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7306542676851544,
+      "grad_norm": 0.00295511307194829,
+      "learning_rate": 3.555693594808535e-05,
+      "loss": 0.0177,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7472600464961807,
+      "grad_norm": 0.0010525333927944303,
+      "learning_rate": 3.537633387602071e-05,
+      "loss": 0.0012,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7638658253072069,
+      "grad_norm": 0.0007724681054241955,
+      "learning_rate": 3.5195731803956074e-05,
+      "loss": 0.0206,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7804716041182331,
+      "grad_norm": 0.029788095504045486,
+      "learning_rate": 3.5015129731891445e-05,
+      "loss": 0.0077,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7970773829292593,
+      "grad_norm": 0.0016215493669733405,
+      "learning_rate": 3.483452765982681e-05,
+      "loss": 0.0278,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8136831617402857,
+      "grad_norm": 0.0008576202089898288,
+      "learning_rate": 3.465392558776217e-05,
+      "loss": 0.0001,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8302889405513119,
+      "grad_norm": 0.0007036550086922944,
+      "learning_rate": 3.4473323515697535e-05,
+      "loss": 0.0064,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8468947193623381,
+      "grad_norm": 0.007299103774130344,
+      "learning_rate": 3.4292721443632905e-05,
+      "loss": 0.0202,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8635004981733644,
+      "grad_norm": 0.004318034276366234,
+      "learning_rate": 3.411211937156826e-05,
+      "loss": 0.0239,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8801062769843906,
+      "grad_norm": 0.00030149793019518256,
+      "learning_rate": 3.393151729950363e-05,
+      "loss": 0.0011,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8967120557954168,
+      "grad_norm": 0.00011602124141063541,
+      "learning_rate": 3.3750915227438995e-05,
+      "loss": 0.0003,
+      "step": 2700
+    },
+    {
+      "epoch": 0.913317834606443,
+      "grad_norm": 0.00016940826026257128,
+      "learning_rate": 3.357031315537436e-05,
+      "loss": 0.0001,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9299236134174693,
+      "grad_norm": 0.00018533991533331573,
+      "learning_rate": 3.338971108330973e-05,
+      "loss": 0.0072,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9465293922284955,
+      "grad_norm": 0.007014571689069271,
+      "learning_rate": 3.320910901124509e-05,
+      "loss": 0.0072,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9631351710395217,
+      "grad_norm": 0.0949193611741066,
+      "learning_rate": 3.3028506939180456e-05,
+      "loss": 0.0021,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9797409498505479,
+      "grad_norm": 9.85204792022705,
+      "learning_rate": 3.284790486711582e-05,
+      "loss": 0.0314,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9963467286615743,
+      "grad_norm": 0.0009359152754768729,
+      "learning_rate": 3.266730279505119e-05,
+      "loss": 0.0169,
+      "step": 3000
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.997256679389313,
+      "eval_f1": 0.9972546509870224,
+      "eval_loss": 0.02280445024371147,
+      "eval_precision": 0.9972528075162768,
+      "eval_recall": 0.997256679389313,
+      "eval_runtime": 39.3237,
+      "eval_samples_per_second": 213.205,
+      "eval_steps_per_second": 13.325,
+      "step": 3011
+    },
+    {
+      "epoch": 1.0129525074726005,
+      "grad_norm": 0.001626566518098116,
+      "learning_rate": 3.2486700722986546e-05,
+      "loss": 0.0064,
+      "step": 3050
+    },
+    {
+      "epoch": 1.0295582862836268,
+      "grad_norm": 0.0007160088862292469,
+      "learning_rate": 3.2306098650921916e-05,
+      "loss": 0.0002,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0461640650946529,
+      "grad_norm": 0.0008541371207684278,
+      "learning_rate": 3.212549657885728e-05,
+      "loss": 0.0009,
+      "step": 3150
+    },
+    {
+      "epoch": 1.0627698439056792,
+      "grad_norm": 0.0005321013741195202,
+      "learning_rate": 3.194489450679265e-05,
+      "loss": 0.0059,
+      "step": 3200
+    },
+    {
+      "epoch": 1.0793756227167055,
+      "grad_norm": 0.005570960231125355,
+      "learning_rate": 3.176429243472801e-05,
+      "loss": 0.0233,
+      "step": 3250
+    },
+    {
+      "epoch": 1.0959814015277316,
+      "grad_norm": 0.0008483761921525002,
+      "learning_rate": 3.158369036266338e-05,
+      "loss": 0.0051,
+      "step": 3300
+    },
+    {
+      "epoch": 1.112587180338758,
+      "grad_norm": 0.26837238669395447,
+      "learning_rate": 3.140308829059874e-05,
+      "loss": 0.0002,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1291929591497842,
+      "grad_norm": 0.006303045898675919,
+      "learning_rate": 3.1222486218534104e-05,
+      "loss": 0.0009,
+      "step": 3400
+    },
+    {
+      "epoch": 1.1457987379608103,
+      "grad_norm": 0.0001649777841521427,
+      "learning_rate": 3.1041884146469474e-05,
+      "loss": 0.0145,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1624045167718366,
+      "grad_norm": 0.00047482753871008754,
+      "learning_rate": 3.086128207440484e-05,
+      "loss": 0.0002,
+      "step": 3500
+    },
+    {
+      "epoch": 1.1790102955828627,
+      "grad_norm": 0.0005254672723822296,
+      "learning_rate": 3.06806800023402e-05,
+      "loss": 0.0033,
+      "step": 3550
+    },
+    {
+      "epoch": 1.195616074393889,
+      "grad_norm": 0.003435322782024741,
+      "learning_rate": 3.0500077930275568e-05,
+      "loss": 0.0121,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2122218532049154,
+      "grad_norm": 0.0008208905346691608,
+      "learning_rate": 3.031947585821093e-05,
+      "loss": 0.0004,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2288276320159415,
+      "grad_norm": 0.0008073259959928691,
+      "learning_rate": 3.0138873786146294e-05,
+      "loss": 0.0001,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2454334108269678,
+      "grad_norm": 0.0022343341261148453,
+      "learning_rate": 2.995827171408166e-05,
+      "loss": 0.0132,
+      "step": 3750
+    },
+    {
+      "epoch": 1.2620391896379939,
+      "grad_norm": 0.003108004806563258,
+      "learning_rate": 2.9777669642017028e-05,
+      "loss": 0.0072,
+      "step": 3800
+    },
+    {
+      "epoch": 1.2786449684490202,
+      "grad_norm": 0.0003189484996255487,
+      "learning_rate": 2.9597067569952388e-05,
+      "loss": 0.0004,
+      "step": 3850
+    },
+    {
+      "epoch": 1.2952507472600465,
+      "grad_norm": 0.000715159869287163,
+      "learning_rate": 2.9416465497887755e-05,
+      "loss": 0.0001,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3118565260710726,
+      "grad_norm": 0.00012114120909245685,
+      "learning_rate": 2.9235863425823122e-05,
+      "loss": 0.0046,
+      "step": 3950
+    },
+    {
+      "epoch": 1.328462304882099,
+      "grad_norm": 0.0006089384551160038,
+      "learning_rate": 2.9055261353758482e-05,
+      "loss": 0.0181,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3450680836931252,
+      "grad_norm": 43.172584533691406,
+      "learning_rate": 2.887465928169385e-05,
+      "loss": 0.0177,
+      "step": 4050
+    },
+    {
+      "epoch": 1.3616738625041513,
+      "grad_norm": 0.000983994104899466,
+      "learning_rate": 2.8694057209629215e-05,
+      "loss": 0.0039,
+      "step": 4100
+    },
+    {
+      "epoch": 1.3782796413151777,
+      "grad_norm": 7.642904529348016e-05,
+      "learning_rate": 2.851345513756458e-05,
+      "loss": 0.0,
+      "step": 4150
+    },
+    {
+      "epoch": 1.394885420126204,
+      "grad_norm": 9.202577348332852e-05,
+      "learning_rate": 2.8332853065499946e-05,
+      "loss": 0.0018,
+      "step": 4200
+    },
+    {
+      "epoch": 1.41149119893723,
+      "grad_norm": 0.00020998790569137782,
+      "learning_rate": 2.8152250993435312e-05,
+      "loss": 0.0076,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4280969777482564,
+      "grad_norm": 0.0004194548528175801,
+      "learning_rate": 2.7971648921370673e-05,
+      "loss": 0.0028,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4447027565592827,
+      "grad_norm": 0.005269031506031752,
+      "learning_rate": 2.779104684930604e-05,
+      "loss": 0.0367,
+      "step": 4350
+    },
+    {
+      "epoch": 1.4613085353703088,
+      "grad_norm": 0.010580910369753838,
+      "learning_rate": 2.7610444777241406e-05,
+      "loss": 0.0058,
+      "step": 4400
+    },
+    {
+      "epoch": 1.4779143141813351,
+      "grad_norm": 0.019897054880857468,
+      "learning_rate": 2.7429842705176773e-05,
+      "loss": 0.0215,
+      "step": 4450
+    },
+    {
+      "epoch": 1.4945200929923614,
+      "grad_norm": 0.004522784613072872,
+      "learning_rate": 2.7249240633112133e-05,
+      "loss": 0.0048,
+      "step": 4500
+    },
+    {
+      "epoch": 1.5111258718033875,
+      "grad_norm": 0.0006406618049368262,
+      "learning_rate": 2.70686385610475e-05,
+      "loss": 0.0001,
+      "step": 4550
+    },
+    {
+      "epoch": 1.5277316506144138,
+      "grad_norm": 0.000602134910877794,
+      "learning_rate": 2.6888036488982867e-05,
+      "loss": 0.0035,
+      "step": 4600
+    },
+    {
+      "epoch": 1.5443374294254402,
+      "grad_norm": 0.000469192280434072,
+      "learning_rate": 2.670743441691823e-05,
+      "loss": 0.001,
+      "step": 4650
+    },
+    {
+      "epoch": 1.5609432082364663,
+      "grad_norm": 5.6851687986636534e-05,
+      "learning_rate": 2.6526832344853594e-05,
+      "loss": 0.0006,
+      "step": 4700
+    },
+    {
+      "epoch": 1.5775489870474926,
+      "grad_norm": 7.19124946044758e-05,
+      "learning_rate": 2.634623027278896e-05,
+      "loss": 0.0037,
+      "step": 4750
+    },
+    {
+      "epoch": 1.594154765858519,
+      "grad_norm": 7.235410885186866e-05,
+      "learning_rate": 2.6165628200724324e-05,
+      "loss": 0.0028,
+      "step": 4800
+    },
+    {
+      "epoch": 1.610760544669545,
+      "grad_norm": 0.000146635458804667,
+      "learning_rate": 2.598502612865969e-05,
+      "loss": 0.0055,
+      "step": 4850
+    },
+    {
+      "epoch": 1.627366323480571,
+      "grad_norm": 0.01404090877622366,
+      "learning_rate": 2.5804424056595057e-05,
+      "loss": 0.0186,
+      "step": 4900
+    },
+    {
+      "epoch": 1.6439721022915976,
+      "grad_norm": 0.00503704184666276,
+      "learning_rate": 2.5623821984530417e-05,
+      "loss": 0.0108,
+      "step": 4950
+    },
+    {
+      "epoch": 1.6605778811026237,
+      "grad_norm": 0.0004921660874970257,
+      "learning_rate": 2.5443219912465784e-05,
+      "loss": 0.0009,
+      "step": 5000
+    },
+    {
+      "epoch": 1.6771836599136498,
+      "grad_norm": 0.000255432038102299,
+      "learning_rate": 2.526261784040115e-05,
+      "loss": 0.0002,
+      "step": 5050
+    },
+    {
+      "epoch": 1.6937894387246761,
+      "grad_norm": 0.0001737813145155087,
+      "learning_rate": 2.508201576833651e-05,
+      "loss": 0.0007,
+      "step": 5100
+    },
+    {
+      "epoch": 1.7103952175357025,
+      "grad_norm": 4.374636773718521e-05,
+      "learning_rate": 2.4901413696271878e-05,
+      "loss": 0.0,
+      "step": 5150
+    },
+    {
+      "epoch": 1.7270009963467285,
+      "grad_norm": 0.00044321315363049507,
+      "learning_rate": 2.4720811624207245e-05,
+      "loss": 0.0095,
+      "step": 5200
+    },
+    {
+      "epoch": 1.7436067751577549,
+      "grad_norm": 0.00015154728316701949,
+      "learning_rate": 2.4540209552142608e-05,
+      "loss": 0.0165,
+      "step": 5250
+    },
+    {
+      "epoch": 1.7602125539687812,
+      "grad_norm": 6.615974416490644e-05,
+      "learning_rate": 2.4359607480077975e-05,
+      "loss": 0.0,
+      "step": 5300
+    },
+    {
+      "epoch": 1.7768183327798073,
+      "grad_norm": 0.0006762578268535435,
+      "learning_rate": 2.4179005408013342e-05,
+      "loss": 0.0137,
+      "step": 5350
+    },
+    {
+      "epoch": 1.7934241115908336,
+      "grad_norm": 0.00047573362826369703,
+      "learning_rate": 2.3998403335948702e-05,
+      "loss": 0.0,
+      "step": 5400
+    },
+    {
+      "epoch": 1.81002989040186,
+      "grad_norm": 0.0024170074611902237,
+      "learning_rate": 2.381780126388407e-05,
+      "loss": 0.0001,
+      "step": 5450
+    },
+    {
+      "epoch": 1.826635669212886,
+      "grad_norm": 0.00010628774907672778,
+      "learning_rate": 2.3637199191819436e-05,
+      "loss": 0.0061,
+      "step": 5500
+    },
+    {
+      "epoch": 1.8432414480239123,
+      "grad_norm": 0.0007334867841564119,
+      "learning_rate": 2.3456597119754796e-05,
+      "loss": 0.0,
+      "step": 5550
+    },
+    {
+      "epoch": 1.8598472268349386,
+      "grad_norm": 0.00021514961554203182,
+      "learning_rate": 2.3275995047690162e-05,
+      "loss": 0.0,
+      "step": 5600
+    },
+    {
+      "epoch": 1.8764530056459647,
+      "grad_norm": 0.0011384404497221112,
+      "learning_rate": 2.309539297562553e-05,
+      "loss": 0.0146,
+      "step": 5650
+    },
+    {
+      "epoch": 1.893058784456991,
+      "grad_norm": 0.00022749503841623664,
+      "learning_rate": 2.2914790903560896e-05,
+      "loss": 0.0001,
+      "step": 5700
+    },
+    {
+      "epoch": 1.9096645632680174,
+      "grad_norm": 0.00016585957200732082,
+      "learning_rate": 2.273418883149626e-05,
+      "loss": 0.0,
+      "step": 5750
+    },
+    {
+      "epoch": 1.9262703420790435,
+      "grad_norm": 0.0008972834912128747,
+      "learning_rate": 2.2553586759431623e-05,
+      "loss": 0.0052,
+      "step": 5800
+    },
+    {
+      "epoch": 1.9428761208900698,
+      "grad_norm": 0.00017760394257493317,
+      "learning_rate": 2.237298468736699e-05,
+      "loss": 0.0043,
+      "step": 5850
+    },
+    {
+      "epoch": 1.959481899701096,
+      "grad_norm": 0.2404995709657669,
+      "learning_rate": 2.2192382615302353e-05,
+      "loss": 0.0114,
+      "step": 5900
+    },
+    {
+      "epoch": 1.9760876785121222,
+      "grad_norm": 0.0012148089008405805,
+      "learning_rate": 2.201178054323772e-05,
+      "loss": 0.0001,
+      "step": 5950
+    },
+    {
+      "epoch": 1.9926934573231485,
+      "grad_norm": 0.04914182797074318,
+      "learning_rate": 2.1831178471173087e-05,
+      "loss": 0.0113,
+      "step": 6000
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.9973759541984732,
+      "eval_f1": 0.997344144336412,
+      "eval_loss": 0.021904850378632545,
+      "eval_precision": 0.997361916418535,
+      "eval_recall": 0.9973759541984732,
+      "eval_runtime": 37.5313,
+      "eval_samples_per_second": 223.387,
+      "eval_steps_per_second": 13.962,
+      "step": 6022
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 12044,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.282861088518144e+16,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/trial-7/checkpoint-6022/training_args.bin b/trial-7/checkpoint-6022/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fefe87215650a4566f77c3438918ef8ad7d881d
--- /dev/null
+++ b/trial-7/checkpoint-6022/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac8ed1456bca015299da067993fb69b9ff68148b43f14eb2eec1cb64894fdc05
+size 5368
diff --git a/trial-8/checkpoint-6022/config.json b/trial-8/checkpoint-6022/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00
--- /dev/null
+++ b/trial-8/checkpoint-6022/config.json
@@ -0,0 +1,47 @@
+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}
diff --git a/trial-8/checkpoint-6022/model.safetensors b/trial-8/checkpoint-6022/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..87796ccd0552c0f6a6f4c976e67fd85cf382603d
--- /dev/null
+++ b/trial-8/checkpoint-6022/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d7f3420381a8242245db48bb0ab8abf24564fb2045fe804c8ad857e91a85c91b
+size 598439784
diff --git a/trial-8/checkpoint-6022/optimizer.pt b/trial-8/checkpoint-6022/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f5525387755445c53176210db5136c0125908cc2
--- /dev/null
+++ b/trial-8/checkpoint-6022/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2dd6138aab64b3d396432c35ce4db532bfba1306b4f99159aaa0c6d362152374
+size 1196967418
diff --git a/trial-8/checkpoint-6022/rng_state.pth b/trial-8/checkpoint-6022/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b387de0c48181ec5812538ddf1fc60cfda1a89c1
--- /dev/null
+++ b/trial-8/checkpoint-6022/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
+size 14244
diff --git a/trial-8/checkpoint-6022/scheduler.pt b/trial-8/checkpoint-6022/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..cf460763ee7d22e479fe60bc5a6cf8fc3d9894b6
--- /dev/null
+++ b/trial-8/checkpoint-6022/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bbc92a16e2b5e4d8e3fa3b973f2760dc5f362ac0ce5d49f2d1336359c23db225
+size 1064
diff --git a/trial-8/checkpoint-6022/trainer_state.json b/trial-8/checkpoint-6022/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..02f21018346f9318a8b2121adaa18a08f0f4acb4
--- /dev/null
+++ b/trial-8/checkpoint-6022/trainer_state.json
@@ -0,0 +1,897 @@
+{
+  "best_metric": 0.016700224950909615,
+  "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-8/checkpoint-6022",
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 6022,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016605778811026237,
+      "grad_norm": 7.783297538757324,
+      "learning_rate": 1.75347410056435e-05,
+      "loss": 0.3917,
+      "step": 50
+    },
+    {
+      "epoch": 0.033211557622052475,
+      "grad_norm": 9.493297576904297,
+      "learning_rate": 1.74763113455114e-05,
+      "loss": 0.2198,
+      "step": 100
+    },
+    {
+      "epoch": 0.04981733643307871,
+      "grad_norm": 14.9419527053833,
+      "learning_rate": 1.7417881685379297e-05,
+      "loss": 0.1591,
+      "step": 150
+    },
+    {
+      "epoch": 0.06642311524410495,
+      "grad_norm": 12.10505199432373,
+      "learning_rate": 1.73594520252472e-05,
+      "loss": 0.1272,
+      "step": 200
+    },
+    {
+      "epoch": 0.08302889405513118,
+      "grad_norm": 0.3676553964614868,
+      "learning_rate": 1.7301022365115096e-05,
+      "loss": 0.0714,
+      "step": 250
+    },
+    {
+      "epoch": 0.09963467286615742,
+      "grad_norm": 0.059806231409311295,
+      "learning_rate": 1.7242592704982996e-05,
+      "loss": 0.0855,
+      "step": 300
+    },
+    {
+      "epoch": 0.11624045167718366,
+      "grad_norm": 0.017794443294405937,
+      "learning_rate": 1.7184163044850895e-05,
+      "loss": 0.0436,
+      "step": 350
+    },
+    {
+      "epoch": 0.1328462304882099,
+      "grad_norm": 5.694874286651611,
+      "learning_rate": 1.7125733384718795e-05,
+      "loss": 0.0805,
+      "step": 400
+    },
+    {
+      "epoch": 0.14945200929923613,
+      "grad_norm": 0.31493690609931946,
+      "learning_rate": 1.706730372458669e-05,
+      "loss": 0.0266,
+      "step": 450
+    },
+    {
+      "epoch": 0.16605778811026237,
+      "grad_norm": 3.3532514572143555,
+      "learning_rate": 1.700887406445459e-05,
+      "loss": 0.0457,
+      "step": 500
+    },
+    {
+      "epoch": 0.1826635669212886,
+      "grad_norm": 0.006790875922888517,
+      "learning_rate": 1.6950444404322493e-05,
+      "loss": 0.0178,
+      "step": 550
+    },
+    {
+      "epoch": 0.19926934573231483,
+      "grad_norm": 0.23334099352359772,
+      "learning_rate": 1.689201474419039e-05,
+      "loss": 0.0218,
+      "step": 600
+    },
+    {
+      "epoch": 0.2158751245433411,
+      "grad_norm": 0.07603476941585541,
+      "learning_rate": 1.683358508405829e-05,
+      "loss": 0.0273,
+      "step": 650
+    },
+    {
+      "epoch": 0.23248090335436733,
+      "grad_norm": 30.228376388549805,
+      "learning_rate": 1.677515542392619e-05,
+      "loss": 0.045,
+      "step": 700
+    },
+    {
+      "epoch": 0.24908668216539356,
+      "grad_norm": 0.00013399416639003903,
+      "learning_rate": 1.6716725763794088e-05,
+      "loss": 0.0442,
+      "step": 750
+    },
+    {
+      "epoch": 0.2656924609764198,
+      "grad_norm": 0.0007570835296064615,
+      "learning_rate": 1.6658296103661984e-05,
+      "loss": 0.0352,
+      "step": 800
+    },
+    {
+      "epoch": 0.282298239787446,
+      "grad_norm": 6.466372013092041,
+      "learning_rate": 1.6599866443529884e-05,
+      "loss": 0.057,
+      "step": 850
+    },
+    {
+      "epoch": 0.29890401859847227,
+      "grad_norm": 0.04902864992618561,
+      "learning_rate": 1.6541436783397787e-05,
+      "loss": 0.0282,
+      "step": 900
+    },
+    {
+      "epoch": 0.3155097974094985,
+      "grad_norm": 1.3140225410461426,
+      "learning_rate": 1.6483007123265683e-05,
+      "loss": 0.0128,
+      "step": 950
+    },
+    {
+      "epoch": 0.33211557622052473,
+      "grad_norm": 0.00038628041511401534,
+      "learning_rate": 1.6424577463133583e-05,
+      "loss": 0.0373,
+      "step": 1000
+    },
+    {
+      "epoch": 0.348721355031551,
+      "grad_norm": 0.0020163152366876602,
+      "learning_rate": 1.6366147803001482e-05,
+      "loss": 0.0281,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3653271338425772,
+      "grad_norm": 0.005716539453715086,
+      "learning_rate": 1.630771814286938e-05,
+      "loss": 0.0178,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38193291265360346,
+      "grad_norm": 0.0029005147516727448,
+      "learning_rate": 1.6249288482737278e-05,
+      "loss": 0.0028,
+      "step": 1150
+    },
+    {
+      "epoch": 0.39853869146462967,
+      "grad_norm": 0.0008354082820005715,
+      "learning_rate": 1.6190858822605177e-05,
+      "loss": 0.0128,
+      "step": 1200
+    },
+    {
+      "epoch": 0.41514447027565593,
+      "grad_norm": 0.04189394786953926,
+      "learning_rate": 1.613242916247308e-05,
+      "loss": 0.0358,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4317502490866822,
+      "grad_norm": 0.30329668521881104,
+      "learning_rate": 1.6073999502340976e-05,
+      "loss": 0.0496,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4483560278977084,
+      "grad_norm": 2.6213114261627197,
+      "learning_rate": 1.6015569842208876e-05,
+      "loss": 0.0296,
+      "step": 1350
+    },
+    {
+      "epoch": 0.46496180670873466,
+      "grad_norm": 0.005221163388341665,
+      "learning_rate": 1.5957140182076775e-05,
+      "loss": 0.0057,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48156758551976087,
+      "grad_norm": 0.0011331464629620314,
+      "learning_rate": 1.5898710521944675e-05,
+      "loss": 0.0182,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4981733643307871,
+      "grad_norm": 0.003077897010371089,
+      "learning_rate": 1.584028086181257e-05,
+      "loss": 0.0163,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5147791431418134,
+      "grad_norm": 0.001127161318436265,
+      "learning_rate": 1.578185120168047e-05,
+      "loss": 0.008,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5313849219528396,
+      "grad_norm": 20.000713348388672,
+      "learning_rate": 1.572342154154837e-05,
+      "loss": 0.0277,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5479907007638658,
+      "grad_norm": 0.0031673621851950884,
+      "learning_rate": 1.566499188141627e-05,
+      "loss": 0.0295,
+      "step": 1650
+    },
+    {
+      "epoch": 0.564596479574892,
+      "grad_norm": 0.1044340580701828,
+      "learning_rate": 1.560656222128417e-05,
+      "loss": 0.0162,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5812022583859183,
+      "grad_norm": 0.002000702079385519,
+      "learning_rate": 1.554813256115207e-05,
+      "loss": 0.0047,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5978080371969445,
+      "grad_norm": 0.014721410349011421,
+      "learning_rate": 1.548970290101997e-05,
+      "loss": 0.0164,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6144138160079707,
+      "grad_norm": 0.00020889069128315896,
+      "learning_rate": 1.5431273240887864e-05,
+      "loss": 0.0065,
+      "step": 1850
+    },
+    {
+      "epoch": 0.631019594818997,
+      "grad_norm": 0.000981863122433424,
+      "learning_rate": 1.5372843580755764e-05,
+      "loss": 0.0104,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6476253736300233,
+      "grad_norm": 0.00036494643427431583,
+      "learning_rate": 1.5314413920623664e-05,
+      "loss": 0.0078,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6642311524410495,
+      "grad_norm": 0.00018712542077992111,
+      "learning_rate": 1.5255984260491563e-05,
+      "loss": 0.0147,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6808369312520757,
+      "grad_norm": 0.00041754008270800114,
+      "learning_rate": 1.5197554600359463e-05,
+      "loss": 0.0056,
+      "step": 2050
+    },
+    {
+      "epoch": 0.697442710063102,
+      "grad_norm": 0.0007413614075630903,
+      "learning_rate": 1.513912494022736e-05,
+      "loss": 0.0218,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7140484888741282,
+      "grad_norm": 0.17359164357185364,
+      "learning_rate": 1.508069528009526e-05,
+      "loss": 0.0087,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7306542676851544,
+      "grad_norm": 0.0031391121447086334,
+      "learning_rate": 1.5022265619963158e-05,
+      "loss": 0.0091,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7472600464961807,
+      "grad_norm": 0.0002511175407562405,
+      "learning_rate": 1.4963835959831057e-05,
+      "loss": 0.0085,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7638658253072069,
+      "grad_norm": 0.0005632131360471249,
+      "learning_rate": 1.4905406299698959e-05,
+      "loss": 0.0134,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7804716041182331,
+      "grad_norm": 0.020430119708180428,
+      "learning_rate": 1.4846976639566856e-05,
+      "loss": 0.0123,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7970773829292593,
+      "grad_norm": 0.002410025568678975,
+      "learning_rate": 1.4788546979434756e-05,
+      "loss": 0.0165,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8136831617402857,
+      "grad_norm": 0.0005655758432112634,
+      "learning_rate": 1.4730117319302654e-05,
+      "loss": 0.0001,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8302889405513119,
+      "grad_norm": 0.0001908275589812547,
+      "learning_rate": 1.4671687659170553e-05,
+      "loss": 0.0031,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8468947193623381,
+      "grad_norm": 0.007557071279734373,
+      "learning_rate": 1.4613257999038451e-05,
+      "loss": 0.011,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8635004981733644,
+      "grad_norm": 0.003107853000983596,
+      "learning_rate": 1.455482833890635e-05,
+      "loss": 0.0036,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8801062769843906,
+      "grad_norm": 0.00015464833995793015,
+      "learning_rate": 1.4496398678774252e-05,
+      "loss": 0.0187,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8967120557954168,
+      "grad_norm": 0.0004067471018061042,
+      "learning_rate": 1.443796901864215e-05,
+      "loss": 0.0003,
+      "step": 2700
+    },
+    {
+      "epoch": 0.913317834606443,
+      "grad_norm": 0.002043587388470769,
+      "learning_rate": 1.437953935851005e-05,
+      "loss": 0.0067,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9299236134174693,
+      "grad_norm": 4.698836164607201e-06,
+      "learning_rate": 1.4321109698377947e-05,
+      "loss": 0.0027,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9465293922284955,
+      "grad_norm": 0.004311679396778345,
+      "learning_rate": 1.4262680038245847e-05,
+      "loss": 0.0099,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9631351710395217,
+      "grad_norm": 16.798845291137695,
+      "learning_rate": 1.4204250378113745e-05,
+      "loss": 0.0056,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9797409498505479,
+      "grad_norm": 9.97236156463623,
+      "learning_rate": 1.4145820717981646e-05,
+      "loss": 0.0408,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9963467286615743,
+      "grad_norm": 0.0034861546009778976,
+      "learning_rate": 1.4087391057849545e-05,
+      "loss": 0.0111,
+      "step": 3000
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.9958253816793893,
+      "eval_f1": 0.9958527220108943,
+      "eval_loss": 0.02235870435833931,
+      "eval_precision": 0.9958943559410505,
+      "eval_recall": 0.9958253816793893,
+      "eval_runtime": 37.1995,
+      "eval_samples_per_second": 225.379,
+      "eval_steps_per_second": 14.086,
+      "step": 3011
+    },
+    {
+      "epoch": 1.0129525074726005,
+      "grad_norm": 0.0027454651426523924,
+      "learning_rate": 1.4028961397717443e-05,
+      "loss": 0.0031,
+      "step": 3050
+    },
+    {
+      "epoch": 1.0295582862836268,
+      "grad_norm": 0.00027677303296513855,
+      "learning_rate": 1.3970531737585343e-05,
+      "loss": 0.0,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0461640650946529,
+      "grad_norm": 0.00034389185020700097,
+      "learning_rate": 1.391210207745324e-05,
+      "loss": 0.0,
+      "step": 3150
+    },
+    {
+      "epoch": 1.0627698439056792,
+      "grad_norm": 0.0010943470988422632,
+      "learning_rate": 1.385367241732114e-05,
+      "loss": 0.0119,
+      "step": 3200
+    },
+    {
+      "epoch": 1.0793756227167055,
+      "grad_norm": 0.007589911110699177,
+      "learning_rate": 1.3795242757189038e-05,
+      "loss": 0.0177,
+      "step": 3250
+    },
+    {
+      "epoch": 1.0959814015277316,
+      "grad_norm": 0.00021936999110039324,
+      "learning_rate": 1.3736813097056939e-05,
+      "loss": 0.0058,
+      "step": 3300
+    },
+    {
+      "epoch": 1.112587180338758,
+      "grad_norm": 0.005090941209346056,
+      "learning_rate": 1.3678383436924837e-05,
+      "loss": 0.0147,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1291929591497842,
+      "grad_norm": 0.0033587052021175623,
+      "learning_rate": 1.3619953776792737e-05,
+      "loss": 0.0022,
+      "step": 3400
+    },
+    {
+      "epoch": 1.1457987379608103,
+      "grad_norm": 3.696617568493821e-05,
+      "learning_rate": 1.3561524116660634e-05,
+      "loss": 0.0001,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1624045167718366,
+      "grad_norm": 0.00012033848179271445,
+      "learning_rate": 1.3503094456528534e-05,
+      "loss": 0.0158,
+      "step": 3500
+    },
+    {
+      "epoch": 1.1790102955828627,
+      "grad_norm": 0.0007672170177102089,
+      "learning_rate": 1.3444664796396433e-05,
+      "loss": 0.0001,
+      "step": 3550
+    },
+    {
+      "epoch": 1.195616074393889,
+      "grad_norm": 0.0013132853200659156,
+      "learning_rate": 1.3386235136264331e-05,
+      "loss": 0.0052,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2122218532049154,
+      "grad_norm": 0.0014574166852980852,
+      "learning_rate": 1.3327805476132233e-05,
+      "loss": 0.0007,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2288276320159415,
+      "grad_norm": 0.0005620878073386848,
+      "learning_rate": 1.326937581600013e-05,
+      "loss": 0.0,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2454334108269678,
+      "grad_norm": 0.000902441912330687,
+      "learning_rate": 1.321094615586803e-05,
+      "loss": 0.0118,
+      "step": 3750
+    },
+    {
+      "epoch": 1.2620391896379939,
+      "grad_norm": 0.0004086974367965013,
+      "learning_rate": 1.3152516495735928e-05,
+      "loss": 0.0,
+      "step": 3800
+    },
+    {
+      "epoch": 1.2786449684490202,
+      "grad_norm": 0.00640166224911809,
+      "learning_rate": 1.3094086835603827e-05,
+      "loss": 0.0,
+      "step": 3850
+    },
+    {
+      "epoch": 1.2952507472600465,
+      "grad_norm": 0.00021929937065578997,
+      "learning_rate": 1.3035657175471725e-05,
+      "loss": 0.0045,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3118565260710726,
+      "grad_norm": 0.0003949702368117869,
+      "learning_rate": 1.2977227515339625e-05,
+      "loss": 0.0,
+      "step": 3950
+    },
+    {
+      "epoch": 1.328462304882099,
+      "grad_norm": 0.00032811236451379955,
+      "learning_rate": 1.2918797855207526e-05,
+      "loss": 0.0004,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3450680836931252,
+      "grad_norm": 0.0013841086765751243,
+      "learning_rate": 1.2860368195075424e-05,
+      "loss": 0.0,
+      "step": 4050
+    },
+    {
+      "epoch": 1.3616738625041513,
+      "grad_norm": 0.002687544096261263,
+      "learning_rate": 1.2801938534943323e-05,
+      "loss": 0.0061,
+      "step": 4100
+    },
+    {
+      "epoch": 1.3782796413151777,
+      "grad_norm": 0.0017038496444001794,
+      "learning_rate": 1.2743508874811221e-05,
+      "loss": 0.0163,
+      "step": 4150
+    },
+    {
+      "epoch": 1.394885420126204,
+      "grad_norm": 3.813268995145336e-05,
+      "learning_rate": 1.268507921467912e-05,
+      "loss": 0.0001,
+      "step": 4200
+    },
+    {
+      "epoch": 1.41149119893723,
+      "grad_norm": 5.391587546910159e-05,
+      "learning_rate": 1.2626649554547018e-05,
+      "loss": 0.0,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4280969777482564,
+      "grad_norm": 0.0006178281037136912,
+      "learning_rate": 1.2568219894414918e-05,
+      "loss": 0.0069,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4447027565592827,
+      "grad_norm": 0.0003027453494723886,
+      "learning_rate": 1.250979023428282e-05,
+      "loss": 0.0084,
+      "step": 4350
+    },
+    {
+      "epoch": 1.4613085353703088,
+      "grad_norm": 0.0006302434485405684,
+      "learning_rate": 1.2451360574150717e-05,
+      "loss": 0.0006,
+      "step": 4400
+    },
+    {
+      "epoch": 1.4779143141813351,
+      "grad_norm": 0.0009224305395036936,
+      "learning_rate": 1.2392930914018617e-05,
+      "loss": 0.0128,
+      "step": 4450
+    },
+    {
+      "epoch": 1.4945200929923614,
+      "grad_norm": 0.006426838226616383,
+      "learning_rate": 1.2334501253886514e-05,
+      "loss": 0.0001,
+      "step": 4500
+    },
+    {
+      "epoch": 1.5111258718033875,
+      "grad_norm": 0.00010298648703610525,
+      "learning_rate": 1.2276071593754414e-05,
+      "loss": 0.0,
+      "step": 4550
+    },
+    {
+      "epoch": 1.5277316506144138,
+      "grad_norm": 0.00034292592317797244,
+      "learning_rate": 1.2217641933622312e-05,
+      "loss": 0.0003,
+      "step": 4600
+    },
+    {
+      "epoch": 1.5443374294254402,
+      "grad_norm": 0.0020624478347599506,
+      "learning_rate": 1.2159212273490211e-05,
+      "loss": 0.0001,
+      "step": 4650
+    },
+    {
+      "epoch": 1.5609432082364663,
+      "grad_norm": 0.0019984941463917494,
+      "learning_rate": 1.2100782613358111e-05,
+      "loss": 0.0,
+      "step": 4700
+    },
+    {
+      "epoch": 1.5775489870474926,
+      "grad_norm": 0.00010821606701938435,
+      "learning_rate": 1.204235295322601e-05,
+      "loss": 0.0004,
+      "step": 4750
+    },
+    {
+      "epoch": 1.594154765858519,
+      "grad_norm": 2.993629277625587e-05,
+      "learning_rate": 1.198392329309391e-05,
+      "loss": 0.0001,
+      "step": 4800
+    },
+    {
+      "epoch": 1.610760544669545,
+      "grad_norm": 1.737935235723853e-05,
+      "learning_rate": 1.1925493632961808e-05,
+      "loss": 0.0045,
+      "step": 4850
+    },
+    {
+      "epoch": 1.627366323480571,
+      "grad_norm": 0.0006885113543830812,
+      "learning_rate": 1.1867063972829707e-05,
+      "loss": 0.0243,
+      "step": 4900
+    },
+    {
+      "epoch": 1.6439721022915976,
+      "grad_norm": 0.0001697670086286962,
+      "learning_rate": 1.1808634312697605e-05,
+      "loss": 0.0009,
+      "step": 4950
+    },
+    {
+      "epoch": 1.6605778811026237,
+      "grad_norm": 0.00014473537157755345,
+      "learning_rate": 1.1750204652565505e-05,
+      "loss": 0.0155,
+      "step": 5000
+    },
+    {
+      "epoch": 1.6771836599136498,
+      "grad_norm": 2.826682793966029e-05,
+      "learning_rate": 1.1691774992433404e-05,
+      "loss": 0.0,
+      "step": 5050
+    },
+    {
+      "epoch": 1.6937894387246761,
+      "grad_norm": 4.407271626405418e-05,
+      "learning_rate": 1.1633345332301304e-05,
+      "loss": 0.0,
+      "step": 5100
+    },
+    {
+      "epoch": 1.7103952175357025,
+      "grad_norm": 0.0044469875283539295,
+      "learning_rate": 1.1574915672169202e-05,
+      "loss": 0.003,
+      "step": 5150
+    },
+    {
+      "epoch": 1.7270009963467285,
+      "grad_norm": 2.422453326289542e-05,
+      "learning_rate": 1.1516486012037101e-05,
+      "loss": 0.0114,
+      "step": 5200
+    },
+    {
+      "epoch": 1.7436067751577549,
+      "grad_norm": 2.4912773369578645e-05,
+      "learning_rate": 1.1458056351905e-05,
+      "loss": 0.0,
+      "step": 5250
+    },
+    {
+      "epoch": 1.7602125539687812,
+      "grad_norm": 0.0004292759404052049,
+      "learning_rate": 1.1399626691772899e-05,
+      "loss": 0.0041,
+      "step": 5300
+    },
+    {
+      "epoch": 1.7768183327798073,
+      "grad_norm": 7.934037208557129,
+      "learning_rate": 1.1341197031640798e-05,
+      "loss": 0.0265,
+      "step": 5350
+    },
+    {
+      "epoch": 1.7934241115908336,
+      "grad_norm": 0.0004573618352878839,
+      "learning_rate": 1.1282767371508698e-05,
+      "loss": 0.0,
+      "step": 5400
+    },
+    {
+      "epoch": 1.81002989040186,
+      "grad_norm": 0.00449636485427618,
+      "learning_rate": 1.1224337711376597e-05,
+      "loss": 0.0,
+      "step": 5450
+    },
+    {
+      "epoch": 1.826635669212886,
+      "grad_norm": 0.00016635288193356246,
+      "learning_rate": 1.1165908051244495e-05,
+      "loss": 0.0001,
+      "step": 5500
+    },
+    {
+      "epoch": 1.8432414480239123,
+      "grad_norm": 0.0066105956211686134,
+      "learning_rate": 1.1107478391112395e-05,
+      "loss": 0.0046,
+      "step": 5550
+    },
+    {
+      "epoch": 1.8598472268349386,
+      "grad_norm": 6.935372948646545e-05,
+      "learning_rate": 1.1049048730980292e-05,
+      "loss": 0.0,
+      "step": 5600
+    },
+    {
+      "epoch": 1.8764530056459647,
+      "grad_norm": 0.0036935850512236357,
+      "learning_rate": 1.0990619070848192e-05,
+      "loss": 0.0135,
+      "step": 5650
+    },
+    {
+      "epoch": 1.893058784456991,
+      "grad_norm": 0.002138146897777915,
+      "learning_rate": 1.0932189410716091e-05,
+      "loss": 0.0087,
+      "step": 5700
+    },
+    {
+      "epoch": 1.9096645632680174,
+      "grad_norm": 0.00018446841568220407,
+      "learning_rate": 1.0873759750583991e-05,
+      "loss": 0.0,
+      "step": 5750
+    },
+    {
+      "epoch": 1.9262703420790435,
+      "grad_norm": 0.0010917658219113946,
+      "learning_rate": 1.081533009045189e-05,
+      "loss": 0.0,
+      "step": 5800
+    },
+    {
+      "epoch": 1.9428761208900698,
+      "grad_norm": 0.0001402223715558648,
+      "learning_rate": 1.0756900430319788e-05,
+      "loss": 0.0009,
+      "step": 5850
+    },
+    {
+      "epoch": 1.959481899701096,
+      "grad_norm": 0.0169499684125185,
+      "learning_rate": 1.0698470770187688e-05,
+      "loss": 0.0388,
+      "step": 5900
+    },
+    {
+      "epoch": 1.9760876785121222,
+      "grad_norm": 0.0036119220312684774,
+      "learning_rate": 1.0640041110055586e-05,
+      "loss": 0.0071,
+      "step": 5950
+    },
+    {
+      "epoch": 1.9926934573231485,
+      "grad_norm": 0.0004526925040408969,
+      "learning_rate": 1.0581611449923485e-05,
+      "loss": 0.008,
+      "step": 6000
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.9970181297709924,
+      "eval_f1": 0.9970025459787301,
+      "eval_loss": 0.016700224950909615,
+      "eval_precision": 0.9969961560397819,
+      "eval_recall": 0.9970181297709924,
+      "eval_runtime": 36.5623,
+      "eval_samples_per_second": 229.307,
+      "eval_steps_per_second": 14.332,
+      "step": 6022
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 15055,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.282861088518144e+16,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/trial-8/checkpoint-6022/training_args.bin b/trial-8/checkpoint-6022/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..38b919e408e37c8659bd156ac7debfa744d1306f
--- /dev/null
+++ b/trial-8/checkpoint-6022/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67c4a841d9791e131313053f2277167e82e39e63ce8d395fd78c60884646dc0f
+size 5368
diff --git a/trial-9/checkpoint-3012/config.json b/trial-9/checkpoint-3012/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00
--- /dev/null
+++ b/trial-9/checkpoint-3012/config.json
@@ -0,0 +1,47 @@
+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}
diff --git a/trial-9/checkpoint-3012/model.safetensors b/trial-9/checkpoint-3012/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..344849614bcd8ca99397f5076f96a4d3a5861441
--- /dev/null
+++ b/trial-9/checkpoint-3012/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dbdaa19d83b8a2e040bb399bf1c9efb459a15e8429ba43cbf85ed958325c8a8a
+size 598439784
diff --git a/trial-9/checkpoint-3012/optimizer.pt b/trial-9/checkpoint-3012/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..411534b735c0ef326b49041bee5352a531831675
--- /dev/null
+++ b/trial-9/checkpoint-3012/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c0340932c9aad23f7fcb0392b8fde450fb7ced7808a6df9095d7314b759b59d
+size 1196967418
diff --git a/trial-9/checkpoint-3012/rng_state.pth b/trial-9/checkpoint-3012/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b387de0c48181ec5812538ddf1fc60cfda1a89c1
--- /dev/null
+++ b/trial-9/checkpoint-3012/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
+size 14244
diff --git a/trial-9/checkpoint-3012/scheduler.pt b/trial-9/checkpoint-3012/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..83a710614683b29a7a852f855b46ce0871810214
--- /dev/null
+++ b/trial-9/checkpoint-3012/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0a01c4c544a9920a83cf6474f63ae7bb57f7d77551e95df7c0ed175573c04db
+size 1064
diff --git a/trial-9/checkpoint-3012/trainer_state.json b/trial-9/checkpoint-3012/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7a8719291430b7e3922103d2d4fdfe4c33385420
--- /dev/null
+++ b/trial-9/checkpoint-3012/trainer_state.json
@@ -0,0 +1,477 @@
+{
+  "best_metric": 0.013810121454298496,
+  "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-9/checkpoint-3012",
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 3012,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.033200531208499334,
+      "grad_norm": 3.6277902126312256,
+      "learning_rate": 4.223876761987849e-05,
+      "loss": 0.2681,
+      "step": 50
+    },
+    {
+      "epoch": 0.06640106241699867,
+      "grad_norm": 0.3113747537136078,
+      "learning_rate": 4.2037477267134116e-05,
+      "loss": 0.0958,
+      "step": 100
+    },
+    {
+      "epoch": 0.099601593625498,
+      "grad_norm": 0.18985196948051453,
+      "learning_rate": 4.183618691438975e-05,
+      "loss": 0.0636,
+      "step": 150
+    },
+    {
+      "epoch": 0.13280212483399734,
+      "grad_norm": 0.8797281980514526,
+      "learning_rate": 4.163489656164538e-05,
+      "loss": 0.0562,
+      "step": 200
+    },
+    {
+      "epoch": 0.16600265604249667,
+      "grad_norm": 1.9831819534301758,
+      "learning_rate": 4.143360620890101e-05,
+      "loss": 0.047,
+      "step": 250
+    },
+    {
+      "epoch": 0.199203187250996,
+      "grad_norm": 3.341094732284546,
+      "learning_rate": 4.123231585615664e-05,
+      "loss": 0.0346,
+      "step": 300
+    },
+    {
+      "epoch": 0.23240371845949534,
+      "grad_norm": 0.048162225633859634,
+      "learning_rate": 4.103102550341227e-05,
+      "loss": 0.0457,
+      "step": 350
+    },
+    {
+      "epoch": 0.2656042496679947,
+      "grad_norm": 0.0644642785191536,
+      "learning_rate": 4.08297351506679e-05,
+      "loss": 0.0391,
+      "step": 400
+    },
+    {
+      "epoch": 0.29880478087649404,
+      "grad_norm": 6.679907321929932,
+      "learning_rate": 4.0628444797923535e-05,
+      "loss": 0.0346,
+      "step": 450
+    },
+    {
+      "epoch": 0.33200531208499334,
+      "grad_norm": 0.01181253232061863,
+      "learning_rate": 4.0427154445179164e-05,
+      "loss": 0.0103,
+      "step": 500
+    },
+    {
+      "epoch": 0.3652058432934927,
+      "grad_norm": 0.06453288346529007,
+      "learning_rate": 4.022586409243479e-05,
+      "loss": 0.0292,
+      "step": 550
+    },
+    {
+      "epoch": 0.398406374501992,
+      "grad_norm": 0.011014764197170734,
+      "learning_rate": 4.002457373969042e-05,
+      "loss": 0.0101,
+      "step": 600
+    },
+    {
+      "epoch": 0.4316069057104914,
+      "grad_norm": 0.44575539231300354,
+      "learning_rate": 3.982328338694605e-05,
+      "loss": 0.0187,
+      "step": 650
+    },
+    {
+      "epoch": 0.4648074369189907,
+      "grad_norm": 0.27992862462997437,
+      "learning_rate": 3.962199303420168e-05,
+      "loss": 0.0196,
+      "step": 700
+    },
+    {
+      "epoch": 0.49800796812749004,
+      "grad_norm": 0.003195864148437977,
+      "learning_rate": 3.942070268145732e-05,
+      "loss": 0.0266,
+      "step": 750
+    },
+    {
+      "epoch": 0.5312084993359893,
+      "grad_norm": 5.236836910247803,
+      "learning_rate": 3.921941232871295e-05,
+      "loss": 0.0128,
+      "step": 800
+    },
+    {
+      "epoch": 0.5644090305444888,
+      "grad_norm": 0.6897503137588501,
+      "learning_rate": 3.9018121975968576e-05,
+      "loss": 0.0163,
+      "step": 850
+    },
+    {
+      "epoch": 0.5976095617529881,
+      "grad_norm": 0.07702745497226715,
+      "learning_rate": 3.881683162322421e-05,
+      "loss": 0.0196,
+      "step": 900
+    },
+    {
+      "epoch": 0.6308100929614874,
+      "grad_norm": 0.00853784941136837,
+      "learning_rate": 3.8615541270479835e-05,
+      "loss": 0.0207,
+      "step": 950
+    },
+    {
+      "epoch": 0.6640106241699867,
+      "grad_norm": 0.1736297905445099,
+      "learning_rate": 3.841425091773547e-05,
+      "loss": 0.0219,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6972111553784861,
+      "grad_norm": 0.07749740034341812,
+      "learning_rate": 3.82129605649911e-05,
+      "loss": 0.0158,
+      "step": 1050
+    },
+    {
+      "epoch": 0.7304116865869854,
+      "grad_norm": 0.0033312628511339426,
+      "learning_rate": 3.801167021224673e-05,
+      "loss": 0.0133,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7636122177954847,
+      "grad_norm": 0.003007786348462105,
+      "learning_rate": 3.781037985950236e-05,
+      "loss": 0.0143,
+      "step": 1150
+    },
+    {
+      "epoch": 0.796812749003984,
+      "grad_norm": 3.926494598388672,
+      "learning_rate": 3.760908950675799e-05,
+      "loss": 0.0113,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8300132802124834,
+      "grad_norm": 0.0027299339417368174,
+      "learning_rate": 3.740779915401362e-05,
+      "loss": 0.0013,
+      "step": 1250
+    },
+    {
+      "epoch": 0.8632138114209827,
+      "grad_norm": 0.013163665309548378,
+      "learning_rate": 3.7206508801269253e-05,
+      "loss": 0.0109,
+      "step": 1300
+    },
+    {
+      "epoch": 0.896414342629482,
+      "grad_norm": 0.0006267031421884894,
+      "learning_rate": 3.700521844852488e-05,
+      "loss": 0.0034,
+      "step": 1350
+    },
+    {
+      "epoch": 0.9296148738379814,
+      "grad_norm": 0.00010993422620231286,
+      "learning_rate": 3.680392809578051e-05,
+      "loss": 0.0013,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9628154050464808,
+      "grad_norm": 1.569828987121582,
+      "learning_rate": 3.660263774303614e-05,
+      "loss": 0.0077,
+      "step": 1450
+    },
+    {
+      "epoch": 0.9960159362549801,
+      "grad_norm": 0.01317554246634245,
+      "learning_rate": 3.640134739029177e-05,
+      "loss": 0.0144,
+      "step": 1500
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.9965410305343512,
+      "eval_f1": 0.9965177127714778,
+      "eval_loss": 0.025706786662340164,
+      "eval_precision": 0.9965095878718089,
+      "eval_recall": 0.9965410305343512,
+      "eval_runtime": 31.4201,
+      "eval_samples_per_second": 266.836,
+      "eval_steps_per_second": 8.339,
+      "step": 1506
+    },
+    {
+      "epoch": 1.0292164674634794,
+      "grad_norm": 0.40110042691230774,
+      "learning_rate": 3.620005703754741e-05,
+      "loss": 0.0002,
+      "step": 1550
+    },
+    {
+      "epoch": 1.0624169986719787,
+      "grad_norm": 0.0003720091190189123,
+      "learning_rate": 3.5998766684803036e-05,
+      "loss": 0.0035,
+      "step": 1600
+    },
+    {
+      "epoch": 1.095617529880478,
+      "grad_norm": 0.0027410192415118217,
+      "learning_rate": 3.5797476332058666e-05,
+      "loss": 0.0122,
+      "step": 1650
+    },
+    {
+      "epoch": 1.1288180610889773,
+      "grad_norm": 0.010054959915578365,
+      "learning_rate": 3.5596185979314295e-05,
+      "loss": 0.0034,
+      "step": 1700
+    },
+    {
+      "epoch": 1.1620185922974768,
+      "grad_norm": 0.013796687126159668,
+      "learning_rate": 3.539489562656993e-05,
+      "loss": 0.0067,
+      "step": 1750
+    },
+    {
+      "epoch": 1.1952191235059761,
+      "grad_norm": 1.8211485147476196,
+      "learning_rate": 3.5193605273825553e-05,
+      "loss": 0.006,
+      "step": 1800
+    },
+    {
+      "epoch": 1.2284196547144755,
+      "grad_norm": 0.016155727207660675,
+      "learning_rate": 3.499231492108119e-05,
+      "loss": 0.0002,
+      "step": 1850
+    },
+    {
+      "epoch": 1.2616201859229748,
+      "grad_norm": 0.003053726628422737,
+      "learning_rate": 3.479102456833682e-05,
+      "loss": 0.0071,
+      "step": 1900
+    },
+    {
+      "epoch": 1.294820717131474,
+      "grad_norm": 0.0003582706267479807,
+      "learning_rate": 3.458973421559245e-05,
+      "loss": 0.0001,
+      "step": 1950
+    },
+    {
+      "epoch": 1.3280212483399734,
+      "grad_norm": 0.03127170354127884,
+      "learning_rate": 3.438844386284808e-05,
+      "loss": 0.0071,
+      "step": 2000
+    },
+    {
+      "epoch": 1.361221779548473,
+      "grad_norm": 0.0018665710231289268,
+      "learning_rate": 3.418715351010371e-05,
+      "loss": 0.0112,
+      "step": 2050
+    },
+    {
+      "epoch": 1.3944223107569722,
+      "grad_norm": 0.05576420947909355,
+      "learning_rate": 3.3985863157359336e-05,
+      "loss": 0.0071,
+      "step": 2100
+    },
+    {
+      "epoch": 1.4276228419654715,
+      "grad_norm": 4.577602863311768,
+      "learning_rate": 3.378457280461497e-05,
+      "loss": 0.0114,
+      "step": 2150
+    },
+    {
+      "epoch": 1.4608233731739708,
+      "grad_norm": 0.00251931045204401,
+      "learning_rate": 3.35832824518706e-05,
+      "loss": 0.0136,
+      "step": 2200
+    },
+    {
+      "epoch": 1.4940239043824701,
+      "grad_norm": 0.00033850205363705754,
+      "learning_rate": 3.338199209912623e-05,
+      "loss": 0.001,
+      "step": 2250
+    },
+    {
+      "epoch": 1.5272244355909694,
+      "grad_norm": 0.0019836120773106813,
+      "learning_rate": 3.318070174638187e-05,
+      "loss": 0.0001,
+      "step": 2300
+    },
+    {
+      "epoch": 1.5604249667994687,
+      "grad_norm": 0.00020935946668032557,
+      "learning_rate": 3.297941139363749e-05,
+      "loss": 0.0,
+      "step": 2350
+    },
+    {
+      "epoch": 1.593625498007968,
+      "grad_norm": 0.000308250222587958,
+      "learning_rate": 3.2778121040893126e-05,
+      "loss": 0.0001,
+      "step": 2400
+    },
+    {
+      "epoch": 1.6268260292164674,
+      "grad_norm": 0.020645378157496452,
+      "learning_rate": 3.2576830688148755e-05,
+      "loss": 0.0092,
+      "step": 2450
+    },
+    {
+      "epoch": 1.6600265604249667,
+      "grad_norm": 0.000331960734911263,
+      "learning_rate": 3.2375540335404384e-05,
+      "loss": 0.004,
+      "step": 2500
+    },
+    {
+      "epoch": 1.6932270916334662,
+      "grad_norm": 0.0023610808420926332,
+      "learning_rate": 3.2174249982660014e-05,
+      "loss": 0.0062,
+      "step": 2550
+    },
+    {
+      "epoch": 1.7264276228419655,
+      "grad_norm": 0.0006301538087427616,
+      "learning_rate": 3.197295962991565e-05,
+      "loss": 0.0023,
+      "step": 2600
+    },
+    {
+      "epoch": 1.7596281540504648,
+      "grad_norm": 0.00027294279425404966,
+      "learning_rate": 3.177166927717127e-05,
+      "loss": 0.0078,
+      "step": 2650
+    },
+    {
+      "epoch": 1.792828685258964,
+      "grad_norm": 0.012537718750536442,
+      "learning_rate": 3.157037892442691e-05,
+      "loss": 0.0069,
+      "step": 2700
+    },
+    {
+      "epoch": 1.8260292164674636,
+      "grad_norm": 0.0029420643113553524,
+      "learning_rate": 3.136908857168254e-05,
+      "loss": 0.0086,
+      "step": 2750
+    },
+    {
+      "epoch": 1.859229747675963,
+      "grad_norm": 0.023261522874236107,
+      "learning_rate": 3.116779821893817e-05,
+      "loss": 0.0099,
+      "step": 2800
+    },
+    {
+      "epoch": 1.8924302788844622,
+      "grad_norm": 0.0013812623219564557,
+      "learning_rate": 3.0966507866193796e-05,
+      "loss": 0.007,
+      "step": 2850
+    },
+    {
+      "epoch": 1.9256308100929616,
+      "grad_norm": 0.0014662343310192227,
+      "learning_rate": 3.0765217513449426e-05,
+      "loss": 0.0013,
+      "step": 2900
+    },
+    {
+      "epoch": 1.9588313413014609,
+      "grad_norm": 0.014669010415673256,
+      "learning_rate": 3.0563927160705055e-05,
+      "loss": 0.0134,
+      "step": 2950
+    },
+    {
+      "epoch": 1.9920318725099602,
+      "grad_norm": 0.014950312674045563,
+      "learning_rate": 3.036263680796069e-05,
+      "loss": 0.0061,
+      "step": 3000
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.9977337786259542,
+      "eval_f1": 0.9977150455912147,
+      "eval_loss": 0.013810121454298496,
+      "eval_precision": 0.9977195036627051,
+      "eval_recall": 0.9977337786259542,
+      "eval_runtime": 31.5085,
+      "eval_samples_per_second": 266.087,
+      "eval_steps_per_second": 8.315,
+      "step": 3012
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 10542,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 7,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.282861088518144e+16,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/trial-9/checkpoint-3012/training_args.bin b/trial-9/checkpoint-3012/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5f1454f457da94bd2c196f1046d24d373d571f40
--- /dev/null
+++ b/trial-9/checkpoint-3012/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:72c66611e9dfe0c92b05ede60995e52f447a378ac3e2dd77b9ffae1fd950a0d4
+size 5368