AhmedSSoliman committed on Feb 12

Commit

a671687

verified ·

1 Parent(s): 6d06c2a

Model save

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

README.md +73 -0
config.json +47 -0
model.safetensors +3 -0
training_args.bin +3 -0
trial-0/checkpoint-1506/config.json +47 -0
trial-0/checkpoint-1506/model.safetensors +3 -0
trial-0/checkpoint-1506/optimizer.pt +3 -0
trial-0/checkpoint-1506/rng_state.pth +3 -0
trial-0/checkpoint-1506/scheduler.pt +3 -0
trial-0/checkpoint-1506/trainer_state.json +255 -0
trial-0/checkpoint-1506/training_args.bin +3 -0
trial-1/checkpoint-6022/config.json +47 -0
trial-1/checkpoint-6022/model.safetensors +3 -0
trial-1/checkpoint-6022/optimizer.pt +3 -0
trial-1/checkpoint-6022/rng_state.pth +3 -0
trial-1/checkpoint-6022/scheduler.pt +3 -0
trial-1/checkpoint-6022/trainer_state.json +897 -0
trial-1/checkpoint-6022/training_args.bin +3 -0
trial-2/checkpoint-6022/config.json +47 -0
trial-2/checkpoint-6022/model.safetensors +3 -0
trial-2/checkpoint-6022/optimizer.pt +3 -0
trial-2/checkpoint-6022/rng_state.pth +3 -0
trial-2/checkpoint-6022/scheduler.pt +3 -0
trial-2/checkpoint-6022/trainer_state.json +897 -0
trial-2/checkpoint-6022/training_args.bin +3 -0
trial-3/checkpoint-1506/config.json +47 -0
trial-3/checkpoint-1506/model.safetensors +3 -0
trial-3/checkpoint-1506/optimizer.pt +3 -0
trial-3/checkpoint-1506/rng_state.pth +3 -0
trial-3/checkpoint-1506/scheduler.pt +3 -0
trial-3/checkpoint-1506/trainer_state.json +255 -0
trial-3/checkpoint-1506/training_args.bin +3 -0
trial-4/checkpoint-3011/config.json +47 -0
trial-4/checkpoint-3011/model.safetensors +3 -0
trial-4/checkpoint-3011/optimizer.pt +3 -0
trial-4/checkpoint-3011/rng_state.pth +3 -0
trial-4/checkpoint-3011/scheduler.pt +3 -0
trial-4/checkpoint-3011/trainer_state.json +465 -0
trial-4/checkpoint-3011/training_args.bin +3 -0
trial-5/checkpoint-3012/config.json +47 -0
trial-5/checkpoint-3012/model.safetensors +3 -0
trial-5/checkpoint-3012/optimizer.pt +3 -0
trial-5/checkpoint-3012/rng_state.pth +3 -0
trial-5/checkpoint-3012/scheduler.pt +3 -0
trial-5/checkpoint-3012/trainer_state.json +477 -0
trial-5/checkpoint-3012/training_args.bin +3 -0
trial-6/checkpoint-6022/config.json +47 -0
trial-6/checkpoint-6022/model.safetensors +3 -0
trial-6/checkpoint-6022/optimizer.pt +3 -0
trial-6/checkpoint-6022/rng_state.pth +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,73 @@

+---
+library_name: transformers
+license: apache-2.0
+base_model: answerdotai/ModernBERT-base
+tags:
+- generated_from_trainer
+metrics:
+- accuracy
+- precision
+- recall
+- f1
+model-index:
+- name: answerdotai-ModernBERT-base-finetuned
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# answerdotai-ModernBERT-base-finetuned
+This model is a fine-tuned version of [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base) on an unspecified dataset (no dataset name was recorded by the Trainer).
+It achieves the following results on the evaluation set:
+- Loss: 0.0116
+- Accuracy: 0.9976
+- Precision: 0.9977
+- Recall: 0.9976
+- F1: 0.9976
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 4.244005797262286e-05
+- train_batch_size: 32
+- eval_batch_size: 32
+- seed: 42
+- optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9,0.999) and epsilon=1e-08; no additional optimizer arguments
+- lr_scheduler_type: linear
+- num_epochs: 7
+### Training results
+| Training Loss | Epoch | Step  | Validation Loss | Accuracy | Precision | Recall | F1     |
+|:-------------:|:-----:|:-----:|:---------------:|:--------:|:---------:|:------:|:------:|
+| 0.0175        | 1.0   | 1506  | 0.0195          | 0.9971   | 0.9971    | 0.9971 | 0.9971 |
+| 0.0134        | 2.0   | 3012  | 0.0153          | 0.9970   | 0.9970    | 0.9970 | 0.9970 |
+| 0.0           | 3.0   | 4518  | 0.0228          | 0.9976   | 0.9976    | 0.9976 | 0.9976 |
+| 0.0           | 4.0   | 6024  | 0.0270          | 0.9976   | 0.9976    | 0.9976 | 0.9976 |
+| 0.0           | 5.0   | 7530  | 0.0272          | 0.9976   | 0.9976    | 0.9976 | 0.9976 |
+| 0.0           | 6.0   | 9036  | 0.0279          | 0.9975   | 0.9975    | 0.9975 | 0.9975 |
+| 0.0           | 7.0   | 10542 | 0.0283          | 0.9975   | 0.9975    | 0.9975 | 0.9975 |
+### Framework versions
+- Transformers 4.48.0.dev0
+- Pytorch 2.5.1+cu124
+- Datasets 3.2.0
+- Tokenizers 0.21.0

config.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd2c8555404b25095196f950baad8216db0404ff16448d62a6d453105d7bd0c7
+size 598439784

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:33b0c987e99ad21c3b9517dc831f21fd66bcbcd55d62a62f0a28008a0e8674e2
+size 5432

trial-0/checkpoint-1506/config.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}

trial-0/checkpoint-1506/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:68eefa4a9be7b2db68618e1cb44c2cdf2163fb53cc3380fc52767266b121ddd2
+size 598439784

trial-0/checkpoint-1506/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:08a1a4cc69805f73befa2723d41c1d97c0a2f799125f15e25de8295d6c23580c
+size 1196967418

trial-0/checkpoint-1506/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:568428d80a25211a390c359ca51b0b20b38ca0607fbc196f106c9841c02d3e59
+size 14244

trial-0/checkpoint-1506/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c5bddebb63f2196cebff07c6da8f9e668e8379463981f8be40fb7e151e6c09ff
+size 1064

trial-0/checkpoint-1506/trainer_state.json ADDED Viewed

	@@ -0,0 +1,255 @@

+{
+  "best_metric": 0.02135350927710533,
+  "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-0/checkpoint-1506",
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1506,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.033200531208499334,
+      "grad_norm": 11.822611808776855,
+      "learning_rate": 4.4935320035267014e-05,
+      "loss": 0.295,
+      "step": 50
+    },
+    {
+      "epoch": 0.06640106241699867,
+      "grad_norm": 0.11557121574878693,
+      "learning_rate": 4.463495024893502e-05,
+      "loss": 0.0808,
+      "step": 100
+    },
+    {
+      "epoch": 0.099601593625498,
+      "grad_norm": 0.01743650808930397,
+      "learning_rate": 4.433458046260302e-05,
+      "loss": 0.052,
+      "step": 150
+    },
+    {
+      "epoch": 0.13280212483399734,
+      "grad_norm": 4.474731922149658,
+      "learning_rate": 4.4034210676271024e-05,
+      "loss": 0.0491,
+      "step": 200
+    },
+    {
+      "epoch": 0.16600265604249667,
+      "grad_norm": 4.205756664276123,
+      "learning_rate": 4.373384088993902e-05,
+      "loss": 0.0344,
+      "step": 250
+    },
+    {
+      "epoch": 0.199203187250996,
+      "grad_norm": 4.239188194274902,
+      "learning_rate": 4.343347110360703e-05,
+      "loss": 0.0295,
+      "step": 300
+    },
+    {
+      "epoch": 0.23240371845949534,
+      "grad_norm": 0.19662700593471527,
+      "learning_rate": 4.3133101317275027e-05,
+      "loss": 0.0342,
+      "step": 350
+    },
+    {
+      "epoch": 0.2656042496679947,
+      "grad_norm": 0.008393031544983387,
+      "learning_rate": 4.2832731530943025e-05,
+      "loss": 0.0245,
+      "step": 400
+    },
+    {
+      "epoch": 0.29880478087649404,
+      "grad_norm": 0.06995929777622223,
+      "learning_rate": 4.253236174461103e-05,
+      "loss": 0.0281,
+      "step": 450
+    },
+    {
+      "epoch": 0.33200531208499334,
+      "grad_norm": 0.010315222665667534,
+      "learning_rate": 4.223199195827902e-05,
+      "loss": 0.0188,
+      "step": 500
+    },
+    {
+      "epoch": 0.3652058432934927,
+      "grad_norm": 3.1021769046783447,
+      "learning_rate": 4.193162217194703e-05,
+      "loss": 0.018,
+      "step": 550
+    },
+    {
+      "epoch": 0.398406374501992,
+      "grad_norm": 0.00041495164623484015,
+      "learning_rate": 4.1631252385615027e-05,
+      "loss": 0.0053,
+      "step": 600
+    },
+    {
+      "epoch": 0.4316069057104914,
+      "grad_norm": 0.19596342742443085,
+      "learning_rate": 4.133088259928303e-05,
+      "loss": 0.0178,
+      "step": 650
+    },
+    {
+      "epoch": 0.4648074369189907,
+      "grad_norm": 0.0566418319940567,
+      "learning_rate": 4.103051281295103e-05,
+      "loss": 0.0101,
+      "step": 700
+    },
+    {
+      "epoch": 0.49800796812749004,
+      "grad_norm": 0.005816417746245861,
+      "learning_rate": 4.0730143026619036e-05,
+      "loss": 0.0166,
+      "step": 750
+    },
+    {
+      "epoch": 0.5312084993359893,
+      "grad_norm": 2.2474324703216553,
+      "learning_rate": 4.0429773240287035e-05,
+      "loss": 0.0156,
+      "step": 800
+    },
+    {
+      "epoch": 0.5644090305444888,
+      "grad_norm": 0.06311876326799393,
+      "learning_rate": 4.0129403453955033e-05,
+      "loss": 0.0166,
+      "step": 850
+    },
+    {
+      "epoch": 0.5976095617529881,
+      "grad_norm": 0.012764506973326206,
+      "learning_rate": 3.982903366762304e-05,
+      "loss": 0.0175,
+      "step": 900
+    },
+    {
+      "epoch": 0.6308100929614874,
+      "grad_norm": 0.00253055221401155,
+      "learning_rate": 3.952866388129104e-05,
+      "loss": 0.0047,
+      "step": 950
+    },
+    {
+      "epoch": 0.6640106241699867,
+      "grad_norm": 0.03604559600353241,
+      "learning_rate": 3.922829409495904e-05,
+      "loss": 0.016,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6972111553784861,
+      "grad_norm": 0.006498202681541443,
+      "learning_rate": 3.892792430862704e-05,
+      "loss": 0.0055,
+      "step": 1050
+    },
+    {
+      "epoch": 0.7304116865869854,
+      "grad_norm": 0.11296769976615906,
+      "learning_rate": 3.862755452229504e-05,
+      "loss": 0.0122,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7636122177954847,
+      "grad_norm": 0.0005851402529515326,
+      "learning_rate": 3.8327184735963046e-05,
+      "loss": 0.01,
+      "step": 1150
+    },
+    {
+      "epoch": 0.796812749003984,
+      "grad_norm": 0.018440622836351395,
+      "learning_rate": 3.8026814949631044e-05,
+      "loss": 0.0064,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8300132802124834,
+      "grad_norm": 0.0023099363315850496,
+      "learning_rate": 3.772644516329905e-05,
+      "loss": 0.0011,
+      "step": 1250
+    },
+    {
+      "epoch": 0.8632138114209827,
+      "grad_norm": 0.07595626264810562,
+      "learning_rate": 3.742607537696705e-05,
+      "loss": 0.0156,
+      "step": 1300
+    },
+    {
+      "epoch": 0.896414342629482,
+      "grad_norm": 0.0008996099350042641,
+      "learning_rate": 3.7125705590635054e-05,
+      "loss": 0.0103,
+      "step": 1350
+    },
+    {
+      "epoch": 0.9296148738379814,
+      "grad_norm": 3.656134504126385e-05,
+      "learning_rate": 3.682533580430305e-05,
+      "loss": 0.0027,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9628154050464808,
+      "grad_norm": 0.2666904032230377,
+      "learning_rate": 3.652496601797105e-05,
+      "loss": 0.0152,
+      "step": 1450
+    },
+    {
+      "epoch": 0.9960159362549801,
+      "grad_norm": 0.011590929701924324,
+      "learning_rate": 3.622459623163905e-05,
+      "loss": 0.0115,
+      "step": 1500
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.9963024809160306,
+      "eval_f1": 0.9962997469825083,
+      "eval_loss": 0.02135350927710533,
+      "eval_precision": 0.9962971957079396,
+      "eval_recall": 0.9963024809160306,
+      "eval_runtime": 34.0647,
+      "eval_samples_per_second": 246.12,
+      "eval_steps_per_second": 7.691,
+      "step": 1506
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 7530,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.641430544259072e+16,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

trial-0/checkpoint-1506/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f87e0989b8aabc63686d8b1c4f4f6463501f9b534fd10b5dda472e02e5c6d200
+size 5368

trial-1/checkpoint-6022/config.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}

trial-1/checkpoint-6022/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9376e02caf20a3536db5adaec49e89c8583378974c975bdfa4e4fa72bb7ed87c
+size 598439784

trial-1/checkpoint-6022/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f989a18c3b9f0cb969ade19c78b7d7d4405053c69000081f12d16f8076c4691
+size 1196967418

trial-1/checkpoint-6022/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
+size 14244

trial-1/checkpoint-6022/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:04bd594b0cd8e46cee28cfc34b0ba6a02854df28789c81eb4c180d9356f4de00
+size 1064

trial-1/checkpoint-6022/trainer_state.json ADDED Viewed

	@@ -0,0 +1,897 @@

+{
+  "best_metric": 0.0445549376308918,
+  "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-1/checkpoint-6022",
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 6022,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016605778811026237,
+      "grad_norm": 15.757351875305176,
+      "learning_rate": 2.4306427769118723e-06,
+      "loss": 0.6703,
+      "step": 50
+    },
+    {
+      "epoch": 0.033211557622052475,
+      "grad_norm": 14.056926727294922,
+      "learning_rate": 2.425586942863882e-06,
+      "loss": 0.4736,
+      "step": 100
+    },
+    {
+      "epoch": 0.04981733643307871,
+      "grad_norm": 15.678231239318848,
+      "learning_rate": 2.4205311088158915e-06,
+      "loss": 0.338,
+      "step": 150
+    },
+    {
+      "epoch": 0.06642311524410495,
+      "grad_norm": 4.84220552444458,
+      "learning_rate": 2.4154752747679013e-06,
+      "loss": 0.2931,
+      "step": 200
+    },
+    {
+      "epoch": 0.08302889405513118,
+      "grad_norm": 5.182389736175537,
+      "learning_rate": 2.4104194407199107e-06,
+      "loss": 0.251,
+      "step": 250
+    },
+    {
+      "epoch": 0.09963467286615742,
+      "grad_norm": 1.5187151432037354,
+      "learning_rate": 2.4053636066719205e-06,
+      "loss": 0.2133,
+      "step": 300
+    },
+    {
+      "epoch": 0.11624045167718366,
+      "grad_norm": 16.253589630126953,
+      "learning_rate": 2.40030777262393e-06,
+      "loss": 0.1518,
+      "step": 350
+    },
+    {
+      "epoch": 0.1328462304882099,
+      "grad_norm": 6.757865905761719,
+      "learning_rate": 2.3952519385759397e-06,
+      "loss": 0.1508,
+      "step": 400
+    },
+    {
+      "epoch": 0.14945200929923613,
+      "grad_norm": 2.119438886642456,
+      "learning_rate": 2.390196104527949e-06,
+      "loss": 0.1175,
+      "step": 450
+    },
+    {
+      "epoch": 0.16605778811026237,
+      "grad_norm": 15.932334899902344,
+      "learning_rate": 2.3851402704799585e-06,
+      "loss": 0.1401,
+      "step": 500
+    },
+    {
+      "epoch": 0.1826635669212886,
+      "grad_norm": 22.459735870361328,
+      "learning_rate": 2.3800844364319683e-06,
+      "loss": 0.1384,
+      "step": 550
+    },
+    {
+      "epoch": 0.19926934573231483,
+      "grad_norm": 10.65778923034668,
+      "learning_rate": 2.3750286023839777e-06,
+      "loss": 0.1179,
+      "step": 600
+    },
+    {
+      "epoch": 0.2158751245433411,
+      "grad_norm": 6.71965217590332,
+      "learning_rate": 2.3699727683359876e-06,
+      "loss": 0.0782,
+      "step": 650
+    },
+    {
+      "epoch": 0.23248090335436733,
+      "grad_norm": 3.6098344326019287,
+      "learning_rate": 2.364916934287997e-06,
+      "loss": 0.138,
+      "step": 700
+    },
+    {
+      "epoch": 0.24908668216539356,
+      "grad_norm": 2.3249447345733643,
+      "learning_rate": 2.3598611002400068e-06,
+      "loss": 0.1087,
+      "step": 750
+    },
+    {
+      "epoch": 0.2656924609764198,
+      "grad_norm": 15.047837257385254,
+      "learning_rate": 2.354805266192016e-06,
+      "loss": 0.0868,
+      "step": 800
+    },
+    {
+      "epoch": 0.282298239787446,
+      "grad_norm": 6.7322773933410645,
+      "learning_rate": 2.349749432144026e-06,
+      "loss": 0.0954,
+      "step": 850
+    },
+    {
+      "epoch": 0.29890401859847227,
+      "grad_norm": 12.954623222351074,
+      "learning_rate": 2.3446935980960354e-06,
+      "loss": 0.0689,
+      "step": 900
+    },
+    {
+      "epoch": 0.3155097974094985,
+      "grad_norm": 1.4312756061553955,
+      "learning_rate": 2.3396377640480448e-06,
+      "loss": 0.0908,
+      "step": 950
+    },
+    {
+      "epoch": 0.33211557622052473,
+      "grad_norm": 0.21316280961036682,
+      "learning_rate": 2.3345819300000546e-06,
+      "loss": 0.0766,
+      "step": 1000
+    },
+    {
+      "epoch": 0.348721355031551,
+      "grad_norm": 13.642809867858887,
+      "learning_rate": 2.329526095952064e-06,
+      "loss": 0.0533,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3653271338425772,
+      "grad_norm": 14.525202751159668,
+      "learning_rate": 2.324470261904074e-06,
+      "loss": 0.0745,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38193291265360346,
+      "grad_norm": 0.5210687518119812,
+      "learning_rate": 2.319414427856083e-06,
+      "loss": 0.0618,
+      "step": 1150
+    },
+    {
+      "epoch": 0.39853869146462967,
+      "grad_norm": 0.07292640954256058,
+      "learning_rate": 2.314358593808093e-06,
+      "loss": 0.0307,
+      "step": 1200
+    },
+    {
+      "epoch": 0.41514447027565593,
+      "grad_norm": 0.08236780017614365,
+      "learning_rate": 2.309302759760103e-06,
+      "loss": 0.0321,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4317502490866822,
+      "grad_norm": 28.97471809387207,
+      "learning_rate": 2.304246925712112e-06,
+      "loss": 0.0748,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4483560278977084,
+      "grad_norm": 0.4781515896320343,
+      "learning_rate": 2.2991910916641216e-06,
+      "loss": 0.0733,
+      "step": 1350
+    },
+    {
+      "epoch": 0.46496180670873466,
+      "grad_norm": 3.214794397354126,
+      "learning_rate": 2.2941352576161314e-06,
+      "loss": 0.0149,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48156758551976087,
+      "grad_norm": 0.3289443850517273,
+      "learning_rate": 2.289079423568141e-06,
+      "loss": 0.0401,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4981733643307871,
+      "grad_norm": 0.12368986010551453,
+      "learning_rate": 2.28402358952015e-06,
+      "loss": 0.0334,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5147791431418134,
+      "grad_norm": 0.08283340185880661,
+      "learning_rate": 2.27896775547216e-06,
+      "loss": 0.0331,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5313849219528396,
+      "grad_norm": 2.650063991546631,
+      "learning_rate": 2.2739119214241694e-06,
+      "loss": 0.0496,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5479907007638658,
+      "grad_norm": 3.296297311782837,
+      "learning_rate": 2.2688560873761792e-06,
+      "loss": 0.0365,
+      "step": 1650
+    },
+    {
+      "epoch": 0.564596479574892,
+      "grad_norm": 0.032304324209690094,
+      "learning_rate": 2.263800253328189e-06,
+      "loss": 0.005,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5812022583859183,
+      "grad_norm": 0.003552216337993741,
+      "learning_rate": 2.2587444192801985e-06,
+      "loss": 0.0183,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5978080371969445,
+      "grad_norm": 0.0315885953605175,
+      "learning_rate": 2.253688585232208e-06,
+      "loss": 0.0184,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6144138160079707,
+      "grad_norm": 0.004702410195022821,
+      "learning_rate": 2.2486327511842177e-06,
+      "loss": 0.0346,
+      "step": 1850
+    },
+    {
+      "epoch": 0.631019594818997,
+      "grad_norm": 0.07862639427185059,
+      "learning_rate": 2.243576917136227e-06,
+      "loss": 0.0296,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6476253736300233,
+      "grad_norm": 0.3578585982322693,
+      "learning_rate": 2.2385210830882364e-06,
+      "loss": 0.0266,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6642311524410495,
+      "grad_norm": 0.045335959643125534,
+      "learning_rate": 2.2334652490402463e-06,
+      "loss": 0.032,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6808369312520757,
+      "grad_norm": 1.6869137287139893,
+      "learning_rate": 2.2284094149922557e-06,
+      "loss": 0.0297,
+      "step": 2050
+    },
+    {
+      "epoch": 0.697442710063102,
+      "grad_norm": 0.6017621755599976,
+      "learning_rate": 2.2233535809442655e-06,
+      "loss": 0.0119,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7140484888741282,
+      "grad_norm": 0.13145552575588226,
+      "learning_rate": 2.2182977468962753e-06,
+      "loss": 0.0157,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7306542676851544,
+      "grad_norm": 0.00971242692321539,
+      "learning_rate": 2.2132419128482847e-06,
+      "loss": 0.0099,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7472600464961807,
+      "grad_norm": 0.5801131725311279,
+      "learning_rate": 2.208186078800294e-06,
+      "loss": 0.0235,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7638658253072069,
+      "grad_norm": 0.008363746106624603,
+      "learning_rate": 2.203130244752304e-06,
+      "loss": 0.0275,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7804716041182331,
+      "grad_norm": 0.23013177514076233,
+      "learning_rate": 2.1980744107043133e-06,
+      "loss": 0.0022,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7970773829292593,
+      "grad_norm": 0.044313572347164154,
+      "learning_rate": 2.1930185766563227e-06,
+      "loss": 0.0185,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8136831617402857,
+      "grad_norm": 0.008519169874489307,
+      "learning_rate": 2.1879627426083325e-06,
+      "loss": 0.0023,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8302889405513119,
+      "grad_norm": 0.0008576350519433618,
+      "learning_rate": 2.182906908560342e-06,
+      "loss": 0.0062,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8468947193623381,
+      "grad_norm": 0.56068354845047,
+      "learning_rate": 2.1778510745123517e-06,
+      "loss": 0.0106,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8635004981733644,
+      "grad_norm": 33.770652770996094,
+      "learning_rate": 2.1727952404643615e-06,
+      "loss": 0.0298,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8801062769843906,
+      "grad_norm": 0.0006891911034472287,
+      "learning_rate": 2.167739406416371e-06,
+      "loss": 0.0046,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8967120557954168,
+      "grad_norm": 0.000691475928761065,
+      "learning_rate": 2.1626835723683803e-06,
+      "loss": 0.0014,
+      "step": 2700
+    },
+    {
+      "epoch": 0.913317834606443,
+      "grad_norm": 0.022216275334358215,
+      "learning_rate": 2.15762773832039e-06,
+      "loss": 0.0152,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9299236134174693,
+      "grad_norm": 0.0004267705953679979,
+      "learning_rate": 2.1525719042723995e-06,
+      "loss": 0.0117,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9465293922284955,
+      "grad_norm": 0.016712836921215057,
+      "learning_rate": 2.147516070224409e-06,
+      "loss": 0.0009,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9631351710395217,
+      "grad_norm": 23.74860382080078,
+      "learning_rate": 2.1424602361764187e-06,
+      "loss": 0.0233,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9797409498505479,
+      "grad_norm": 0.0039037028327584267,
+      "learning_rate": 2.137404402128428e-06,
+      "loss": 0.0193,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9963467286615743,
+      "grad_norm": 0.0023961260449141264,
+      "learning_rate": 2.132348568080438e-06,
+      "loss": 0.0068,
+      "step": 3000
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.9921278625954199,
+      "eval_f1": 0.9921278625954199,
+      "eval_loss": 0.046909503638744354,
+      "eval_precision": 0.9921278625954199,
+      "eval_recall": 0.9921278625954199,
+      "eval_runtime": 36.762,
+      "eval_samples_per_second": 228.061,
+      "eval_steps_per_second": 14.254,
+      "step": 3011
+    },
+    {
+      "epoch": 1.0129525074726005,
+      "grad_norm": 0.0033601378090679646,
+      "learning_rate": 2.1272927340324478e-06,
+      "loss": 0.0005,
+      "step": 3050
+    },
+    {
+      "epoch": 1.0295582862836268,
+      "grad_norm": 0.038166940212249756,
+      "learning_rate": 2.122236899984457e-06,
+      "loss": 0.0002,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0461640650946529,
+      "grad_norm": 0.0003456630220171064,
+      "learning_rate": 2.1171810659364666e-06,
+      "loss": 0.0139,
+      "step": 3150
+    },
+    {
+      "epoch": 1.0627698439056792,
+      "grad_norm": 0.004587268922477961,
+      "learning_rate": 2.1121252318884764e-06,
+      "loss": 0.0001,
+      "step": 3200
+    },
+    {
+      "epoch": 1.0793756227167055,
+      "grad_norm": 0.08502045273780823,
+      "learning_rate": 2.1070693978404858e-06,
+      "loss": 0.0216,
+      "step": 3250
+    },
+    {
+      "epoch": 1.0959814015277316,
+      "grad_norm": 0.10945820808410645,
+      "learning_rate": 2.102013563792495e-06,
+      "loss": 0.0256,
+      "step": 3300
+    },
+    {
+      "epoch": 1.112587180338758,
+      "grad_norm": 0.03236968442797661,
+      "learning_rate": 2.096957729744505e-06,
+      "loss": 0.005,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1291929591497842,
+      "grad_norm": 0.007731316145509481,
+      "learning_rate": 2.0919018956965144e-06,
+      "loss": 0.0101,
+      "step": 3400
+    },
+    {
+      "epoch": 1.1457987379608103,
+      "grad_norm": 0.00674546230584383,
+      "learning_rate": 2.086846061648524e-06,
+      "loss": 0.0051,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1624045167718366,
+      "grad_norm": 0.004380326252430677,
+      "learning_rate": 2.081790227600534e-06,
+      "loss": 0.0039,
+      "step": 3500
+    },
+    {
+      "epoch": 1.1790102955828627,
+      "grad_norm": 0.031456008553504944,
+      "learning_rate": 2.0767343935525434e-06,
+      "loss": 0.0001,
+      "step": 3550
+    },
+    {
+      "epoch": 1.195616074393889,
+      "grad_norm": 0.017602458596229553,
+      "learning_rate": 2.071678559504553e-06,
+      "loss": 0.006,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2122218532049154,
+      "grad_norm": 0.009589639492332935,
+      "learning_rate": 2.0666227254565626e-06,
+      "loss": 0.001,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2288276320159415,
+      "grad_norm": 0.003254746785387397,
+      "learning_rate": 2.061566891408572e-06,
+      "loss": 0.0,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2454334108269678,
+      "grad_norm": 0.0011986729223281145,
+      "learning_rate": 2.056511057360582e-06,
+      "loss": 0.0126,
+      "step": 3750
+    },
+    {
+      "epoch": 1.2620391896379939,
+      "grad_norm": 0.006293583195656538,
+      "learning_rate": 2.0514552233125912e-06,
+      "loss": 0.0006,
+      "step": 3800
+    },
+    {
+      "epoch": 1.2786449684490202,
+      "grad_norm": 0.11370380967855453,
+      "learning_rate": 2.0463993892646006e-06,
+      "loss": 0.0252,
+      "step": 3850
+    },
+    {
+      "epoch": 1.2952507472600465,
+      "grad_norm": 0.0018469190690666437,
+      "learning_rate": 2.0413435552166104e-06,
+      "loss": 0.0004,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3118565260710726,
+      "grad_norm": 0.0002411604655208066,
+      "learning_rate": 2.0362877211686202e-06,
+      "loss": 0.003,
+      "step": 3950
+    },
+    {
+      "epoch": 1.328462304882099,
+      "grad_norm": 4.065009852638468e-05,
+      "learning_rate": 2.0312318871206296e-06,
+      "loss": 0.0165,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3450680836931252,
+      "grad_norm": 0.005062599666416645,
+      "learning_rate": 2.0261760530726395e-06,
+      "loss": 0.0028,
+      "step": 4050
+    },
+    {
+      "epoch": 1.3616738625041513,
+      "grad_norm": 0.017400013282895088,
+      "learning_rate": 2.021120219024649e-06,
+      "loss": 0.001,
+      "step": 4100
+    },
+    {
+      "epoch": 1.3782796413151777,
+      "grad_norm": 0.05683843046426773,
+      "learning_rate": 2.0160643849766582e-06,
+      "loss": 0.0124,
+      "step": 4150
+    },
+    {
+      "epoch": 1.394885420126204,
+      "grad_norm": 0.0027029893826693296,
+      "learning_rate": 2.011008550928668e-06,
+      "loss": 0.0003,
+      "step": 4200
+    },
+    {
+      "epoch": 1.41149119893723,
+      "grad_norm": 0.002034110017120838,
+      "learning_rate": 2.0059527168806775e-06,
+      "loss": 0.0073,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4280969777482564,
+      "grad_norm": 0.001398180378600955,
+      "learning_rate": 2.000896882832687e-06,
+      "loss": 0.0044,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4447027565592827,
+      "grad_norm": 0.00037716259248554707,
+      "learning_rate": 1.9958410487846967e-06,
+      "loss": 0.0228,
+      "step": 4350
+    },
+    {
+      "epoch": 1.4613085353703088,
+      "grad_norm": 0.015627387911081314,
+      "learning_rate": 1.9907852147367065e-06,
+      "loss": 0.0114,
+      "step": 4400
+    },
+    {
+      "epoch": 1.4779143141813351,
+      "grad_norm": 0.008964600041508675,
+      "learning_rate": 1.985729380688716e-06,
+      "loss": 0.0032,
+      "step": 4450
+    },
+    {
+      "epoch": 1.4945200929923614,
+      "grad_norm": 0.003252738853916526,
+      "learning_rate": 1.9806735466407257e-06,
+      "loss": 0.0082,
+      "step": 4500
+    },
+    {
+      "epoch": 1.5111258718033875,
+      "grad_norm": 0.00012037971464451402,
+      "learning_rate": 1.975617712592735e-06,
+      "loss": 0.0001,
+      "step": 4550
+    },
+    {
+      "epoch": 1.5277316506144138,
+      "grad_norm": 0.010974590666592121,
+      "learning_rate": 1.9705618785447445e-06,
+      "loss": 0.0,
+      "step": 4600
+    },
+    {
+      "epoch": 1.5443374294254402,
+      "grad_norm": 0.08398176729679108,
+      "learning_rate": 1.9655060444967543e-06,
+      "loss": 0.0002,
+      "step": 4650
+    },
+    {
+      "epoch": 1.5609432082364663,
+      "grad_norm": 0.03629281371831894,
+      "learning_rate": 1.9604502104487637e-06,
+      "loss": 0.006,
+      "step": 4700
+    },
+    {
+      "epoch": 1.5775489870474926,
+      "grad_norm": 0.00034110501292161644,
+      "learning_rate": 1.955394376400773e-06,
+      "loss": 0.0003,
+      "step": 4750
+    },
+    {
+      "epoch": 1.594154765858519,
+      "grad_norm": 0.0027959852013736963,
+      "learning_rate": 1.950338542352783e-06,
+      "loss": 0.0,
+      "step": 4800
+    },
+    {
+      "epoch": 1.610760544669545,
+      "grad_norm": 0.0001677741383900866,
+      "learning_rate": 1.9452827083047927e-06,
+      "loss": 0.0023,
+      "step": 4850
+    },
+    {
+      "epoch": 1.627366323480571,
+      "grad_norm": 0.055583104491233826,
+      "learning_rate": 1.940226874256802e-06,
+      "loss": 0.0225,
+      "step": 4900
+    },
+    {
+      "epoch": 1.6439721022915976,
+      "grad_norm": 8.664117194712162e-05,
+      "learning_rate": 1.935171040208812e-06,
+      "loss": 0.0009,
+      "step": 4950
+    },
+    {
+      "epoch": 1.6605778811026237,
+      "grad_norm": 0.0017323939828202128,
+      "learning_rate": 1.9301152061608213e-06,
+      "loss": 0.008,
+      "step": 5000
+    },
+    {
+      "epoch": 1.6771836599136498,
+      "grad_norm": 0.0034425491467118263,
+      "learning_rate": 1.9250593721128307e-06,
+      "loss": 0.0,
+      "step": 5050
+    },
+    {
+      "epoch": 1.6937894387246761,
+      "grad_norm": 6.076216959627345e-05,
+      "learning_rate": 1.9200035380648405e-06,
+      "loss": 0.0041,
+      "step": 5100
+    },
+    {
+      "epoch": 1.7103952175357025,
+      "grad_norm": 0.0018082900205627084,
+      "learning_rate": 1.91494770401685e-06,
+      "loss": 0.0017,
+      "step": 5150
+    },
+    {
+      "epoch": 1.7270009963467285,
+      "grad_norm": 0.008552160114049911,
+      "learning_rate": 1.9098918699688593e-06,
+      "loss": 0.0137,
+      "step": 5200
+    },
+    {
+      "epoch": 1.7436067751577549,
+      "grad_norm": 0.08908296376466751,
+      "learning_rate": 1.9048360359208694e-06,
+      "loss": 0.0092,
+      "step": 5250
+    },
+    {
+      "epoch": 1.7602125539687812,
+      "grad_norm": 0.002973488997668028,
+      "learning_rate": 1.8997802018728788e-06,
+      "loss": 0.0002,
+      "step": 5300
+    },
+    {
+      "epoch": 1.7768183327798073,
+      "grad_norm": 0.005116044543683529,
+      "learning_rate": 1.8947243678248884e-06,
+      "loss": 0.0079,
+      "step": 5350
+    },
+    {
+      "epoch": 1.7934241115908336,
+      "grad_norm": 0.002092874376103282,
+      "learning_rate": 1.889668533776898e-06,
+      "loss": 0.0,
+      "step": 5400
+    },
+    {
+      "epoch": 1.81002989040186,
+      "grad_norm": 0.0070649790577590466,
+      "learning_rate": 1.8846126997289076e-06,
+      "loss": 0.0,
+      "step": 5450
+    },
+    {
+      "epoch": 1.826635669212886,
+      "grad_norm": 0.001974167302250862,
+      "learning_rate": 1.879556865680917e-06,
+      "loss": 0.016,
+      "step": 5500
+    },
+    {
+      "epoch": 1.8432414480239123,
+      "grad_norm": 0.0012006360339000821,
+      "learning_rate": 1.8745010316329268e-06,
+      "loss": 0.0,
+      "step": 5550
+    },
+    {
+      "epoch": 1.8598472268349386,
+      "grad_norm": 0.006318301893770695,
+      "learning_rate": 1.8694451975849362e-06,
+      "loss": 0.0,
+      "step": 5600
+    },
+    {
+      "epoch": 1.8764530056459647,
+      "grad_norm": 0.0020722977351397276,
+      "learning_rate": 1.8643893635369458e-06,
+      "loss": 0.0104,
+      "step": 5650
+    },
+    {
+      "epoch": 1.893058784456991,
+      "grad_norm": 0.0874456912279129,
+      "learning_rate": 1.8593335294889556e-06,
+      "loss": 0.0023,
+      "step": 5700
+    },
+    {
+      "epoch": 1.9096645632680174,
+      "grad_norm": 0.00042386740096844733,
+      "learning_rate": 1.854277695440965e-06,
+      "loss": 0.0105,
+      "step": 5750
+    },
+    {
+      "epoch": 1.9262703420790435,
+      "grad_norm": 0.05140538513660431,
+      "learning_rate": 1.8492218613929746e-06,
+      "loss": 0.0008,
+      "step": 5800
+    },
+    {
+      "epoch": 1.9428761208900698,
+      "grad_norm": 0.00046465068589895964,
+      "learning_rate": 1.8441660273449842e-06,
+      "loss": 0.0176,
+      "step": 5850
+    },
+    {
+      "epoch": 1.959481899701096,
+      "grad_norm": 0.001875279936939478,
+      "learning_rate": 1.8391101932969938e-06,
+      "loss": 0.0002,
+      "step": 5900
+    },
+    {
+      "epoch": 1.9760876785121222,
+      "grad_norm": 0.0012590339174494147,
+      "learning_rate": 1.8340543592490032e-06,
+      "loss": 0.001,
+      "step": 5950
+    },
+    {
+      "epoch": 1.9926934573231485,
+      "grad_norm": 25.133811950683594,
+      "learning_rate": 1.828998525201013e-06,
+      "loss": 0.0229,
+      "step": 6000
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.995706106870229,
+      "eval_f1": 0.9956269879098661,
+      "eval_loss": 0.0445549376308918,
+      "eval_precision": 0.9956596696711074,
+      "eval_recall": 0.995706106870229,
+      "eval_runtime": 38.3077,
+      "eval_samples_per_second": 218.859,
+      "eval_steps_per_second": 13.679,
+      "step": 6022
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 24088,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.282861088518144e+16,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

trial-1/checkpoint-6022/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:161830f01fe4451cf2afb08516c24e569c5b229b44b735c51814ae17b5494e10
+size 5368

trial-2/checkpoint-6022/config.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}

trial-2/checkpoint-6022/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:33d8242e8a21a76a0ad8b21949fe7bd68e94de5ce2da543a151336909fcb8e83
+size 598439784

trial-2/checkpoint-6022/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c89405c1def95fb7d1e0ff7deac188ca136134ebd620d1451c9f0d4ed557d77a
+size 1196967418

trial-2/checkpoint-6022/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
+size 14244

trial-2/checkpoint-6022/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:daebe5b6f96508652ee77aa623e80e4943a4ab7b8acffe2720aa77d58c2624f9
+size 1064

trial-2/checkpoint-6022/trainer_state.json ADDED Viewed

	@@ -0,0 +1,897 @@

+{
+  "best_metric": 0.031979888677597046,
+  "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-2/checkpoint-6022",
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 6022,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016605778811026237,
+      "grad_norm": 21.788597106933594,
+      "learning_rate": 5.429575351871404e-06,
+      "loss": 0.5789,
+      "step": 50
+    },
+    {
+      "epoch": 0.033211557622052475,
+      "grad_norm": 20.038349151611328,
+      "learning_rate": 5.416664391316233e-06,
+      "loss": 0.37,
+      "step": 100
+    },
+    {
+      "epoch": 0.04981733643307871,
+      "grad_norm": 23.927526473999023,
+      "learning_rate": 5.403753430761063e-06,
+      "loss": 0.25,
+      "step": 150
+    },
+    {
+      "epoch": 0.06642311524410495,
+      "grad_norm": 4.1712799072265625,
+      "learning_rate": 5.390842470205893e-06,
+      "loss": 0.1921,
+      "step": 200
+    },
+    {
+      "epoch": 0.08302889405513118,
+      "grad_norm": 6.138601303100586,
+      "learning_rate": 5.3779315096507225e-06,
+      "loss": 0.1365,
+      "step": 250
+    },
+    {
+      "epoch": 0.09963467286615742,
+      "grad_norm": 0.9431160092353821,
+      "learning_rate": 5.3650205490955514e-06,
+      "loss": 0.1473,
+      "step": 300
+    },
+    {
+      "epoch": 0.11624045167718366,
+      "grad_norm": 25.303245544433594,
+      "learning_rate": 5.352109588540381e-06,
+      "loss": 0.0875,
+      "step": 350
+    },
+    {
+      "epoch": 0.1328462304882099,
+      "grad_norm": 14.83379077911377,
+      "learning_rate": 5.33919862798521e-06,
+      "loss": 0.111,
+      "step": 400
+    },
+    {
+      "epoch": 0.14945200929923613,
+      "grad_norm": 0.2346535325050354,
+      "learning_rate": 5.32628766743004e-06,
+      "loss": 0.0722,
+      "step": 450
+    },
+    {
+      "epoch": 0.16605778811026237,
+      "grad_norm": 19.045169830322266,
+      "learning_rate": 5.31337670687487e-06,
+      "loss": 0.1236,
+      "step": 500
+    },
+    {
+      "epoch": 0.1826635669212886,
+      "grad_norm": 10.871609687805176,
+      "learning_rate": 5.300465746319699e-06,
+      "loss": 0.1018,
+      "step": 550
+    },
+    {
+      "epoch": 0.19926934573231483,
+      "grad_norm": 8.278830528259277,
+      "learning_rate": 5.287554785764528e-06,
+      "loss": 0.0608,
+      "step": 600
+    },
+    {
+      "epoch": 0.2158751245433411,
+      "grad_norm": 3.4486818313598633,
+      "learning_rate": 5.274643825209358e-06,
+      "loss": 0.0684,
+      "step": 650
+    },
+    {
+      "epoch": 0.23248090335436733,
+      "grad_norm": 9.789453506469727,
+      "learning_rate": 5.261732864654187e-06,
+      "loss": 0.0826,
+      "step": 700
+    },
+    {
+      "epoch": 0.24908668216539356,
+      "grad_norm": 0.013454285450279713,
+      "learning_rate": 5.248821904099017e-06,
+      "loss": 0.0672,
+      "step": 750
+    },
+    {
+      "epoch": 0.2656924609764198,
+      "grad_norm": 0.8878294825553894,
+      "learning_rate": 5.2359109435438465e-06,
+      "loss": 0.0472,
+      "step": 800
+    },
+    {
+      "epoch": 0.282298239787446,
+      "grad_norm": 15.41006088256836,
+      "learning_rate": 5.222999982988676e-06,
+      "loss": 0.0616,
+      "step": 850
+    },
+    {
+      "epoch": 0.29890401859847227,
+      "grad_norm": 0.04324938729405403,
+      "learning_rate": 5.210089022433506e-06,
+      "loss": 0.0215,
+      "step": 900
+    },
+    {
+      "epoch": 0.3155097974094985,
+      "grad_norm": 0.011849366128444672,
+      "learning_rate": 5.197178061878335e-06,
+      "loss": 0.0398,
+      "step": 950
+    },
+    {
+      "epoch": 0.33211557622052473,
+      "grad_norm": 0.0020897299982607365,
+      "learning_rate": 5.184267101323165e-06,
+      "loss": 0.0294,
+      "step": 1000
+    },
+    {
+      "epoch": 0.348721355031551,
+      "grad_norm": 0.00038467388367280364,
+      "learning_rate": 5.171356140767994e-06,
+      "loss": 0.0328,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3653271338425772,
+      "grad_norm": 0.0022064056247472763,
+      "learning_rate": 5.158445180212823e-06,
+      "loss": 0.0216,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38193291265360346,
+      "grad_norm": 0.012603014707565308,
+      "learning_rate": 5.145534219657653e-06,
+      "loss": 0.0293,
+      "step": 1150
+    },
+    {
+      "epoch": 0.39853869146462967,
+      "grad_norm": 0.002970542525872588,
+      "learning_rate": 5.132623259102483e-06,
+      "loss": 0.0133,
+      "step": 1200
+    },
+    {
+      "epoch": 0.41514447027565593,
+      "grad_norm": 0.09289965778589249,
+      "learning_rate": 5.119712298547312e-06,
+      "loss": 0.0189,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4317502490866822,
+      "grad_norm": 0.030116688460111618,
+      "learning_rate": 5.106801337992142e-06,
+      "loss": 0.0266,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4483560278977084,
+      "grad_norm": 23.291847229003906,
+      "learning_rate": 5.0938903774369705e-06,
+      "loss": 0.0378,
+      "step": 1350
+    },
+    {
+      "epoch": 0.46496180670873466,
+      "grad_norm": 0.00580954784527421,
+      "learning_rate": 5.0809794168818e-06,
+      "loss": 0.0002,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48156758551976087,
+      "grad_norm": 0.0036250711418688297,
+      "learning_rate": 5.06806845632663e-06,
+      "loss": 0.0297,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4981733643307871,
+      "grad_norm": 0.0013630707981064916,
+      "learning_rate": 5.05515749577146e-06,
+      "loss": 0.0114,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5147791431418134,
+      "grad_norm": 0.025447094812989235,
+      "learning_rate": 5.042246535216289e-06,
+      "loss": 0.0019,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5313849219528396,
+      "grad_norm": 18.81841468811035,
+      "learning_rate": 5.0293355746611185e-06,
+      "loss": 0.0286,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5479907007638658,
+      "grad_norm": 0.0033424277789890766,
+      "learning_rate": 5.016424614105948e-06,
+      "loss": 0.0393,
+      "step": 1650
+    },
+    {
+      "epoch": 0.564596479574892,
+      "grad_norm": 0.039123374968767166,
+      "learning_rate": 5.003513653550777e-06,
+      "loss": 0.0186,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5812022583859183,
+      "grad_norm": 0.0005275913863442838,
+      "learning_rate": 4.990602692995607e-06,
+      "loss": 0.0003,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5978080371969445,
+      "grad_norm": 0.005070064682513475,
+      "learning_rate": 4.977691732440437e-06,
+      "loss": 0.01,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6144138160079707,
+      "grad_norm": 0.003932475112378597,
+      "learning_rate": 4.9647807718852664e-06,
+      "loss": 0.0222,
+      "step": 1850
+    },
+    {
+      "epoch": 0.631019594818997,
+      "grad_norm": 0.6544032692909241,
+      "learning_rate": 4.951869811330095e-06,
+      "loss": 0.0138,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6476253736300233,
+      "grad_norm": 0.008768323808908463,
+      "learning_rate": 4.938958850774925e-06,
+      "loss": 0.0056,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6642311524410495,
+      "grad_norm": 0.0021180976182222366,
+      "learning_rate": 4.926047890219754e-06,
+      "loss": 0.0049,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6808369312520757,
+      "grad_norm": 0.002039346843957901,
+      "learning_rate": 4.913136929664584e-06,
+      "loss": 0.0142,
+      "step": 2050
+    },
+    {
+      "epoch": 0.697442710063102,
+      "grad_norm": 0.012900142930448055,
+      "learning_rate": 4.9002259691094136e-06,
+      "loss": 0.0105,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7140484888741282,
+      "grad_norm": 0.0022153747268021107,
+      "learning_rate": 4.887315008554243e-06,
+      "loss": 0.0142,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7306542676851544,
+      "grad_norm": 0.001426122267730534,
+      "learning_rate": 4.874404047999072e-06,
+      "loss": 0.0068,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7472600464961807,
+      "grad_norm": 0.0008603449095971882,
+      "learning_rate": 4.861493087443902e-06,
+      "loss": 0.0119,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7638658253072069,
+      "grad_norm": 0.0006780526018701494,
+      "learning_rate": 4.848582126888731e-06,
+      "loss": 0.0108,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7804716041182331,
+      "grad_norm": 0.014527379535138607,
+      "learning_rate": 4.835671166333561e-06,
+      "loss": 0.0002,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7970773829292593,
+      "grad_norm": 0.00022624376288149506,
+      "learning_rate": 4.8227602057783904e-06,
+      "loss": 0.0092,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8136831617402857,
+      "grad_norm": 0.0044932495802640915,
+      "learning_rate": 4.80984924522322e-06,
+      "loss": 0.0001,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8302889405513119,
+      "grad_norm": 0.0009355309884995222,
+      "learning_rate": 4.79693828466805e-06,
+      "loss": 0.0002,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8468947193623381,
+      "grad_norm": 0.12550997734069824,
+      "learning_rate": 4.784027324112879e-06,
+      "loss": 0.0024,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8635004981733644,
+      "grad_norm": 0.02399071305990219,
+      "learning_rate": 4.771116363557709e-06,
+      "loss": 0.0099,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8801062769843906,
+      "grad_norm": 0.008470265194773674,
+      "learning_rate": 4.7582054030025375e-06,
+      "loss": 0.0157,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8967120557954168,
+      "grad_norm": 3.967735028709285e-05,
+      "learning_rate": 4.745294442447367e-06,
+      "loss": 0.0013,
+      "step": 2700
+    },
+    {
+      "epoch": 0.913317834606443,
+      "grad_norm": 0.0005532742943614721,
+      "learning_rate": 4.732383481892197e-06,
+      "loss": 0.0025,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9299236134174693,
+      "grad_norm": 9.227233022102155e-06,
+      "learning_rate": 4.719472521337027e-06,
+      "loss": 0.0028,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9465293922284955,
+      "grad_norm": 0.280258446931839,
+      "learning_rate": 4.706561560781856e-06,
+      "loss": 0.0004,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9631351710395217,
+      "grad_norm": 27.427757263183594,
+      "learning_rate": 4.6936506002266855e-06,
+      "loss": 0.0127,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9797409498505479,
+      "grad_norm": 176.85423278808594,
+      "learning_rate": 4.680739639671514e-06,
+      "loss": 0.0298,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9963467286615743,
+      "grad_norm": 0.00011263355554547161,
+      "learning_rate": 4.667828679116344e-06,
+      "loss": 0.001,
+      "step": 3000
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.9963024809160306,
+      "eval_f1": 0.9962431632227496,
+      "eval_loss": 0.04071500524878502,
+      "eval_precision": 0.9962693439313673,
+      "eval_recall": 0.9963024809160306,
+      "eval_runtime": 38.0003,
+      "eval_samples_per_second": 220.63,
+      "eval_steps_per_second": 13.789,
+      "step": 3011
+    },
+    {
+      "epoch": 1.0129525074726005,
+      "grad_norm": 0.05092976614832878,
+      "learning_rate": 4.654917718561174e-06,
+      "loss": 0.018,
+      "step": 3050
+    },
+    {
+      "epoch": 1.0295582862836268,
+      "grad_norm": 3.4633874747669324e-05,
+      "learning_rate": 4.642006758006004e-06,
+      "loss": 0.0,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0461640650946529,
+      "grad_norm": 8.058391540544108e-05,
+      "learning_rate": 4.629095797450833e-06,
+      "loss": 0.0,
+      "step": 3150
+    },
+    {
+      "epoch": 1.0627698439056792,
+      "grad_norm": 0.00043129033292643726,
+      "learning_rate": 4.616184836895662e-06,
+      "loss": 0.0,
+      "step": 3200
+    },
+    {
+      "epoch": 1.0793756227167055,
+      "grad_norm": 0.012417804449796677,
+      "learning_rate": 4.603273876340492e-06,
+      "loss": 0.0204,
+      "step": 3250
+    },
+    {
+      "epoch": 1.0959814015277316,
+      "grad_norm": 0.07707448303699493,
+      "learning_rate": 4.590362915785321e-06,
+      "loss": 0.0089,
+      "step": 3300
+    },
+    {
+      "epoch": 1.112587180338758,
+      "grad_norm": 0.0019856118597090244,
+      "learning_rate": 4.577451955230151e-06,
+      "loss": 0.0003,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1291929591497842,
+      "grad_norm": 0.0003844090970233083,
+      "learning_rate": 4.564540994674981e-06,
+      "loss": 0.0,
+      "step": 3400
+    },
+    {
+      "epoch": 1.1457987379608103,
+      "grad_norm": 0.004796341527253389,
+      "learning_rate": 4.55163003411981e-06,
+      "loss": 0.0054,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1624045167718366,
+      "grad_norm": 0.0021394495852291584,
+      "learning_rate": 4.538719073564639e-06,
+      "loss": 0.0001,
+      "step": 3500
+    },
+    {
+      "epoch": 1.1790102955828627,
+      "grad_norm": 0.00016287445032503456,
+      "learning_rate": 4.525808113009469e-06,
+      "loss": 0.0017,
+      "step": 3550
+    },
+    {
+      "epoch": 1.195616074393889,
+      "grad_norm": 0.005753168836236,
+      "learning_rate": 4.512897152454298e-06,
+      "loss": 0.0132,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2122218532049154,
+      "grad_norm": 0.00012519631127361208,
+      "learning_rate": 4.499986191899128e-06,
+      "loss": 0.0,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2288276320159415,
+      "grad_norm": 0.0009526669164188206,
+      "learning_rate": 4.487075231343957e-06,
+      "loss": 0.0083,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2454334108269678,
+      "grad_norm": 6.90124070388265e-05,
+      "learning_rate": 4.474164270788787e-06,
+      "loss": 0.0114,
+      "step": 3750
+    },
+    {
+      "epoch": 1.2620391896379939,
+      "grad_norm": 0.0029422417283058167,
+      "learning_rate": 4.461253310233616e-06,
+      "loss": 0.0001,
+      "step": 3800
+    },
+    {
+      "epoch": 1.2786449684490202,
+      "grad_norm": 1.6564589738845825,
+      "learning_rate": 4.448342349678446e-06,
+      "loss": 0.0065,
+      "step": 3850
+    },
+    {
+      "epoch": 1.2952507472600465,
+      "grad_norm": 4.6906425268389285e-05,
+      "learning_rate": 4.435431389123275e-06,
+      "loss": 0.0,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3118565260710726,
+      "grad_norm": 1.4456440112553537e-05,
+      "learning_rate": 4.4225204285681046e-06,
+      "loss": 0.0,
+      "step": 3950
+    },
+    {
+      "epoch": 1.328462304882099,
+      "grad_norm": 4.6707005822099745e-05,
+      "learning_rate": 4.409609468012934e-06,
+      "loss": 0.0227,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3450680836931252,
+      "grad_norm": 4.7155015636235476e-05,
+      "learning_rate": 4.396698507457763e-06,
+      "loss": 0.0002,
+      "step": 4050
+    },
+    {
+      "epoch": 1.3616738625041513,
+      "grad_norm": 0.01696430891752243,
+      "learning_rate": 4.383787546902593e-06,
+      "loss": 0.0188,
+      "step": 4100
+    },
+    {
+      "epoch": 1.3782796413151777,
+      "grad_norm": 0.0008329456904903054,
+      "learning_rate": 4.370876586347423e-06,
+      "loss": 0.0178,
+      "step": 4150
+    },
+    {
+      "epoch": 1.394885420126204,
+      "grad_norm": 9.179511835100129e-05,
+      "learning_rate": 4.3579656257922525e-06,
+      "loss": 0.0,
+      "step": 4200
+    },
+    {
+      "epoch": 1.41149119893723,
+      "grad_norm": 2.924172622442711e-05,
+      "learning_rate": 4.3450546652370814e-06,
+      "loss": 0.0013,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4280969777482564,
+      "grad_norm": 0.015076125971972942,
+      "learning_rate": 4.332143704681911e-06,
+      "loss": 0.0104,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4447027565592827,
+      "grad_norm": 5.385762415244244e-05,
+      "learning_rate": 4.31923274412674e-06,
+      "loss": 0.014,
+      "step": 4350
+    },
+    {
+      "epoch": 1.4613085353703088,
+      "grad_norm": 0.0007110639126040041,
+      "learning_rate": 4.30632178357157e-06,
+      "loss": 0.0126,
+      "step": 4400
+    },
+    {
+      "epoch": 1.4779143141813351,
+      "grad_norm": 0.00014339391782414168,
+      "learning_rate": 4.2934108230164e-06,
+      "loss": 0.0003,
+      "step": 4450
+    },
+    {
+      "epoch": 1.4945200929923614,
+      "grad_norm": 0.0006024091853760183,
+      "learning_rate": 4.280499862461229e-06,
+      "loss": 0.0118,
+      "step": 4500
+    },
+    {
+      "epoch": 1.5111258718033875,
+      "grad_norm": 0.0002353072923142463,
+      "learning_rate": 4.267588901906058e-06,
+      "loss": 0.0086,
+      "step": 4550
+    },
+    {
+      "epoch": 1.5277316506144138,
+      "grad_norm": 0.0008946498855948448,
+      "learning_rate": 4.254677941350888e-06,
+      "loss": 0.0,
+      "step": 4600
+    },
+    {
+      "epoch": 1.5443374294254402,
+      "grad_norm": 7.315174298128113e-05,
+      "learning_rate": 4.241766980795717e-06,
+      "loss": 0.0003,
+      "step": 4650
+    },
+    {
+      "epoch": 1.5609432082364663,
+      "grad_norm": 9.232313459506258e-05,
+      "learning_rate": 4.228856020240547e-06,
+      "loss": 0.0001,
+      "step": 4700
+    },
+    {
+      "epoch": 1.5775489870474926,
+      "grad_norm": 1.4020029084349517e-05,
+      "learning_rate": 4.2159450596853765e-06,
+      "loss": 0.0,
+      "step": 4750
+    },
+    {
+      "epoch": 1.594154765858519,
+      "grad_norm": 4.0607475966680795e-05,
+      "learning_rate": 4.203034099130206e-06,
+      "loss": 0.0,
+      "step": 4800
+    },
+    {
+      "epoch": 1.610760544669545,
+      "grad_norm": 4.69290571345482e-05,
+      "learning_rate": 4.190123138575036e-06,
+      "loss": 0.0177,
+      "step": 4850
+    },
+    {
+      "epoch": 1.627366323480571,
+      "grad_norm": 0.14096687734127045,
+      "learning_rate": 4.177212178019865e-06,
+      "loss": 0.0115,
+      "step": 4900
+    },
+    {
+      "epoch": 1.6439721022915976,
+      "grad_norm": 0.00020342542848084122,
+      "learning_rate": 4.164301217464695e-06,
+      "loss": 0.0001,
+      "step": 4950
+    },
+    {
+      "epoch": 1.6605778811026237,
+      "grad_norm": 0.0002786288969218731,
+      "learning_rate": 4.151390256909524e-06,
+      "loss": 0.0,
+      "step": 5000
+    },
+    {
+      "epoch": 1.6771836599136498,
+      "grad_norm": 2.8438846129574813e-05,
+      "learning_rate": 4.138479296354353e-06,
+      "loss": 0.0032,
+      "step": 5050
+    },
+    {
+      "epoch": 1.6937894387246761,
+      "grad_norm": 5.944320037087891e-06,
+      "learning_rate": 4.125568335799183e-06,
+      "loss": 0.0001,
+      "step": 5100
+    },
+    {
+      "epoch": 1.7103952175357025,
+      "grad_norm": 0.005958211608231068,
+      "learning_rate": 4.112657375244013e-06,
+      "loss": 0.0,
+      "step": 5150
+    },
+    {
+      "epoch": 1.7270009963467285,
+      "grad_norm": 0.002004456939175725,
+      "learning_rate": 4.099746414688842e-06,
+      "loss": 0.0106,
+      "step": 5200
+    },
+    {
+      "epoch": 1.7436067751577549,
+      "grad_norm": 0.0008562383009120822,
+      "learning_rate": 4.086835454133672e-06,
+      "loss": 0.0081,
+      "step": 5250
+    },
+    {
+      "epoch": 1.7602125539687812,
+      "grad_norm": 0.03570560738444328,
+      "learning_rate": 4.0739244935785005e-06,
+      "loss": 0.025,
+      "step": 5300
+    },
+    {
+      "epoch": 1.7768183327798073,
+      "grad_norm": 0.001486024702899158,
+      "learning_rate": 4.06101353302333e-06,
+      "loss": 0.0145,
+      "step": 5350
+    },
+    {
+      "epoch": 1.7934241115908336,
+      "grad_norm": 0.0015331929316744208,
+      "learning_rate": 4.04810257246816e-06,
+      "loss": 0.0001,
+      "step": 5400
+    },
+    {
+      "epoch": 1.81002989040186,
+      "grad_norm": 0.004162834957242012,
+      "learning_rate": 4.03519161191299e-06,
+      "loss": 0.0005,
+      "step": 5450
+    },
+    {
+      "epoch": 1.826635669212886,
+      "grad_norm": 0.0003064811462536454,
+      "learning_rate": 4.022280651357819e-06,
+      "loss": 0.0,
+      "step": 5500
+    },
+    {
+      "epoch": 1.8432414480239123,
+      "grad_norm": 0.000830256671179086,
+      "learning_rate": 4.0093696908026485e-06,
+      "loss": 0.0034,
+      "step": 5550
+    },
+    {
+      "epoch": 1.8598472268349386,
+      "grad_norm": 0.001540405093692243,
+      "learning_rate": 3.996458730247478e-06,
+      "loss": 0.0,
+      "step": 5600
+    },
+    {
+      "epoch": 1.8764530056459647,
+      "grad_norm": 0.011221639811992645,
+      "learning_rate": 3.983547769692307e-06,
+      "loss": 0.0116,
+      "step": 5650
+    },
+    {
+      "epoch": 1.893058784456991,
+      "grad_norm": 0.0031693174969404936,
+      "learning_rate": 3.970636809137137e-06,
+      "loss": 0.0061,
+      "step": 5700
+    },
+    {
+      "epoch": 1.9096645632680174,
+      "grad_norm": 7.828649540897459e-05,
+      "learning_rate": 3.957725848581967e-06,
+      "loss": 0.0,
+      "step": 5750
+    },
+    {
+      "epoch": 1.9262703420790435,
+      "grad_norm": 0.00892726145684719,
+      "learning_rate": 3.9448148880267964e-06,
+      "loss": 0.0003,
+      "step": 5800
+    },
+    {
+      "epoch": 1.9428761208900698,
+      "grad_norm": 0.0033830904867500067,
+      "learning_rate": 3.931903927471625e-06,
+      "loss": 0.0007,
+      "step": 5850
+    },
+    {
+      "epoch": 1.959481899701096,
+      "grad_norm": 0.017441514879465103,
+      "learning_rate": 3.918992966916455e-06,
+      "loss": 0.0109,
+      "step": 5900
+    },
+    {
+      "epoch": 1.9760876785121222,
+      "grad_norm": 0.006790176033973694,
+      "learning_rate": 3.906082006361284e-06,
+      "loss": 0.0101,
+      "step": 5950
+    },
+    {
+      "epoch": 1.9926934573231485,
+      "grad_norm": 0.0004248483164701611,
+      "learning_rate": 3.893171045806114e-06,
+      "loss": 0.0103,
+      "step": 6000
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.9959446564885496,
+      "eval_f1": 0.9958827988724177,
+      "eval_loss": 0.031979888677597046,
+      "eval_precision": 0.9958978797187497,
+      "eval_recall": 0.9959446564885496,
+      "eval_runtime": 37.4063,
+      "eval_samples_per_second": 224.134,
+      "eval_steps_per_second": 14.008,
+      "step": 6022
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 21077,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 7,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.282861088518144e+16,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

trial-2/checkpoint-6022/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9657a8731817c986f017540c64090098467c35e79328bfa7cab093c33da6a8e9
+size 5368

trial-3/checkpoint-1506/config.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}

trial-3/checkpoint-1506/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:577af3b8b0a6d7db7f2ff1054a5c4c43704103dd0ed797800f9d9582a3237033
+size 598439784

trial-3/checkpoint-1506/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:309810681fe0458054a9e76c6bfbb6fc2862ae83f89b084906874442e8913f57
+size 1196967418

trial-3/checkpoint-1506/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:568428d80a25211a390c359ca51b0b20b38ca0607fbc196f106c9841c02d3e59
+size 14244

trial-3/checkpoint-1506/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:77511df67542c270c7a8ed9a3ae9f0a88d6822756582e31cb89e7ee9b503abfb
+size 1064

trial-3/checkpoint-1506/trainer_state.json ADDED Viewed

	@@ -0,0 +1,255 @@

+{
+  "best_metric": 0.03509189188480377,
+  "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-3/checkpoint-1506",
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1506,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.033200531208499334,
+      "grad_norm": 6.976862907409668,
+      "learning_rate": 2.8972663455552343e-06,
+      "loss": 0.5378,
+      "step": 50
+    },
+    {
+      "epoch": 0.06640106241699867,
+      "grad_norm": 3.674832344055176,
+      "learning_rate": 2.8648439379281615e-06,
+      "loss": 0.3375,
+      "step": 100
+    },
+    {
+      "epoch": 0.099601593625498,
+      "grad_norm": 2.678229570388794,
+      "learning_rate": 2.8324215303010886e-06,
+      "loss": 0.2213,
+      "step": 150
+    },
+    {
+      "epoch": 0.13280212483399734,
+      "grad_norm": 6.4370551109313965,
+      "learning_rate": 2.7999991226740153e-06,
+      "loss": 0.1558,
+      "step": 200
+    },
+    {
+      "epoch": 0.16600265604249667,
+      "grad_norm": 6.4544525146484375,
+      "learning_rate": 2.767576715046943e-06,
+      "loss": 0.1457,
+      "step": 250
+    },
+    {
+      "epoch": 0.199203187250996,
+      "grad_norm": 2.4753177165985107,
+      "learning_rate": 2.7351543074198696e-06,
+      "loss": 0.1349,
+      "step": 300
+    },
+    {
+      "epoch": 0.23240371845949534,
+      "grad_norm": 3.116945743560791,
+      "learning_rate": 2.7027318997927968e-06,
+      "loss": 0.1144,
+      "step": 350
+    },
+    {
+      "epoch": 0.2656042496679947,
+      "grad_norm": 10.000889778137207,
+      "learning_rate": 2.670309492165724e-06,
+      "loss": 0.0942,
+      "step": 400
+    },
+    {
+      "epoch": 0.29880478087649404,
+      "grad_norm": 0.3915446996688843,
+      "learning_rate": 2.637887084538651e-06,
+      "loss": 0.0841,
+      "step": 450
+    },
+    {
+      "epoch": 0.33200531208499334,
+      "grad_norm": 0.7093335390090942,
+      "learning_rate": 2.605464676911578e-06,
+      "loss": 0.0815,
+      "step": 500
+    },
+    {
+      "epoch": 0.3652058432934927,
+      "grad_norm": 5.660763263702393,
+      "learning_rate": 2.5730422692845053e-06,
+      "loss": 0.058,
+      "step": 550
+    },
+    {
+      "epoch": 0.398406374501992,
+      "grad_norm": 9.372917175292969,
+      "learning_rate": 2.5406198616574325e-06,
+      "loss": 0.0521,
+      "step": 600
+    },
+    {
+      "epoch": 0.4316069057104914,
+      "grad_norm": 6.086747169494629,
+      "learning_rate": 2.5081974540303596e-06,
+      "loss": 0.0671,
+      "step": 650
+    },
+    {
+      "epoch": 0.4648074369189907,
+      "grad_norm": 5.661391735076904,
+      "learning_rate": 2.4757750464032863e-06,
+      "loss": 0.0354,
+      "step": 700
+    },
+    {
+      "epoch": 0.49800796812749004,
+      "grad_norm": 1.4707638025283813,
+      "learning_rate": 2.443352638776214e-06,
+      "loss": 0.0386,
+      "step": 750
+    },
+    {
+      "epoch": 0.5312084993359893,
+      "grad_norm": 7.550576686859131,
+      "learning_rate": 2.4109302311491406e-06,
+      "loss": 0.0363,
+      "step": 800
+    },
+    {
+      "epoch": 0.5644090305444888,
+      "grad_norm": 11.072442054748535,
+      "learning_rate": 2.3785078235220678e-06,
+      "loss": 0.0254,
+      "step": 850
+    },
+    {
+      "epoch": 0.5976095617529881,
+      "grad_norm": 0.3040500581264496,
+      "learning_rate": 2.346085415894995e-06,
+      "loss": 0.018,
+      "step": 900
+    },
+    {
+      "epoch": 0.6308100929614874,
+      "grad_norm": 11.503410339355469,
+      "learning_rate": 2.313663008267922e-06,
+      "loss": 0.0302,
+      "step": 950
+    },
+    {
+      "epoch": 0.6640106241699867,
+      "grad_norm": 0.7599239945411682,
+      "learning_rate": 2.281240600640849e-06,
+      "loss": 0.0267,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6972111553784861,
+      "grad_norm": 0.21025581657886505,
+      "learning_rate": 2.2488181930137764e-06,
+      "loss": 0.0211,
+      "step": 1050
+    },
+    {
+      "epoch": 0.7304116865869854,
+      "grad_norm": 11.052717208862305,
+      "learning_rate": 2.2163957853867035e-06,
+      "loss": 0.0112,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7636122177954847,
+      "grad_norm": 0.0778539627790451,
+      "learning_rate": 2.1839733777596302e-06,
+      "loss": 0.0212,
+      "step": 1150
+    },
+    {
+      "epoch": 0.796812749003984,
+      "grad_norm": 0.050592467188835144,
+      "learning_rate": 2.151550970132558e-06,
+      "loss": 0.0082,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8300132802124834,
+      "grad_norm": 0.04680703952908516,
+      "learning_rate": 2.1191285625054845e-06,
+      "loss": 0.008,
+      "step": 1250
+    },
+    {
+      "epoch": 0.8632138114209827,
+      "grad_norm": 127.69743347167969,
+      "learning_rate": 2.0867061548784117e-06,
+      "loss": 0.0192,
+      "step": 1300
+    },
+    {
+      "epoch": 0.896414342629482,
+      "grad_norm": 0.013791153207421303,
+      "learning_rate": 2.0542837472513392e-06,
+      "loss": 0.0063,
+      "step": 1350
+    },
+    {
+      "epoch": 0.9296148738379814,
+      "grad_norm": 0.011688283644616604,
+      "learning_rate": 2.021861339624266e-06,
+      "loss": 0.0068,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9628154050464808,
+      "grad_norm": 14.885448455810547,
+      "learning_rate": 1.989438931997193e-06,
+      "loss": 0.004,
+      "step": 1450
+    },
+    {
+      "epoch": 0.9960159362549801,
+      "grad_norm": 0.38216766715049744,
+      "learning_rate": 1.9570165243701202e-06,
+      "loss": 0.0069,
+      "step": 1500
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.992604961832061,
+      "eval_f1": 0.9926480803352735,
+      "eval_loss": 0.03509189188480377,
+      "eval_precision": 0.9927020529431649,
+      "eval_recall": 0.992604961832061,
+      "eval_runtime": 31.6693,
+      "eval_samples_per_second": 264.736,
+      "eval_steps_per_second": 8.273,
+      "step": 1506
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 4518,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.641430544259072e+16,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

trial-3/checkpoint-1506/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1ed06b7fefd178dad53ae3fef61fd304580c1d532a37d5010e58ca8f39e302fa
+size 5368

trial-4/checkpoint-3011/config.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}

trial-4/checkpoint-3011/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6998cd19c83cb7aad4574fdf2f2d1d911f7f01e8d94fcb558dc40e5561e3d188
+size 598439784

trial-4/checkpoint-3011/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a0517bca24af0d5ed5988e5100a9e9f6f59df1b0d3e7ca53764baa7878d5d5e3
+size 1196967418

trial-4/checkpoint-3011/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:568428d80a25211a390c359ca51b0b20b38ca0607fbc196f106c9841c02d3e59
+size 14244

trial-4/checkpoint-3011/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c0dbc7f9aff9e32282e3dcfb80127104b5c3d0089b59d9cb1b981e6af6f8c41
+size 1064

trial-4/checkpoint-3011/trainer_state.json ADDED Viewed

	@@ -0,0 +1,465 @@

+{
+  "best_metric": 0.02325253002345562,
+  "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-4/checkpoint-3011",
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3011,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016605778811026237,
+      "grad_norm": 7.4845476150512695,
+      "learning_rate": 1.3209406688296726e-05,
+      "loss": 0.427,
+      "step": 50
+    },
+    {
+      "epoch": 0.033211557622052475,
+      "grad_norm": 8.739913940429688,
+      "learning_rate": 1.3184989137392264e-05,
+      "loss": 0.2079,
+      "step": 100
+    },
+    {
+      "epoch": 0.04981733643307871,
+      "grad_norm": 10.918631553649902,
+      "learning_rate": 1.31605715864878e-05,
+      "loss": 0.1374,
+      "step": 150
+    },
+    {
+      "epoch": 0.06642311524410495,
+      "grad_norm": 0.09207049757242203,
+      "learning_rate": 1.3136154035583336e-05,
+      "loss": 0.0971,
+      "step": 200
+    },
+    {
+      "epoch": 0.08302889405513118,
+      "grad_norm": 0.1270512193441391,
+      "learning_rate": 1.3111736484678873e-05,
+      "loss": 0.0431,
+      "step": 250
+    },
+    {
+      "epoch": 0.09963467286615742,
+      "grad_norm": 0.01078485231846571,
+      "learning_rate": 1.3087318933774408e-05,
+      "loss": 0.0679,
+      "step": 300
+    },
+    {
+      "epoch": 0.11624045167718366,
+      "grad_norm": 0.16803160309791565,
+      "learning_rate": 1.3062901382869945e-05,
+      "loss": 0.0364,
+      "step": 350
+    },
+    {
+      "epoch": 0.1328462304882099,
+      "grad_norm": 0.2863476872444153,
+      "learning_rate": 1.303848383196548e-05,
+      "loss": 0.0802,
+      "step": 400
+    },
+    {
+      "epoch": 0.14945200929923613,
+      "grad_norm": 0.018498318269848824,
+      "learning_rate": 1.3014066281061019e-05,
+      "loss": 0.0324,
+      "step": 450
+    },
+    {
+      "epoch": 0.16605778811026237,
+      "grad_norm": 12.099262237548828,
+      "learning_rate": 1.2989648730156554e-05,
+      "loss": 0.0567,
+      "step": 500
+    },
+    {
+      "epoch": 0.1826635669212886,
+      "grad_norm": 0.04201498255133629,
+      "learning_rate": 1.296523117925209e-05,
+      "loss": 0.0265,
+      "step": 550
+    },
+    {
+      "epoch": 0.19926934573231483,
+      "grad_norm": 13.225788116455078,
+      "learning_rate": 1.2940813628347628e-05,
+      "loss": 0.027,
+      "step": 600
+    },
+    {
+      "epoch": 0.2158751245433411,
+      "grad_norm": 2.1863136291503906,
+      "learning_rate": 1.2916396077443163e-05,
+      "loss": 0.0325,
+      "step": 650
+    },
+    {
+      "epoch": 0.23248090335436733,
+      "grad_norm": 0.0031948979012668133,
+      "learning_rate": 1.28919785265387e-05,
+      "loss": 0.0378,
+      "step": 700
+    },
+    {
+      "epoch": 0.24908668216539356,
+      "grad_norm": 0.0001850352855399251,
+      "learning_rate": 1.2867560975634237e-05,
+      "loss": 0.0242,
+      "step": 750
+    },
+    {
+      "epoch": 0.2656924609764198,
+      "grad_norm": 0.0007033672300167382,
+      "learning_rate": 1.2843143424729772e-05,
+      "loss": 0.0306,
+      "step": 800
+    },
+    {
+      "epoch": 0.282298239787446,
+      "grad_norm": 13.938993453979492,
+      "learning_rate": 1.2818725873825309e-05,
+      "loss": 0.0458,
+      "step": 850
+    },
+    {
+      "epoch": 0.29890401859847227,
+      "grad_norm": 0.02099405601620674,
+      "learning_rate": 1.2794308322920844e-05,
+      "loss": 0.0306,
+      "step": 900
+    },
+    {
+      "epoch": 0.3155097974094985,
+      "grad_norm": 0.024268606677651405,
+      "learning_rate": 1.2769890772016383e-05,
+      "loss": 0.0142,
+      "step": 950
+    },
+    {
+      "epoch": 0.33211557622052473,
+      "grad_norm": 0.004759958013892174,
+      "learning_rate": 1.2745473221111918e-05,
+      "loss": 0.0141,
+      "step": 1000
+    },
+    {
+      "epoch": 0.348721355031551,
+      "grad_norm": 0.0019629066810011864,
+      "learning_rate": 1.2721055670207453e-05,
+      "loss": 0.0345,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3653271338425772,
+      "grad_norm": 0.00019358922145329416,
+      "learning_rate": 1.2696638119302992e-05,
+      "loss": 0.0089,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38193291265360346,
+      "grad_norm": 0.0028237327933311462,
+      "learning_rate": 1.2672220568398527e-05,
+      "loss": 0.0239,
+      "step": 1150
+    },
+    {
+      "epoch": 0.39853869146462967,
+      "grad_norm": 0.00010467255196999758,
+      "learning_rate": 1.2647803017494064e-05,
+      "loss": 0.0094,
+      "step": 1200
+    },
+    {
+      "epoch": 0.41514447027565593,
+      "grad_norm": 0.05774892866611481,
+      "learning_rate": 1.26233854665896e-05,
+      "loss": 0.0246,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4317502490866822,
+      "grad_norm": 0.024394717067480087,
+      "learning_rate": 1.2598967915685136e-05,
+      "loss": 0.0328,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4483560278977084,
+      "grad_norm": 2.231964349746704,
+      "learning_rate": 1.2574550364780673e-05,
+      "loss": 0.0204,
+      "step": 1350
+    },
+    {
+      "epoch": 0.46496180670873466,
+      "grad_norm": 0.0014322358183562756,
+      "learning_rate": 1.2550132813876208e-05,
+      "loss": 0.0001,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48156758551976087,
+      "grad_norm": 0.001744006876833737,
+      "learning_rate": 1.2525715262971747e-05,
+      "loss": 0.0392,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4981733643307871,
+      "grad_norm": 0.027050139382481575,
+      "learning_rate": 1.2501297712067282e-05,
+      "loss": 0.0151,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5147791431418134,
+      "grad_norm": 0.0001924823591252789,
+      "learning_rate": 1.2476880161162817e-05,
+      "loss": 0.0036,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5313849219528396,
+      "grad_norm": 4.767300128936768,
+      "learning_rate": 1.2452462610258356e-05,
+      "loss": 0.0148,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5479907007638658,
+      "grad_norm": 0.0022574588656425476,
+      "learning_rate": 1.242804505935389e-05,
+      "loss": 0.0384,
+      "step": 1650
+    },
+    {
+      "epoch": 0.564596479574892,
+      "grad_norm": 0.12995891273021698,
+      "learning_rate": 1.2403627508449428e-05,
+      "loss": 0.018,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5812022583859183,
+      "grad_norm": 0.0005374422180466354,
+      "learning_rate": 1.2379209957544964e-05,
+      "loss": 0.0039,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5978080371969445,
+      "grad_norm": 0.004592420998960733,
+      "learning_rate": 1.23547924066405e-05,
+      "loss": 0.0136,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6144138160079707,
+      "grad_norm": 0.0008812470478005707,
+      "learning_rate": 1.2330374855736037e-05,
+      "loss": 0.0167,
+      "step": 1850
+    },
+    {
+      "epoch": 0.631019594818997,
+      "grad_norm": 28.337797164916992,
+      "learning_rate": 1.2305957304831572e-05,
+      "loss": 0.0098,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6476253736300233,
+      "grad_norm": 0.0003208396374247968,
+      "learning_rate": 1.228153975392711e-05,
+      "loss": 0.0083,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6642311524410495,
+      "grad_norm": 0.004917904268950224,
+      "learning_rate": 1.2257122203022646e-05,
+      "loss": 0.012,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6808369312520757,
+      "grad_norm": 0.0006444657919928432,
+      "learning_rate": 1.2232704652118182e-05,
+      "loss": 0.0006,
+      "step": 2050
+    },
+    {
+      "epoch": 0.697442710063102,
+      "grad_norm": 0.00020880017837043852,
+      "learning_rate": 1.220828710121372e-05,
+      "loss": 0.0169,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7140484888741282,
+      "grad_norm": 0.009818737395107746,
+      "learning_rate": 1.2183869550309254e-05,
+      "loss": 0.0143,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7306542676851544,
+      "grad_norm": 0.0009041284793056548,
+      "learning_rate": 1.2159451999404791e-05,
+      "loss": 0.0026,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7472600464961807,
+      "grad_norm": 2.3109569549560547,
+      "learning_rate": 1.2135034448500328e-05,
+      "loss": 0.0062,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7638658253072069,
+      "grad_norm": 9.242107807949651e-06,
+      "learning_rate": 1.2110616897595863e-05,
+      "loss": 0.0029,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7804716041182331,
+      "grad_norm": 0.00020709235104732215,
+      "learning_rate": 1.20861993466914e-05,
+      "loss": 0.0,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7970773829292593,
+      "grad_norm": 0.0008476360817439854,
+      "learning_rate": 1.2061781795786937e-05,
+      "loss": 0.019,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8136831617402857,
+      "grad_norm": 0.0002165739715564996,
+      "learning_rate": 1.2037364244882474e-05,
+      "loss": 0.0,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8302889405513119,
+      "grad_norm": 0.029956847429275513,
+      "learning_rate": 1.201294669397801e-05,
+      "loss": 0.0012,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8468947193623381,
+      "grad_norm": 0.0002400112134637311,
+      "learning_rate": 1.1988529143073546e-05,
+      "loss": 0.0191,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8635004981733644,
+      "grad_norm": 0.0070993551053106785,
+      "learning_rate": 1.1964111592169083e-05,
+      "loss": 0.0155,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8801062769843906,
+      "grad_norm": 5.127764234202914e-05,
+      "learning_rate": 1.1939694041264618e-05,
+      "loss": 0.0185,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8967120557954168,
+      "grad_norm": 0.056577421724796295,
+      "learning_rate": 1.1915276490360155e-05,
+      "loss": 0.0063,
+      "step": 2700
+    },
+    {
+      "epoch": 0.913317834606443,
+      "grad_norm": 4.399678437039256e-05,
+      "learning_rate": 1.1890858939455692e-05,
+      "loss": 0.012,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9299236134174693,
+      "grad_norm": 6.6589759626367595e-06,
+      "learning_rate": 1.1866441388551227e-05,
+      "loss": 0.0001,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9465293922284955,
+      "grad_norm": 0.009270718321204185,
+      "learning_rate": 1.1842023837646764e-05,
+      "loss": 0.0001,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9631351710395217,
+      "grad_norm": 6.743930339813232,
+      "learning_rate": 1.1817606286742301e-05,
+      "loss": 0.0019,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9797409498505479,
+      "grad_norm": 10.679564476013184,
+      "learning_rate": 1.1793188735837838e-05,
+      "loss": 0.0258,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9963467286615743,
+      "grad_norm": 0.0007653234642930329,
+      "learning_rate": 1.1768771184933373e-05,
+      "loss": 0.0018,
+      "step": 3000
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.997256679389313,
+      "eval_f1": 0.9972464717374746,
+      "eval_loss": 0.02325253002345562,
+      "eval_precision": 0.997240941740882,
+      "eval_recall": 0.997256679389313,
+      "eval_runtime": 36.6991,
+      "eval_samples_per_second": 228.453,
+      "eval_steps_per_second": 14.278,
+      "step": 3011
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 27099,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.641430544259072e+16,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

trial-4/checkpoint-3011/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89fb66224a4a1dbc68c030610c33a1d3f64ca676b2064b388b8e2a7385785f5d
+size 5368

trial-5/checkpoint-3012/config.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}

trial-5/checkpoint-3012/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:49ba330b843aca1a1d0454785b900ed96671619efb6df36ea614d0870f5ef2aa
+size 598439784

trial-5/checkpoint-3012/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:60fb304abd0c5b9d4e6de61faca1856b99e71865a5c592f8acaa47567b9139d9
+size 1196967418

trial-5/checkpoint-3012/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
+size 14244

trial-5/checkpoint-3012/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4c69ec29ae0867d661613f53dea74fb003b51f72db6450102f05c6dfa235171f
+size 1064

trial-5/checkpoint-3012/trainer_state.json ADDED Viewed

	@@ -0,0 +1,477 @@

+{
+  "best_metric": 0.0418265163898468,
+  "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-5/checkpoint-3012",
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 3012,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.033200531208499334,
+      "grad_norm": 6.311530113220215,
+      "learning_rate": 1.279094112727349e-06,
+      "loss": 0.7104,
+      "step": 50
+    },
+    {
+      "epoch": 0.06640106241699867,
+      "grad_norm": 17.497058868408203,
+      "learning_rate": 1.2748333062225943e-06,
+      "loss": 0.5729,
+      "step": 100
+    },
+    {
+      "epoch": 0.099601593625498,
+      "grad_norm": 7.590151309967041,
+      "learning_rate": 1.2705724997178397e-06,
+      "loss": 0.4714,
+      "step": 150
+    },
+    {
+      "epoch": 0.13280212483399734,
+      "grad_norm": 6.96728515625,
+      "learning_rate": 1.2663116932130851e-06,
+      "loss": 0.3881,
+      "step": 200
+    },
+    {
+      "epoch": 0.16600265604249667,
+      "grad_norm": 4.9838714599609375,
+      "learning_rate": 1.2620508867083303e-06,
+      "loss": 0.3194,
+      "step": 250
+    },
+    {
+      "epoch": 0.199203187250996,
+      "grad_norm": 6.317371368408203,
+      "learning_rate": 1.2577900802035758e-06,
+      "loss": 0.2976,
+      "step": 300
+    },
+    {
+      "epoch": 0.23240371845949534,
+      "grad_norm": 15.331583023071289,
+      "learning_rate": 1.2535292736988212e-06,
+      "loss": 0.2392,
+      "step": 350
+    },
+    {
+      "epoch": 0.2656042496679947,
+      "grad_norm": 15.493165016174316,
+      "learning_rate": 1.2492684671940664e-06,
+      "loss": 0.2337,
+      "step": 400
+    },
+    {
+      "epoch": 0.29880478087649404,
+      "grad_norm": 3.7081472873687744,
+      "learning_rate": 1.2450076606893118e-06,
+      "loss": 0.2037,
+      "step": 450
+    },
+    {
+      "epoch": 0.33200531208499334,
+      "grad_norm": 4.029483318328857,
+      "learning_rate": 1.240746854184557e-06,
+      "loss": 0.2054,
+      "step": 500
+    },
+    {
+      "epoch": 0.3652058432934927,
+      "grad_norm": 4.573270797729492,
+      "learning_rate": 1.2364860476798024e-06,
+      "loss": 0.1555,
+      "step": 550
+    },
+    {
+      "epoch": 0.398406374501992,
+      "grad_norm": 15.748998641967773,
+      "learning_rate": 1.2322252411750478e-06,
+      "loss": 0.1486,
+      "step": 600
+    },
+    {
+      "epoch": 0.4316069057104914,
+      "grad_norm": 12.240307807922363,
+      "learning_rate": 1.227964434670293e-06,
+      "loss": 0.1552,
+      "step": 650
+    },
+    {
+      "epoch": 0.4648074369189907,
+      "grad_norm": 17.192546844482422,
+      "learning_rate": 1.2237036281655385e-06,
+      "loss": 0.1234,
+      "step": 700
+    },
+    {
+      "epoch": 0.49800796812749004,
+      "grad_norm": 11.04953670501709,
+      "learning_rate": 1.2194428216607839e-06,
+      "loss": 0.1212,
+      "step": 750
+    },
+    {
+      "epoch": 0.5312084993359893,
+      "grad_norm": 4.883615016937256,
+      "learning_rate": 1.215182015156029e-06,
+      "loss": 0.1059,
+      "step": 800
+    },
+    {
+      "epoch": 0.5644090305444888,
+      "grad_norm": 4.633565425872803,
+      "learning_rate": 1.2109212086512745e-06,
+      "loss": 0.0788,
+      "step": 850
+    },
+    {
+      "epoch": 0.5976095617529881,
+      "grad_norm": 2.6228833198547363,
+      "learning_rate": 1.20666040214652e-06,
+      "loss": 0.087,
+      "step": 900
+    },
+    {
+      "epoch": 0.6308100929614874,
+      "grad_norm": 6.4782915115356445,
+      "learning_rate": 1.2023995956417651e-06,
+      "loss": 0.0802,
+      "step": 950
+    },
+    {
+      "epoch": 0.6640106241699867,
+      "grad_norm": 5.229304313659668,
+      "learning_rate": 1.1981387891370103e-06,
+      "loss": 0.077,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6972111553784861,
+      "grad_norm": 6.034313201904297,
+      "learning_rate": 1.1938779826322558e-06,
+      "loss": 0.0703,
+      "step": 1050
+    },
+    {
+      "epoch": 0.7304116865869854,
+      "grad_norm": 9.29736614227295,
+      "learning_rate": 1.1896171761275012e-06,
+      "loss": 0.066,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7636122177954847,
+      "grad_norm": 0.6172637343406677,
+      "learning_rate": 1.1853563696227464e-06,
+      "loss": 0.0692,
+      "step": 1150
+    },
+    {
+      "epoch": 0.796812749003984,
+      "grad_norm": 1.642548680305481,
+      "learning_rate": 1.1810955631179918e-06,
+      "loss": 0.0437,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8300132802124834,
+      "grad_norm": 3.888737916946411,
+      "learning_rate": 1.176834756613237e-06,
+      "loss": 0.0474,
+      "step": 1250
+    },
+    {
+      "epoch": 0.8632138114209827,
+      "grad_norm": 14.787779808044434,
+      "learning_rate": 1.1725739501084824e-06,
+      "loss": 0.0501,
+      "step": 1300
+    },
+    {
+      "epoch": 0.896414342629482,
+      "grad_norm": 0.8571153283119202,
+      "learning_rate": 1.1683131436037278e-06,
+      "loss": 0.0439,
+      "step": 1350
+    },
+    {
+      "epoch": 0.9296148738379814,
+      "grad_norm": 0.6915457248687744,
+      "learning_rate": 1.164052337098973e-06,
+      "loss": 0.0455,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9628154050464808,
+      "grad_norm": 8.8081636428833,
+      "learning_rate": 1.1597915305942185e-06,
+      "loss": 0.0347,
+      "step": 1450
+    },
+    {
+      "epoch": 0.9960159362549801,
+      "grad_norm": 8.551522254943848,
+      "learning_rate": 1.1555307240894639e-06,
+      "loss": 0.0346,
+      "step": 1500
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.982824427480916,
+      "eval_f1": 0.9838970307302017,
+      "eval_loss": 0.05475565418601036,
+      "eval_precision": 0.986134299459291,
+      "eval_recall": 0.982824427480916,
+      "eval_runtime": 31.8933,
+      "eval_samples_per_second": 262.877,
+      "eval_steps_per_second": 8.215,
+      "step": 1506
+    },
+    {
+      "epoch": 1.0292164674634794,
+      "grad_norm": 13.078969955444336,
+      "learning_rate": 1.151269917584709e-06,
+      "loss": 0.0379,
+      "step": 1550
+    },
+    {
+      "epoch": 1.0624169986719787,
+      "grad_norm": 1.906078815460205,
+      "learning_rate": 1.1470091110799545e-06,
+      "loss": 0.0338,
+      "step": 1600
+    },
+    {
+      "epoch": 1.095617529880478,
+      "grad_norm": 0.4020080864429474,
+      "learning_rate": 1.1427483045752e-06,
+      "loss": 0.0298,
+      "step": 1650
+    },
+    {
+      "epoch": 1.1288180610889773,
+      "grad_norm": 2.647258758544922,
+      "learning_rate": 1.1384874980704451e-06,
+      "loss": 0.023,
+      "step": 1700
+    },
+    {
+      "epoch": 1.1620185922974768,
+      "grad_norm": 2.046747922897339,
+      "learning_rate": 1.1342266915656906e-06,
+      "loss": 0.0253,
+      "step": 1750
+    },
+    {
+      "epoch": 1.1952191235059761,
+      "grad_norm": 13.14510726928711,
+      "learning_rate": 1.129965885060936e-06,
+      "loss": 0.0268,
+      "step": 1800
+    },
+    {
+      "epoch": 1.2284196547144755,
+      "grad_norm": 0.12764006853103638,
+      "learning_rate": 1.1257050785561812e-06,
+      "loss": 0.0099,
+      "step": 1850
+    },
+    {
+      "epoch": 1.2616201859229748,
+      "grad_norm": 1.6261545419692993,
+      "learning_rate": 1.1214442720514266e-06,
+      "loss": 0.0252,
+      "step": 1900
+    },
+    {
+      "epoch": 1.294820717131474,
+      "grad_norm": 5.552518844604492,
+      "learning_rate": 1.117183465546672e-06,
+      "loss": 0.036,
+      "step": 1950
+    },
+    {
+      "epoch": 1.3280212483399734,
+      "grad_norm": 24.064516067504883,
+      "learning_rate": 1.1129226590419172e-06,
+      "loss": 0.0169,
+      "step": 2000
+    },
+    {
+      "epoch": 1.361221779548473,
+      "grad_norm": 0.00925782322883606,
+      "learning_rate": 1.1086618525371626e-06,
+      "loss": 0.0184,
+      "step": 2050
+    },
+    {
+      "epoch": 1.3944223107569722,
+      "grad_norm": 16.54283905029297,
+      "learning_rate": 1.1044010460324078e-06,
+      "loss": 0.0139,
+      "step": 2100
+    },
+    {
+      "epoch": 1.4276228419654715,
+      "grad_norm": 0.24406713247299194,
+      "learning_rate": 1.1001402395276533e-06,
+      "loss": 0.0126,
+      "step": 2150
+    },
+    {
+      "epoch": 1.4608233731739708,
+      "grad_norm": 0.02731563337147236,
+      "learning_rate": 1.0958794330228987e-06,
+      "loss": 0.0198,
+      "step": 2200
+    },
+    {
+      "epoch": 1.4940239043824701,
+      "grad_norm": 17.53055191040039,
+      "learning_rate": 1.0916186265181439e-06,
+      "loss": 0.0303,
+      "step": 2250
+    },
+    {
+      "epoch": 1.5272244355909694,
+      "grad_norm": 0.07282107323408127,
+      "learning_rate": 1.0873578200133893e-06,
+      "loss": 0.0016,
+      "step": 2300
+    },
+    {
+      "epoch": 1.5604249667994687,
+      "grad_norm": 20.794416427612305,
+      "learning_rate": 1.0830970135086347e-06,
+      "loss": 0.0225,
+      "step": 2350
+    },
+    {
+      "epoch": 1.593625498007968,
+      "grad_norm": 0.052418053150177,
+      "learning_rate": 1.07883620700388e-06,
+      "loss": 0.0076,
+      "step": 2400
+    },
+    {
+      "epoch": 1.6268260292164674,
+      "grad_norm": 0.21063362061977386,
+      "learning_rate": 1.0745754004991254e-06,
+      "loss": 0.0159,
+      "step": 2450
+    },
+    {
+      "epoch": 1.6600265604249667,
+      "grad_norm": 10.455537796020508,
+      "learning_rate": 1.0703145939943708e-06,
+      "loss": 0.0105,
+      "step": 2500
+    },
+    {
+      "epoch": 1.6932270916334662,
+      "grad_norm": 6.205326557159424,
+      "learning_rate": 1.066053787489616e-06,
+      "loss": 0.0081,
+      "step": 2550
+    },
+    {
+      "epoch": 1.7264276228419655,
+      "grad_norm": 6.523694038391113,
+      "learning_rate": 1.0617929809848614e-06,
+      "loss": 0.0159,
+      "step": 2600
+    },
+    {
+      "epoch": 1.7596281540504648,
+      "grad_norm": 0.010043232701718807,
+      "learning_rate": 1.0575321744801068e-06,
+      "loss": 0.0113,
+      "step": 2650
+    },
+    {
+      "epoch": 1.792828685258964,
+      "grad_norm": 0.00458578672260046,
+      "learning_rate": 1.053271367975352e-06,
+      "loss": 0.0086,
+      "step": 2700
+    },
+    {
+      "epoch": 1.8260292164674636,
+      "grad_norm": 0.10986531525850296,
+      "learning_rate": 1.0490105614705974e-06,
+      "loss": 0.008,
+      "step": 2750
+    },
+    {
+      "epoch": 1.859229747675963,
+      "grad_norm": 0.12284637242555618,
+      "learning_rate": 1.0447497549658429e-06,
+      "loss": 0.0052,
+      "step": 2800
+    },
+    {
+      "epoch": 1.8924302788844622,
+      "grad_norm": 0.14606119692325592,
+      "learning_rate": 1.040488948461088e-06,
+      "loss": 0.0176,
+      "step": 2850
+    },
+    {
+      "epoch": 1.9256308100929616,
+      "grad_norm": 0.020491423085331917,
+      "learning_rate": 1.0362281419563333e-06,
+      "loss": 0.0102,
+      "step": 2900
+    },
+    {
+      "epoch": 1.9588313413014609,
+      "grad_norm": 0.05764462426304817,
+      "learning_rate": 1.0319673354515787e-06,
+      "loss": 0.0044,
+      "step": 2950
+    },
+    {
+      "epoch": 1.9920318725099602,
+      "grad_norm": 0.7329011559486389,
+      "learning_rate": 1.027706528946824e-06,
+      "loss": 0.0139,
+      "step": 3000
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.9924856870229007,
+      "eval_f1": 0.9924235722235019,
+      "eval_loss": 0.0418265163898468,
+      "eval_precision": 0.9923830636545329,
+      "eval_recall": 0.9924856870229007,
+      "eval_runtime": 31.6222,
+      "eval_samples_per_second": 265.131,
+      "eval_steps_per_second": 8.285,
+      "step": 3012
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 15060,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.282861088518144e+16,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

trial-5/checkpoint-3012/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b5a07ff58876babfad1d92462cc9e7062c8f5b0af8d8ba9142ab6f5e8880cf2
+size 5368

trial-6/checkpoint-6022/config.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "_name_or_path": "answerdotai/ModernBERT-base",
+  "architectures": [
+    "ModernBertForSequenceClassification"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "reference_compile": true,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 50368
+}

trial-6/checkpoint-6022/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a60e2fc558ad0e5a9a4825234c28006f4c14c02aab969b5ebf7cb43d8f890d9e
+size 598439784

trial-6/checkpoint-6022/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ccfa5cc878422afe6f38c7ea21cef7e9f532ec15d2d9169693197daa8b04fb0
+size 1196967418

trial-6/checkpoint-6022/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
+size 14244