diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..53199983ebc33906dd07db26b461142803b111fa --- /dev/null +++ b/README.md @@ -0,0 +1,73 @@ +--- +library_name: transformers +license: apache-2.0 +base_model: answerdotai/ModernBERT-base +tags: +- generated_from_trainer +metrics: +- accuracy +- precision +- recall +- f1 +model-index: +- name: answerdotai-ModernBERT-base-finetuned + results: [] +--- + + + +# answerdotai-ModernBERT-base-finetuned + +This model is a fine-tuned version of [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base) on the None dataset. +It achieves the following results on the evaluation set: +- Loss: 0.0116 +- Accuracy: 0.9976 +- Precision: 0.9977 +- Recall: 0.9976 +- F1: 0.9976 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 4.244005797262286e-05 +- train_batch_size: 32 +- eval_batch_size: 32 +- seed: 42 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: linear +- num_epochs: 7 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | Accuracy | Precision | Recall | F1 | +|:-------------:|:-----:|:-----:|:---------------:|:--------:|:---------:|:------:|:------:| +| 0.0175 | 1.0 | 1506 | 0.0195 | 0.9971 | 0.9971 | 0.9971 | 0.9971 | +| 0.0134 | 2.0 | 3012 | 0.0153 | 0.9970 | 0.9970 | 0.9970 | 0.9970 | +| 0.0 | 3.0 | 4518 | 0.0228 | 0.9976 | 0.9976 | 0.9976 | 0.9976 | +| 0.0 | 4.0 | 6024 | 0.0270 | 0.9976 | 0.9976 | 0.9976 | 0.9976 | +| 0.0 | 5.0 | 7530 | 0.0272 | 0.9976 | 0.9976 | 0.9976 | 0.9976 | +| 0.0 | 6.0 | 9036 | 0.0279 | 0.9975 | 0.9975 | 0.9975 | 0.9975 | +| 0.0 | 7.0 | 10542 | 0.0283 | 0.9975 | 0.9975 | 0.9975 | 0.9975 | + + +### Framework versions + +- Transformers 4.48.0.dev0 +- Pytorch 2.5.1+cu124 +- Datasets 3.2.0 +- Tokenizers 0.21.0 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00 --- /dev/null +++ b/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "answerdotai/ModernBERT-base", + "architectures": [ + "ModernBertForSequenceClassification" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 50281, + "classifier_activation": "gelu", + "classifier_bias": false, + "classifier_dropout": 0.0, + "classifier_pooling": "mean", + "cls_token_id": 50281, + "decoder_bias": true, + "deterministic_flash_attn": false, + "embedding_dropout": 0.0, + "eos_token_id": 50282, + "global_attn_every_n_layers": 3, + "global_rope_theta": 160000.0, + "gradient_checkpointing": false, + "hidden_activation": "gelu", + "hidden_size": 768, + "initializer_cutoff_factor": 2.0, + "initializer_range": 0.02, + "intermediate_size": 1152, + "layer_norm_eps": 1e-05, + "local_attention": 128, + "local_rope_theta": 10000.0, + "max_position_embeddings": 8192, + "mlp_bias": false, + "mlp_dropout": 0.0, + "model_type": "modernbert", + "norm_bias": false, + "norm_eps": 1e-05, + "num_attention_heads": 12, + "num_hidden_layers": 22, + "pad_token_id": 50283, + "position_embedding_type": "absolute", + "problem_type": "single_label_classification", + "reference_compile": true, + "sep_token_id": 50282, + "sparse_pred_ignore_index": -100, + "sparse_prediction": false, + "torch_dtype": "float32", + "transformers_version": "4.48.0.dev0", + "vocab_size": 50368 +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6db42373842b3d2170c94e43770efb05eece8d27 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd2c8555404b25095196f950baad8216db0404ff16448d62a6d453105d7bd0c7 +size 598439784 diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..53db8c652951b05fb4cc2463b5ac012b3537cf3b --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33b0c987e99ad21c3b9517dc831f21fd66bcbcd55d62a62f0a28008a0e8674e2 +size 5432 diff --git a/trial-0/checkpoint-1506/config.json b/trial-0/checkpoint-1506/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00 --- /dev/null +++ b/trial-0/checkpoint-1506/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "answerdotai/ModernBERT-base", + "architectures": [ + "ModernBertForSequenceClassification" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 50281, + "classifier_activation": "gelu", + "classifier_bias": false, + "classifier_dropout": 0.0, + "classifier_pooling": "mean", + "cls_token_id": 50281, + "decoder_bias": true, + "deterministic_flash_attn": false, + "embedding_dropout": 0.0, + "eos_token_id": 50282, + "global_attn_every_n_layers": 3, + "global_rope_theta": 160000.0, + "gradient_checkpointing": false, + "hidden_activation": "gelu", + "hidden_size": 768, + "initializer_cutoff_factor": 2.0, + "initializer_range": 0.02, + "intermediate_size": 1152, + "layer_norm_eps": 1e-05, + "local_attention": 128, + "local_rope_theta": 10000.0, + "max_position_embeddings": 8192, + "mlp_bias": false, + "mlp_dropout": 0.0, + "model_type": "modernbert", + "norm_bias": false, + "norm_eps": 1e-05, + "num_attention_heads": 12, + "num_hidden_layers": 22, + "pad_token_id": 50283, + "position_embedding_type": "absolute", + "problem_type": "single_label_classification", + "reference_compile": true, + "sep_token_id": 50282, + "sparse_pred_ignore_index": -100, + "sparse_prediction": false, + "torch_dtype": "float32", + "transformers_version": "4.48.0.dev0", + "vocab_size": 50368 +} diff --git a/trial-0/checkpoint-1506/model.safetensors b/trial-0/checkpoint-1506/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..38ac238ee34c059b26319be2afc84b0906a866bb --- /dev/null +++ b/trial-0/checkpoint-1506/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68eefa4a9be7b2db68618e1cb44c2cdf2163fb53cc3380fc52767266b121ddd2 +size 598439784 diff --git a/trial-0/checkpoint-1506/optimizer.pt b/trial-0/checkpoint-1506/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6047f5e3f57300ba9e2600b46a7177595090dc1e --- /dev/null +++ b/trial-0/checkpoint-1506/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08a1a4cc69805f73befa2723d41c1d97c0a2f799125f15e25de8295d6c23580c +size 1196967418 diff --git a/trial-0/checkpoint-1506/rng_state.pth b/trial-0/checkpoint-1506/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf3d91c5392ca6b7d7e0880933b7830a896d7c9e --- /dev/null +++ b/trial-0/checkpoint-1506/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:568428d80a25211a390c359ca51b0b20b38ca0607fbc196f106c9841c02d3e59 +size 14244 diff --git a/trial-0/checkpoint-1506/scheduler.pt b/trial-0/checkpoint-1506/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4cea32b21906cb3dbe285f9886b2ec4db548048 --- /dev/null +++ b/trial-0/checkpoint-1506/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5bddebb63f2196cebff07c6da8f9e668e8379463981f8be40fb7e151e6c09ff +size 1064 diff --git a/trial-0/checkpoint-1506/trainer_state.json b/trial-0/checkpoint-1506/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c4400a234fad0c2009013341eb9a53199e2d87b9 --- /dev/null +++ b/trial-0/checkpoint-1506/trainer_state.json @@ -0,0 +1,255 @@ +{ + "best_metric": 0.02135350927710533, + "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-0/checkpoint-1506", + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1506, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.033200531208499334, + "grad_norm": 11.822611808776855, + "learning_rate": 4.4935320035267014e-05, + "loss": 0.295, + "step": 50 + }, + { + "epoch": 0.06640106241699867, + "grad_norm": 0.11557121574878693, + "learning_rate": 4.463495024893502e-05, + "loss": 0.0808, + "step": 100 + }, + { + "epoch": 0.099601593625498, + "grad_norm": 0.01743650808930397, + "learning_rate": 4.433458046260302e-05, + "loss": 0.052, + "step": 150 + }, + { + "epoch": 0.13280212483399734, + "grad_norm": 4.474731922149658, + "learning_rate": 4.4034210676271024e-05, + "loss": 0.0491, + "step": 200 + }, + { + "epoch": 0.16600265604249667, + "grad_norm": 4.205756664276123, + "learning_rate": 4.373384088993902e-05, + "loss": 0.0344, + "step": 250 + }, + { + "epoch": 0.199203187250996, + "grad_norm": 4.239188194274902, + "learning_rate": 4.343347110360703e-05, + "loss": 0.0295, + "step": 300 + }, + { + "epoch": 0.23240371845949534, + "grad_norm": 0.19662700593471527, + "learning_rate": 4.3133101317275027e-05, + "loss": 0.0342, + "step": 350 + }, + { + "epoch": 0.2656042496679947, + "grad_norm": 0.008393031544983387, + "learning_rate": 4.2832731530943025e-05, + "loss": 0.0245, + "step": 400 + }, + { + "epoch": 0.29880478087649404, + "grad_norm": 0.06995929777622223, + "learning_rate": 4.253236174461103e-05, + "loss": 0.0281, + "step": 450 + }, + { + "epoch": 0.33200531208499334, + "grad_norm": 0.010315222665667534, + "learning_rate": 4.223199195827902e-05, + "loss": 0.0188, + "step": 500 + }, + { + "epoch": 0.3652058432934927, + "grad_norm": 3.1021769046783447, + "learning_rate": 4.193162217194703e-05, + "loss": 0.018, + "step": 550 + }, + { + "epoch": 0.398406374501992, + "grad_norm": 0.00041495164623484015, + "learning_rate": 4.1631252385615027e-05, + "loss": 0.0053, + "step": 600 + }, + { + "epoch": 0.4316069057104914, + "grad_norm": 0.19596342742443085, + "learning_rate": 4.133088259928303e-05, + "loss": 0.0178, + "step": 650 + }, + { + "epoch": 0.4648074369189907, + "grad_norm": 0.0566418319940567, + "learning_rate": 4.103051281295103e-05, + "loss": 0.0101, + "step": 700 + }, + { + "epoch": 0.49800796812749004, + "grad_norm": 0.005816417746245861, + "learning_rate": 4.0730143026619036e-05, + "loss": 0.0166, + "step": 750 + }, + { + "epoch": 0.5312084993359893, + "grad_norm": 2.2474324703216553, + "learning_rate": 4.0429773240287035e-05, + "loss": 0.0156, + "step": 800 + }, + { + "epoch": 0.5644090305444888, + "grad_norm": 0.06311876326799393, + "learning_rate": 4.0129403453955033e-05, + "loss": 0.0166, + "step": 850 + }, + { + "epoch": 0.5976095617529881, + "grad_norm": 0.012764506973326206, + "learning_rate": 3.982903366762304e-05, + "loss": 0.0175, + "step": 900 + }, + { + "epoch": 0.6308100929614874, + "grad_norm": 0.00253055221401155, + "learning_rate": 3.952866388129104e-05, + "loss": 0.0047, + "step": 950 + }, + { + "epoch": 0.6640106241699867, + "grad_norm": 0.03604559600353241, + "learning_rate": 3.922829409495904e-05, + "loss": 0.016, + "step": 1000 + }, + { + "epoch": 0.6972111553784861, + "grad_norm": 0.006498202681541443, + "learning_rate": 3.892792430862704e-05, + "loss": 0.0055, + "step": 1050 + }, + { + "epoch": 0.7304116865869854, + "grad_norm": 0.11296769976615906, + "learning_rate": 3.862755452229504e-05, + "loss": 0.0122, + "step": 1100 + }, + { + "epoch": 0.7636122177954847, + "grad_norm": 0.0005851402529515326, + "learning_rate": 3.8327184735963046e-05, + "loss": 0.01, + "step": 1150 + }, + { + "epoch": 0.796812749003984, + "grad_norm": 0.018440622836351395, + "learning_rate": 3.8026814949631044e-05, + "loss": 0.0064, + "step": 1200 + }, + { + "epoch": 0.8300132802124834, + "grad_norm": 0.0023099363315850496, + "learning_rate": 3.772644516329905e-05, + "loss": 0.0011, + "step": 1250 + }, + { + "epoch": 0.8632138114209827, + "grad_norm": 0.07595626264810562, + "learning_rate": 3.742607537696705e-05, + "loss": 0.0156, + "step": 1300 + }, + { + "epoch": 0.896414342629482, + "grad_norm": 0.0008996099350042641, + "learning_rate": 3.7125705590635054e-05, + "loss": 0.0103, + "step": 1350 + }, + { + "epoch": 0.9296148738379814, + "grad_norm": 3.656134504126385e-05, + "learning_rate": 3.682533580430305e-05, + "loss": 0.0027, + "step": 1400 + }, + { + "epoch": 0.9628154050464808, + "grad_norm": 0.2666904032230377, + "learning_rate": 3.652496601797105e-05, + "loss": 0.0152, + "step": 1450 + }, + { + "epoch": 0.9960159362549801, + "grad_norm": 0.011590929701924324, + "learning_rate": 3.622459623163905e-05, + "loss": 0.0115, + "step": 1500 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.9963024809160306, + "eval_f1": 0.9962997469825083, + "eval_loss": 0.02135350927710533, + "eval_precision": 0.9962971957079396, + "eval_recall": 0.9963024809160306, + "eval_runtime": 34.0647, + "eval_samples_per_second": 246.12, + "eval_steps_per_second": 7.691, + "step": 1506 + } + ], + "logging_steps": 50, + "max_steps": 7530, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.641430544259072e+16, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/trial-0/checkpoint-1506/training_args.bin b/trial-0/checkpoint-1506/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0fc19de9741f9cf3edab1a1fa1574f04d82d4230 --- /dev/null +++ b/trial-0/checkpoint-1506/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f87e0989b8aabc63686d8b1c4f4f6463501f9b534fd10b5dda472e02e5c6d200 +size 5368 diff --git a/trial-1/checkpoint-6022/config.json b/trial-1/checkpoint-6022/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00 --- /dev/null +++ b/trial-1/checkpoint-6022/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "answerdotai/ModernBERT-base", + "architectures": [ + "ModernBertForSequenceClassification" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 50281, + "classifier_activation": "gelu", + "classifier_bias": false, + "classifier_dropout": 0.0, + "classifier_pooling": "mean", + "cls_token_id": 50281, + "decoder_bias": true, + "deterministic_flash_attn": false, + "embedding_dropout": 0.0, + "eos_token_id": 50282, + "global_attn_every_n_layers": 3, + "global_rope_theta": 160000.0, + "gradient_checkpointing": false, + "hidden_activation": "gelu", + "hidden_size": 768, + "initializer_cutoff_factor": 2.0, + "initializer_range": 0.02, + "intermediate_size": 1152, + "layer_norm_eps": 1e-05, + "local_attention": 128, + "local_rope_theta": 10000.0, + "max_position_embeddings": 8192, + "mlp_bias": false, + "mlp_dropout": 0.0, + "model_type": "modernbert", + "norm_bias": false, + "norm_eps": 1e-05, + "num_attention_heads": 12, + "num_hidden_layers": 22, + "pad_token_id": 50283, + "position_embedding_type": "absolute", + "problem_type": "single_label_classification", + "reference_compile": true, + "sep_token_id": 50282, + "sparse_pred_ignore_index": -100, + "sparse_prediction": false, + "torch_dtype": "float32", + "transformers_version": "4.48.0.dev0", + "vocab_size": 50368 +} diff --git a/trial-1/checkpoint-6022/model.safetensors b/trial-1/checkpoint-6022/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..80faa709e3d36df7440067820aa5d5abd8ad496c --- /dev/null +++ b/trial-1/checkpoint-6022/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9376e02caf20a3536db5adaec49e89c8583378974c975bdfa4e4fa72bb7ed87c +size 598439784 diff --git a/trial-1/checkpoint-6022/optimizer.pt b/trial-1/checkpoint-6022/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a59f32c3ea7e53f052a8ef9cb24760a539fd4a95 --- /dev/null +++ b/trial-1/checkpoint-6022/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f989a18c3b9f0cb969ade19c78b7d7d4405053c69000081f12d16f8076c4691 +size 1196967418 diff --git a/trial-1/checkpoint-6022/rng_state.pth b/trial-1/checkpoint-6022/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b387de0c48181ec5812538ddf1fc60cfda1a89c1 --- /dev/null +++ b/trial-1/checkpoint-6022/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef +size 14244 diff --git a/trial-1/checkpoint-6022/scheduler.pt b/trial-1/checkpoint-6022/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..857465863c7713d2a96855a82d291ced6f6cc956 --- /dev/null +++ b/trial-1/checkpoint-6022/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04bd594b0cd8e46cee28cfc34b0ba6a02854df28789c81eb4c180d9356f4de00 +size 1064 diff --git a/trial-1/checkpoint-6022/trainer_state.json b/trial-1/checkpoint-6022/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f02d164201316343f5d90f30868ba4645b8db0aa --- /dev/null +++ b/trial-1/checkpoint-6022/trainer_state.json @@ -0,0 +1,897 @@ +{ + "best_metric": 0.0445549376308918, + "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-1/checkpoint-6022", + "epoch": 2.0, + "eval_steps": 500, + "global_step": 6022, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016605778811026237, + "grad_norm": 15.757351875305176, + "learning_rate": 2.4306427769118723e-06, + "loss": 0.6703, + "step": 50 + }, + { + "epoch": 0.033211557622052475, + "grad_norm": 14.056926727294922, + "learning_rate": 2.425586942863882e-06, + "loss": 0.4736, + "step": 100 + }, + { + "epoch": 0.04981733643307871, + "grad_norm": 15.678231239318848, + "learning_rate": 2.4205311088158915e-06, + "loss": 0.338, + "step": 150 + }, + { + "epoch": 0.06642311524410495, + "grad_norm": 4.84220552444458, + "learning_rate": 2.4154752747679013e-06, + "loss": 0.2931, + "step": 200 + }, + { + "epoch": 0.08302889405513118, + "grad_norm": 5.182389736175537, + "learning_rate": 2.4104194407199107e-06, + "loss": 0.251, + "step": 250 + }, + { + "epoch": 0.09963467286615742, + "grad_norm": 1.5187151432037354, + "learning_rate": 2.4053636066719205e-06, + "loss": 0.2133, + "step": 300 + }, + { + "epoch": 0.11624045167718366, + "grad_norm": 16.253589630126953, + "learning_rate": 2.40030777262393e-06, + "loss": 0.1518, + "step": 350 + }, + { + "epoch": 0.1328462304882099, + "grad_norm": 6.757865905761719, + "learning_rate": 2.3952519385759397e-06, + "loss": 0.1508, + "step": 400 + }, + { + "epoch": 0.14945200929923613, + "grad_norm": 2.119438886642456, + "learning_rate": 2.390196104527949e-06, + "loss": 0.1175, + "step": 450 + }, + { + "epoch": 0.16605778811026237, + "grad_norm": 15.932334899902344, + "learning_rate": 2.3851402704799585e-06, + "loss": 0.1401, + "step": 500 + }, + { + "epoch": 0.1826635669212886, + "grad_norm": 22.459735870361328, + "learning_rate": 2.3800844364319683e-06, + "loss": 0.1384, + "step": 550 + }, + { + "epoch": 0.19926934573231483, + "grad_norm": 10.65778923034668, + "learning_rate": 2.3750286023839777e-06, + "loss": 0.1179, + "step": 600 + }, + { + "epoch": 0.2158751245433411, + "grad_norm": 6.71965217590332, + "learning_rate": 2.3699727683359876e-06, + "loss": 0.0782, + "step": 650 + }, + { + "epoch": 0.23248090335436733, + "grad_norm": 3.6098344326019287, + "learning_rate": 2.364916934287997e-06, + "loss": 0.138, + "step": 700 + }, + { + "epoch": 0.24908668216539356, + "grad_norm": 2.3249447345733643, + "learning_rate": 2.3598611002400068e-06, + "loss": 0.1087, + "step": 750 + }, + { + "epoch": 0.2656924609764198, + "grad_norm": 15.047837257385254, + "learning_rate": 2.354805266192016e-06, + "loss": 0.0868, + "step": 800 + }, + { + "epoch": 0.282298239787446, + "grad_norm": 6.7322773933410645, + "learning_rate": 2.349749432144026e-06, + "loss": 0.0954, + "step": 850 + }, + { + "epoch": 0.29890401859847227, + "grad_norm": 12.954623222351074, + "learning_rate": 2.3446935980960354e-06, + "loss": 0.0689, + "step": 900 + }, + { + "epoch": 0.3155097974094985, + "grad_norm": 1.4312756061553955, + "learning_rate": 2.3396377640480448e-06, + "loss": 0.0908, + "step": 950 + }, + { + "epoch": 0.33211557622052473, + "grad_norm": 0.21316280961036682, + "learning_rate": 2.3345819300000546e-06, + "loss": 0.0766, + "step": 1000 + }, + { + "epoch": 0.348721355031551, + "grad_norm": 13.642809867858887, + "learning_rate": 2.329526095952064e-06, + "loss": 0.0533, + "step": 1050 + }, + { + "epoch": 0.3653271338425772, + "grad_norm": 14.525202751159668, + "learning_rate": 2.324470261904074e-06, + "loss": 0.0745, + "step": 1100 + }, + { + "epoch": 0.38193291265360346, + "grad_norm": 0.5210687518119812, + "learning_rate": 2.319414427856083e-06, + "loss": 0.0618, + "step": 1150 + }, + { + "epoch": 0.39853869146462967, + "grad_norm": 0.07292640954256058, + "learning_rate": 2.314358593808093e-06, + "loss": 0.0307, + "step": 1200 + }, + { + "epoch": 0.41514447027565593, + "grad_norm": 0.08236780017614365, + "learning_rate": 2.309302759760103e-06, + "loss": 0.0321, + "step": 1250 + }, + { + "epoch": 0.4317502490866822, + "grad_norm": 28.97471809387207, + "learning_rate": 2.304246925712112e-06, + "loss": 0.0748, + "step": 1300 + }, + { + "epoch": 0.4483560278977084, + "grad_norm": 0.4781515896320343, + "learning_rate": 2.2991910916641216e-06, + "loss": 0.0733, + "step": 1350 + }, + { + "epoch": 0.46496180670873466, + "grad_norm": 3.214794397354126, + "learning_rate": 2.2941352576161314e-06, + "loss": 0.0149, + "step": 1400 + }, + { + "epoch": 0.48156758551976087, + "grad_norm": 0.3289443850517273, + "learning_rate": 2.289079423568141e-06, + "loss": 0.0401, + "step": 1450 + }, + { + "epoch": 0.4981733643307871, + "grad_norm": 0.12368986010551453, + "learning_rate": 2.28402358952015e-06, + "loss": 0.0334, + "step": 1500 + }, + { + "epoch": 0.5147791431418134, + "grad_norm": 0.08283340185880661, + "learning_rate": 2.27896775547216e-06, + "loss": 0.0331, + "step": 1550 + }, + { + "epoch": 0.5313849219528396, + "grad_norm": 2.650063991546631, + "learning_rate": 2.2739119214241694e-06, + "loss": 0.0496, + "step": 1600 + }, + { + "epoch": 0.5479907007638658, + "grad_norm": 3.296297311782837, + "learning_rate": 2.2688560873761792e-06, + "loss": 0.0365, + "step": 1650 + }, + { + "epoch": 0.564596479574892, + "grad_norm": 0.032304324209690094, + "learning_rate": 2.263800253328189e-06, + "loss": 0.005, + "step": 1700 + }, + { + "epoch": 0.5812022583859183, + "grad_norm": 0.003552216337993741, + "learning_rate": 2.2587444192801985e-06, + "loss": 0.0183, + "step": 1750 + }, + { + "epoch": 0.5978080371969445, + "grad_norm": 0.0315885953605175, + "learning_rate": 2.253688585232208e-06, + "loss": 0.0184, + "step": 1800 + }, + { + "epoch": 0.6144138160079707, + "grad_norm": 0.004702410195022821, + "learning_rate": 2.2486327511842177e-06, + "loss": 0.0346, + "step": 1850 + }, + { + "epoch": 0.631019594818997, + "grad_norm": 0.07862639427185059, + "learning_rate": 2.243576917136227e-06, + "loss": 0.0296, + "step": 1900 + }, + { + "epoch": 0.6476253736300233, + "grad_norm": 0.3578585982322693, + "learning_rate": 2.2385210830882364e-06, + "loss": 0.0266, + "step": 1950 + }, + { + "epoch": 0.6642311524410495, + "grad_norm": 0.045335959643125534, + "learning_rate": 2.2334652490402463e-06, + "loss": 0.032, + "step": 2000 + }, + { + "epoch": 0.6808369312520757, + "grad_norm": 1.6869137287139893, + "learning_rate": 2.2284094149922557e-06, + "loss": 0.0297, + "step": 2050 + }, + { + "epoch": 0.697442710063102, + "grad_norm": 0.6017621755599976, + "learning_rate": 2.2233535809442655e-06, + "loss": 0.0119, + "step": 2100 + }, + { + "epoch": 0.7140484888741282, + "grad_norm": 0.13145552575588226, + "learning_rate": 2.2182977468962753e-06, + "loss": 0.0157, + "step": 2150 + }, + { + "epoch": 0.7306542676851544, + "grad_norm": 0.00971242692321539, + "learning_rate": 2.2132419128482847e-06, + "loss": 0.0099, + "step": 2200 + }, + { + "epoch": 0.7472600464961807, + "grad_norm": 0.5801131725311279, + "learning_rate": 2.208186078800294e-06, + "loss": 0.0235, + "step": 2250 + }, + { + "epoch": 0.7638658253072069, + "grad_norm": 0.008363746106624603, + "learning_rate": 2.203130244752304e-06, + "loss": 0.0275, + "step": 2300 + }, + { + "epoch": 0.7804716041182331, + "grad_norm": 0.23013177514076233, + "learning_rate": 2.1980744107043133e-06, + "loss": 0.0022, + "step": 2350 + }, + { + "epoch": 0.7970773829292593, + "grad_norm": 0.044313572347164154, + "learning_rate": 2.1930185766563227e-06, + "loss": 0.0185, + "step": 2400 + }, + { + "epoch": 0.8136831617402857, + "grad_norm": 0.008519169874489307, + "learning_rate": 2.1879627426083325e-06, + "loss": 0.0023, + "step": 2450 + }, + { + "epoch": 0.8302889405513119, + "grad_norm": 0.0008576350519433618, + "learning_rate": 2.182906908560342e-06, + "loss": 0.0062, + "step": 2500 + }, + { + "epoch": 0.8468947193623381, + "grad_norm": 0.56068354845047, + "learning_rate": 2.1778510745123517e-06, + "loss": 0.0106, + "step": 2550 + }, + { + "epoch": 0.8635004981733644, + "grad_norm": 33.770652770996094, + "learning_rate": 2.1727952404643615e-06, + "loss": 0.0298, + "step": 2600 + }, + { + "epoch": 0.8801062769843906, + "grad_norm": 0.0006891911034472287, + "learning_rate": 2.167739406416371e-06, + "loss": 0.0046, + "step": 2650 + }, + { + "epoch": 0.8967120557954168, + "grad_norm": 0.000691475928761065, + "learning_rate": 2.1626835723683803e-06, + "loss": 0.0014, + "step": 2700 + }, + { + "epoch": 0.913317834606443, + "grad_norm": 0.022216275334358215, + "learning_rate": 2.15762773832039e-06, + "loss": 0.0152, + "step": 2750 + }, + { + "epoch": 0.9299236134174693, + "grad_norm": 0.0004267705953679979, + "learning_rate": 2.1525719042723995e-06, + "loss": 0.0117, + "step": 2800 + }, + { + "epoch": 0.9465293922284955, + "grad_norm": 0.016712836921215057, + "learning_rate": 2.147516070224409e-06, + "loss": 0.0009, + "step": 2850 + }, + { + "epoch": 0.9631351710395217, + "grad_norm": 23.74860382080078, + "learning_rate": 2.1424602361764187e-06, + "loss": 0.0233, + "step": 2900 + }, + { + "epoch": 0.9797409498505479, + "grad_norm": 0.0039037028327584267, + "learning_rate": 2.137404402128428e-06, + "loss": 0.0193, + "step": 2950 + }, + { + "epoch": 0.9963467286615743, + "grad_norm": 0.0023961260449141264, + "learning_rate": 2.132348568080438e-06, + "loss": 0.0068, + "step": 3000 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.9921278625954199, + "eval_f1": 0.9921278625954199, + "eval_loss": 0.046909503638744354, + "eval_precision": 0.9921278625954199, + "eval_recall": 0.9921278625954199, + "eval_runtime": 36.762, + "eval_samples_per_second": 228.061, + "eval_steps_per_second": 14.254, + "step": 3011 + }, + { + "epoch": 1.0129525074726005, + "grad_norm": 0.0033601378090679646, + "learning_rate": 2.1272927340324478e-06, + "loss": 0.0005, + "step": 3050 + }, + { + "epoch": 1.0295582862836268, + "grad_norm": 0.038166940212249756, + "learning_rate": 2.122236899984457e-06, + "loss": 0.0002, + "step": 3100 + }, + { + "epoch": 1.0461640650946529, + "grad_norm": 0.0003456630220171064, + "learning_rate": 2.1171810659364666e-06, + "loss": 0.0139, + "step": 3150 + }, + { + "epoch": 1.0627698439056792, + "grad_norm": 0.004587268922477961, + "learning_rate": 2.1121252318884764e-06, + "loss": 0.0001, + "step": 3200 + }, + { + "epoch": 1.0793756227167055, + "grad_norm": 0.08502045273780823, + "learning_rate": 2.1070693978404858e-06, + "loss": 0.0216, + "step": 3250 + }, + { + "epoch": 1.0959814015277316, + "grad_norm": 0.10945820808410645, + "learning_rate": 2.102013563792495e-06, + "loss": 0.0256, + "step": 3300 + }, + { + "epoch": 1.112587180338758, + "grad_norm": 0.03236968442797661, + "learning_rate": 2.096957729744505e-06, + "loss": 0.005, + "step": 3350 + }, + { + "epoch": 1.1291929591497842, + "grad_norm": 0.007731316145509481, + "learning_rate": 2.0919018956965144e-06, + "loss": 0.0101, + "step": 3400 + }, + { + "epoch": 1.1457987379608103, + "grad_norm": 0.00674546230584383, + "learning_rate": 2.086846061648524e-06, + "loss": 0.0051, + "step": 3450 + }, + { + "epoch": 1.1624045167718366, + "grad_norm": 0.004380326252430677, + "learning_rate": 2.081790227600534e-06, + "loss": 0.0039, + "step": 3500 + }, + { + "epoch": 1.1790102955828627, + "grad_norm": 0.031456008553504944, + "learning_rate": 2.0767343935525434e-06, + "loss": 0.0001, + "step": 3550 + }, + { + "epoch": 1.195616074393889, + "grad_norm": 0.017602458596229553, + "learning_rate": 2.071678559504553e-06, + "loss": 0.006, + "step": 3600 + }, + { + "epoch": 1.2122218532049154, + "grad_norm": 0.009589639492332935, + "learning_rate": 2.0666227254565626e-06, + "loss": 0.001, + "step": 3650 + }, + { + "epoch": 1.2288276320159415, + "grad_norm": 0.003254746785387397, + "learning_rate": 2.061566891408572e-06, + "loss": 0.0, + "step": 3700 + }, + { + "epoch": 1.2454334108269678, + "grad_norm": 0.0011986729223281145, + "learning_rate": 2.056511057360582e-06, + "loss": 0.0126, + "step": 3750 + }, + { + "epoch": 1.2620391896379939, + "grad_norm": 0.006293583195656538, + "learning_rate": 2.0514552233125912e-06, + "loss": 0.0006, + "step": 3800 + }, + { + "epoch": 1.2786449684490202, + "grad_norm": 0.11370380967855453, + "learning_rate": 2.0463993892646006e-06, + "loss": 0.0252, + "step": 3850 + }, + { + "epoch": 1.2952507472600465, + "grad_norm": 0.0018469190690666437, + "learning_rate": 2.0413435552166104e-06, + "loss": 0.0004, + "step": 3900 + }, + { + "epoch": 1.3118565260710726, + "grad_norm": 0.0002411604655208066, + "learning_rate": 2.0362877211686202e-06, + "loss": 0.003, + "step": 3950 + }, + { + "epoch": 1.328462304882099, + "grad_norm": 4.065009852638468e-05, + "learning_rate": 2.0312318871206296e-06, + "loss": 0.0165, + "step": 4000 + }, + { + "epoch": 1.3450680836931252, + "grad_norm": 0.005062599666416645, + "learning_rate": 2.0261760530726395e-06, + "loss": 0.0028, + "step": 4050 + }, + { + "epoch": 1.3616738625041513, + "grad_norm": 0.017400013282895088, + "learning_rate": 2.021120219024649e-06, + "loss": 0.001, + "step": 4100 + }, + { + "epoch": 1.3782796413151777, + "grad_norm": 0.05683843046426773, + "learning_rate": 2.0160643849766582e-06, + "loss": 0.0124, + "step": 4150 + }, + { + "epoch": 1.394885420126204, + "grad_norm": 0.0027029893826693296, + "learning_rate": 2.011008550928668e-06, + "loss": 0.0003, + "step": 4200 + }, + { + "epoch": 1.41149119893723, + "grad_norm": 0.002034110017120838, + "learning_rate": 2.0059527168806775e-06, + "loss": 0.0073, + "step": 4250 + }, + { + "epoch": 1.4280969777482564, + "grad_norm": 0.001398180378600955, + "learning_rate": 2.000896882832687e-06, + "loss": 0.0044, + "step": 4300 + }, + { + "epoch": 1.4447027565592827, + "grad_norm": 0.00037716259248554707, + "learning_rate": 1.9958410487846967e-06, + "loss": 0.0228, + "step": 4350 + }, + { + "epoch": 1.4613085353703088, + "grad_norm": 0.015627387911081314, + "learning_rate": 1.9907852147367065e-06, + "loss": 0.0114, + "step": 4400 + }, + { + "epoch": 1.4779143141813351, + "grad_norm": 0.008964600041508675, + "learning_rate": 1.985729380688716e-06, + "loss": 0.0032, + "step": 4450 + }, + { + "epoch": 1.4945200929923614, + "grad_norm": 0.003252738853916526, + "learning_rate": 1.9806735466407257e-06, + "loss": 0.0082, + "step": 4500 + }, + { + "epoch": 1.5111258718033875, + "grad_norm": 0.00012037971464451402, + "learning_rate": 1.975617712592735e-06, + "loss": 0.0001, + "step": 4550 + }, + { + "epoch": 1.5277316506144138, + "grad_norm": 0.010974590666592121, + "learning_rate": 1.9705618785447445e-06, + "loss": 0.0, + "step": 4600 + }, + { + "epoch": 1.5443374294254402, + "grad_norm": 0.08398176729679108, + "learning_rate": 1.9655060444967543e-06, + "loss": 0.0002, + "step": 4650 + }, + { + "epoch": 1.5609432082364663, + "grad_norm": 0.03629281371831894, + "learning_rate": 1.9604502104487637e-06, + "loss": 0.006, + "step": 4700 + }, + { + "epoch": 1.5775489870474926, + "grad_norm": 0.00034110501292161644, + "learning_rate": 1.955394376400773e-06, + "loss": 0.0003, + "step": 4750 + }, + { + "epoch": 1.594154765858519, + "grad_norm": 0.0027959852013736963, + "learning_rate": 1.950338542352783e-06, + "loss": 0.0, + "step": 4800 + }, + { + "epoch": 1.610760544669545, + "grad_norm": 0.0001677741383900866, + "learning_rate": 1.9452827083047927e-06, + "loss": 0.0023, + "step": 4850 + }, + { + "epoch": 1.627366323480571, + "grad_norm": 0.055583104491233826, + "learning_rate": 1.940226874256802e-06, + "loss": 0.0225, + "step": 4900 + }, + { + "epoch": 1.6439721022915976, + "grad_norm": 8.664117194712162e-05, + "learning_rate": 1.935171040208812e-06, + "loss": 0.0009, + "step": 4950 + }, + { + "epoch": 1.6605778811026237, + "grad_norm": 0.0017323939828202128, + "learning_rate": 1.9301152061608213e-06, + "loss": 0.008, + "step": 5000 + }, + { + "epoch": 1.6771836599136498, + "grad_norm": 0.0034425491467118263, + "learning_rate": 1.9250593721128307e-06, + "loss": 0.0, + "step": 5050 + }, + { + "epoch": 1.6937894387246761, + "grad_norm": 6.076216959627345e-05, + "learning_rate": 1.9200035380648405e-06, + "loss": 0.0041, + "step": 5100 + }, + { + "epoch": 1.7103952175357025, + "grad_norm": 0.0018082900205627084, + "learning_rate": 1.91494770401685e-06, + "loss": 0.0017, + "step": 5150 + }, + { + "epoch": 1.7270009963467285, + "grad_norm": 0.008552160114049911, + "learning_rate": 1.9098918699688593e-06, + "loss": 0.0137, + "step": 5200 + }, + { + "epoch": 1.7436067751577549, + "grad_norm": 0.08908296376466751, + "learning_rate": 1.9048360359208694e-06, + "loss": 0.0092, + "step": 5250 + }, + { + "epoch": 1.7602125539687812, + "grad_norm": 0.002973488997668028, + "learning_rate": 1.8997802018728788e-06, + "loss": 0.0002, + "step": 5300 + }, + { + "epoch": 1.7768183327798073, + "grad_norm": 0.005116044543683529, + "learning_rate": 1.8947243678248884e-06, + "loss": 0.0079, + "step": 5350 + }, + { + "epoch": 1.7934241115908336, + "grad_norm": 0.002092874376103282, + "learning_rate": 1.889668533776898e-06, + "loss": 0.0, + "step": 5400 + }, + { + "epoch": 1.81002989040186, + "grad_norm": 0.0070649790577590466, + "learning_rate": 1.8846126997289076e-06, + "loss": 0.0, + "step": 5450 + }, + { + "epoch": 1.826635669212886, + "grad_norm": 0.001974167302250862, + "learning_rate": 1.879556865680917e-06, + "loss": 0.016, + "step": 5500 + }, + { + "epoch": 1.8432414480239123, + "grad_norm": 0.0012006360339000821, + "learning_rate": 1.8745010316329268e-06, + "loss": 0.0, + "step": 5550 + }, + { + "epoch": 1.8598472268349386, + "grad_norm": 0.006318301893770695, + "learning_rate": 1.8694451975849362e-06, + "loss": 0.0, + "step": 5600 + }, + { + "epoch": 1.8764530056459647, + "grad_norm": 0.0020722977351397276, + "learning_rate": 1.8643893635369458e-06, + "loss": 0.0104, + "step": 5650 + }, + { + "epoch": 1.893058784456991, + "grad_norm": 0.0874456912279129, + "learning_rate": 1.8593335294889556e-06, + "loss": 0.0023, + "step": 5700 + }, + { + "epoch": 1.9096645632680174, + "grad_norm": 0.00042386740096844733, + "learning_rate": 1.854277695440965e-06, + "loss": 0.0105, + "step": 5750 + }, + { + "epoch": 1.9262703420790435, + "grad_norm": 0.05140538513660431, + "learning_rate": 1.8492218613929746e-06, + "loss": 0.0008, + "step": 5800 + }, + { + "epoch": 1.9428761208900698, + "grad_norm": 0.00046465068589895964, + "learning_rate": 1.8441660273449842e-06, + "loss": 0.0176, + "step": 5850 + }, + { + "epoch": 1.959481899701096, + "grad_norm": 0.001875279936939478, + "learning_rate": 1.8391101932969938e-06, + "loss": 0.0002, + "step": 5900 + }, + { + "epoch": 1.9760876785121222, + "grad_norm": 0.0012590339174494147, + "learning_rate": 1.8340543592490032e-06, + "loss": 0.001, + "step": 5950 + }, + { + "epoch": 1.9926934573231485, + "grad_norm": 25.133811950683594, + "learning_rate": 1.828998525201013e-06, + "loss": 0.0229, + "step": 6000 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.995706106870229, + "eval_f1": 0.9956269879098661, + "eval_loss": 0.0445549376308918, + "eval_precision": 0.9956596696711074, + "eval_recall": 0.995706106870229, + "eval_runtime": 38.3077, + "eval_samples_per_second": 218.859, + "eval_steps_per_second": 13.679, + "step": 6022 + } + ], + "logging_steps": 50, + "max_steps": 24088, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.282861088518144e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/trial-1/checkpoint-6022/training_args.bin b/trial-1/checkpoint-6022/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4087ec56e47476361d22dfe17ea11d79a64f155b --- /dev/null +++ b/trial-1/checkpoint-6022/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:161830f01fe4451cf2afb08516c24e569c5b229b44b735c51814ae17b5494e10 +size 5368 diff --git a/trial-2/checkpoint-6022/config.json b/trial-2/checkpoint-6022/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00 --- /dev/null +++ b/trial-2/checkpoint-6022/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "answerdotai/ModernBERT-base", + "architectures": [ + "ModernBertForSequenceClassification" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 50281, + "classifier_activation": "gelu", + "classifier_bias": false, + "classifier_dropout": 0.0, + "classifier_pooling": "mean", + "cls_token_id": 50281, + "decoder_bias": true, + "deterministic_flash_attn": false, + "embedding_dropout": 0.0, + "eos_token_id": 50282, + "global_attn_every_n_layers": 3, + "global_rope_theta": 160000.0, + "gradient_checkpointing": false, + "hidden_activation": "gelu", + "hidden_size": 768, + "initializer_cutoff_factor": 2.0, + "initializer_range": 0.02, + "intermediate_size": 1152, + "layer_norm_eps": 1e-05, + "local_attention": 128, + "local_rope_theta": 10000.0, + "max_position_embeddings": 8192, + "mlp_bias": false, + "mlp_dropout": 0.0, + "model_type": "modernbert", + "norm_bias": false, + "norm_eps": 1e-05, + "num_attention_heads": 12, + "num_hidden_layers": 22, + "pad_token_id": 50283, + "position_embedding_type": "absolute", + "problem_type": "single_label_classification", + "reference_compile": true, + "sep_token_id": 50282, + "sparse_pred_ignore_index": -100, + "sparse_prediction": false, + "torch_dtype": "float32", + "transformers_version": "4.48.0.dev0", + "vocab_size": 50368 +} diff --git a/trial-2/checkpoint-6022/model.safetensors b/trial-2/checkpoint-6022/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..34f8425ab92253d1fda1671277bdad1214b2fbc3 --- /dev/null +++ b/trial-2/checkpoint-6022/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33d8242e8a21a76a0ad8b21949fe7bd68e94de5ce2da543a151336909fcb8e83 +size 598439784 diff --git a/trial-2/checkpoint-6022/optimizer.pt b/trial-2/checkpoint-6022/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c5acf7f402f99bbcf6d7ad4b4a890b5bb0c4d5d --- /dev/null +++ b/trial-2/checkpoint-6022/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c89405c1def95fb7d1e0ff7deac188ca136134ebd620d1451c9f0d4ed557d77a +size 1196967418 diff --git a/trial-2/checkpoint-6022/rng_state.pth b/trial-2/checkpoint-6022/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b387de0c48181ec5812538ddf1fc60cfda1a89c1 --- /dev/null +++ b/trial-2/checkpoint-6022/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef +size 14244 diff --git a/trial-2/checkpoint-6022/scheduler.pt b/trial-2/checkpoint-6022/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..346ec0135f9cbb01269c18402c5bb87704740a2f --- /dev/null +++ b/trial-2/checkpoint-6022/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daebe5b6f96508652ee77aa623e80e4943a4ab7b8acffe2720aa77d58c2624f9 +size 1064 diff --git a/trial-2/checkpoint-6022/trainer_state.json b/trial-2/checkpoint-6022/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..22b3bdb366ae8c61b15e647467848d653a68713f --- /dev/null +++ b/trial-2/checkpoint-6022/trainer_state.json @@ -0,0 +1,897 @@ +{ + "best_metric": 0.031979888677597046, + "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-2/checkpoint-6022", + "epoch": 2.0, + "eval_steps": 500, + "global_step": 6022, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016605778811026237, + "grad_norm": 21.788597106933594, + "learning_rate": 5.429575351871404e-06, + "loss": 0.5789, + "step": 50 + }, + { + "epoch": 0.033211557622052475, + "grad_norm": 20.038349151611328, + "learning_rate": 5.416664391316233e-06, + "loss": 0.37, + "step": 100 + }, + { + "epoch": 0.04981733643307871, + "grad_norm": 23.927526473999023, + "learning_rate": 5.403753430761063e-06, + "loss": 0.25, + "step": 150 + }, + { + "epoch": 0.06642311524410495, + "grad_norm": 4.1712799072265625, + "learning_rate": 5.390842470205893e-06, + "loss": 0.1921, + "step": 200 + }, + { + "epoch": 0.08302889405513118, + "grad_norm": 6.138601303100586, + "learning_rate": 5.3779315096507225e-06, + "loss": 0.1365, + "step": 250 + }, + { + "epoch": 0.09963467286615742, + "grad_norm": 0.9431160092353821, + "learning_rate": 5.3650205490955514e-06, + "loss": 0.1473, + "step": 300 + }, + { + "epoch": 0.11624045167718366, + "grad_norm": 25.303245544433594, + "learning_rate": 5.352109588540381e-06, + "loss": 0.0875, + "step": 350 + }, + { + "epoch": 0.1328462304882099, + "grad_norm": 14.83379077911377, + "learning_rate": 5.33919862798521e-06, + "loss": 0.111, + "step": 400 + }, + { + "epoch": 0.14945200929923613, + "grad_norm": 0.2346535325050354, + "learning_rate": 5.32628766743004e-06, + "loss": 0.0722, + "step": 450 + }, + { + "epoch": 0.16605778811026237, + "grad_norm": 19.045169830322266, + "learning_rate": 5.31337670687487e-06, + "loss": 0.1236, + "step": 500 + }, + { + "epoch": 0.1826635669212886, + "grad_norm": 10.871609687805176, + "learning_rate": 5.300465746319699e-06, + "loss": 0.1018, + "step": 550 + }, + { + "epoch": 0.19926934573231483, + "grad_norm": 8.278830528259277, + "learning_rate": 5.287554785764528e-06, + "loss": 0.0608, + "step": 600 + }, + { + "epoch": 0.2158751245433411, + "grad_norm": 3.4486818313598633, + "learning_rate": 5.274643825209358e-06, + "loss": 0.0684, + "step": 650 + }, + { + "epoch": 0.23248090335436733, + "grad_norm": 9.789453506469727, + "learning_rate": 5.261732864654187e-06, + "loss": 0.0826, + "step": 700 + }, + { + "epoch": 0.24908668216539356, + "grad_norm": 0.013454285450279713, + "learning_rate": 5.248821904099017e-06, + "loss": 0.0672, + "step": 750 + }, + { + "epoch": 0.2656924609764198, + "grad_norm": 0.8878294825553894, + "learning_rate": 5.2359109435438465e-06, + "loss": 0.0472, + "step": 800 + }, + { + "epoch": 0.282298239787446, + "grad_norm": 15.41006088256836, + "learning_rate": 5.222999982988676e-06, + "loss": 0.0616, + "step": 850 + }, + { + "epoch": 0.29890401859847227, + "grad_norm": 0.04324938729405403, + "learning_rate": 5.210089022433506e-06, + "loss": 0.0215, + "step": 900 + }, + { + "epoch": 0.3155097974094985, + "grad_norm": 0.011849366128444672, + "learning_rate": 5.197178061878335e-06, + "loss": 0.0398, + "step": 950 + }, + { + "epoch": 0.33211557622052473, + "grad_norm": 0.0020897299982607365, + "learning_rate": 5.184267101323165e-06, + "loss": 0.0294, + "step": 1000 + }, + { + "epoch": 0.348721355031551, + "grad_norm": 0.00038467388367280364, + "learning_rate": 5.171356140767994e-06, + "loss": 0.0328, + "step": 1050 + }, + { + "epoch": 0.3653271338425772, + "grad_norm": 0.0022064056247472763, + "learning_rate": 5.158445180212823e-06, + "loss": 0.0216, + "step": 1100 + }, + { + "epoch": 0.38193291265360346, + "grad_norm": 0.012603014707565308, + "learning_rate": 5.145534219657653e-06, + "loss": 0.0293, + "step": 1150 + }, + { + "epoch": 0.39853869146462967, + "grad_norm": 0.002970542525872588, + "learning_rate": 5.132623259102483e-06, + "loss": 0.0133, + "step": 1200 + }, + { + "epoch": 0.41514447027565593, + "grad_norm": 0.09289965778589249, + "learning_rate": 5.119712298547312e-06, + "loss": 0.0189, + "step": 1250 + }, + { + "epoch": 0.4317502490866822, + "grad_norm": 0.030116688460111618, + "learning_rate": 5.106801337992142e-06, + "loss": 0.0266, + "step": 1300 + }, + { + "epoch": 0.4483560278977084, + "grad_norm": 23.291847229003906, + "learning_rate": 5.0938903774369705e-06, + "loss": 0.0378, + "step": 1350 + }, + { + "epoch": 0.46496180670873466, + "grad_norm": 0.00580954784527421, + "learning_rate": 5.0809794168818e-06, + "loss": 0.0002, + "step": 1400 + }, + { + "epoch": 0.48156758551976087, + "grad_norm": 0.0036250711418688297, + "learning_rate": 5.06806845632663e-06, + "loss": 0.0297, + "step": 1450 + }, + { + "epoch": 0.4981733643307871, + "grad_norm": 0.0013630707981064916, + "learning_rate": 5.05515749577146e-06, + "loss": 0.0114, + "step": 1500 + }, + { + "epoch": 0.5147791431418134, + "grad_norm": 0.025447094812989235, + "learning_rate": 5.042246535216289e-06, + "loss": 0.0019, + "step": 1550 + }, + { + "epoch": 0.5313849219528396, + "grad_norm": 18.81841468811035, + "learning_rate": 5.0293355746611185e-06, + "loss": 0.0286, + "step": 1600 + }, + { + "epoch": 0.5479907007638658, + "grad_norm": 0.0033424277789890766, + "learning_rate": 5.016424614105948e-06, + "loss": 0.0393, + "step": 1650 + }, + { + "epoch": 0.564596479574892, + "grad_norm": 0.039123374968767166, + "learning_rate": 5.003513653550777e-06, + "loss": 0.0186, + "step": 1700 + }, + { + "epoch": 0.5812022583859183, + "grad_norm": 0.0005275913863442838, + "learning_rate": 4.990602692995607e-06, + "loss": 0.0003, + "step": 1750 + }, + { + "epoch": 0.5978080371969445, + "grad_norm": 0.005070064682513475, + "learning_rate": 4.977691732440437e-06, + "loss": 0.01, + "step": 1800 + }, + { + "epoch": 0.6144138160079707, + "grad_norm": 0.003932475112378597, + "learning_rate": 4.9647807718852664e-06, + "loss": 0.0222, + "step": 1850 + }, + { + "epoch": 0.631019594818997, + "grad_norm": 0.6544032692909241, + "learning_rate": 4.951869811330095e-06, + "loss": 0.0138, + "step": 1900 + }, + { + "epoch": 0.6476253736300233, + "grad_norm": 0.008768323808908463, + "learning_rate": 4.938958850774925e-06, + "loss": 0.0056, + "step": 1950 + }, + { + "epoch": 0.6642311524410495, + "grad_norm": 0.0021180976182222366, + "learning_rate": 4.926047890219754e-06, + "loss": 0.0049, + "step": 2000 + }, + { + "epoch": 0.6808369312520757, + "grad_norm": 0.002039346843957901, + "learning_rate": 4.913136929664584e-06, + "loss": 0.0142, + "step": 2050 + }, + { + "epoch": 0.697442710063102, + "grad_norm": 0.012900142930448055, + "learning_rate": 4.9002259691094136e-06, + "loss": 0.0105, + "step": 2100 + }, + { + "epoch": 0.7140484888741282, + "grad_norm": 0.0022153747268021107, + "learning_rate": 4.887315008554243e-06, + "loss": 0.0142, + "step": 2150 + }, + { + "epoch": 0.7306542676851544, + "grad_norm": 0.001426122267730534, + "learning_rate": 4.874404047999072e-06, + "loss": 0.0068, + "step": 2200 + }, + { + "epoch": 0.7472600464961807, + "grad_norm": 0.0008603449095971882, + "learning_rate": 4.861493087443902e-06, + "loss": 0.0119, + "step": 2250 + }, + { + "epoch": 0.7638658253072069, + "grad_norm": 0.0006780526018701494, + "learning_rate": 4.848582126888731e-06, + "loss": 0.0108, + "step": 2300 + }, + { + "epoch": 0.7804716041182331, + "grad_norm": 0.014527379535138607, + "learning_rate": 4.835671166333561e-06, + "loss": 0.0002, + "step": 2350 + }, + { + "epoch": 0.7970773829292593, + "grad_norm": 0.00022624376288149506, + "learning_rate": 4.8227602057783904e-06, + "loss": 0.0092, + "step": 2400 + }, + { + "epoch": 0.8136831617402857, + "grad_norm": 0.0044932495802640915, + "learning_rate": 4.80984924522322e-06, + "loss": 0.0001, + "step": 2450 + }, + { + "epoch": 0.8302889405513119, + "grad_norm": 0.0009355309884995222, + "learning_rate": 4.79693828466805e-06, + "loss": 0.0002, + "step": 2500 + }, + { + "epoch": 0.8468947193623381, + "grad_norm": 0.12550997734069824, + "learning_rate": 4.784027324112879e-06, + "loss": 0.0024, + "step": 2550 + }, + { + "epoch": 0.8635004981733644, + "grad_norm": 0.02399071305990219, + "learning_rate": 4.771116363557709e-06, + "loss": 0.0099, + "step": 2600 + }, + { + "epoch": 0.8801062769843906, + "grad_norm": 0.008470265194773674, + "learning_rate": 4.7582054030025375e-06, + "loss": 0.0157, + "step": 2650 + }, + { + "epoch": 0.8967120557954168, + "grad_norm": 3.967735028709285e-05, + "learning_rate": 4.745294442447367e-06, + "loss": 0.0013, + "step": 2700 + }, + { + "epoch": 0.913317834606443, + "grad_norm": 0.0005532742943614721, + "learning_rate": 4.732383481892197e-06, + "loss": 0.0025, + "step": 2750 + }, + { + "epoch": 0.9299236134174693, + "grad_norm": 9.227233022102155e-06, + "learning_rate": 4.719472521337027e-06, + "loss": 0.0028, + "step": 2800 + }, + { + "epoch": 0.9465293922284955, + "grad_norm": 0.280258446931839, + "learning_rate": 4.706561560781856e-06, + "loss": 0.0004, + "step": 2850 + }, + { + "epoch": 0.9631351710395217, + "grad_norm": 27.427757263183594, + "learning_rate": 4.6936506002266855e-06, + "loss": 0.0127, + "step": 2900 + }, + { + "epoch": 0.9797409498505479, + "grad_norm": 176.85423278808594, + "learning_rate": 4.680739639671514e-06, + "loss": 0.0298, + "step": 2950 + }, + { + "epoch": 0.9963467286615743, + "grad_norm": 0.00011263355554547161, + "learning_rate": 4.667828679116344e-06, + "loss": 0.001, + "step": 3000 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.9963024809160306, + "eval_f1": 0.9962431632227496, + "eval_loss": 0.04071500524878502, + "eval_precision": 0.9962693439313673, + "eval_recall": 0.9963024809160306, + "eval_runtime": 38.0003, + "eval_samples_per_second": 220.63, + "eval_steps_per_second": 13.789, + "step": 3011 + }, + { + "epoch": 1.0129525074726005, + "grad_norm": 0.05092976614832878, + "learning_rate": 4.654917718561174e-06, + "loss": 0.018, + "step": 3050 + }, + { + "epoch": 1.0295582862836268, + "grad_norm": 3.4633874747669324e-05, + "learning_rate": 4.642006758006004e-06, + "loss": 0.0, + "step": 3100 + }, + { + "epoch": 1.0461640650946529, + "grad_norm": 8.058391540544108e-05, + "learning_rate": 4.629095797450833e-06, + "loss": 0.0, + "step": 3150 + }, + { + "epoch": 1.0627698439056792, + "grad_norm": 0.00043129033292643726, + "learning_rate": 4.616184836895662e-06, + "loss": 0.0, + "step": 3200 + }, + { + "epoch": 1.0793756227167055, + "grad_norm": 0.012417804449796677, + "learning_rate": 4.603273876340492e-06, + "loss": 0.0204, + "step": 3250 + }, + { + "epoch": 1.0959814015277316, + "grad_norm": 0.07707448303699493, + "learning_rate": 4.590362915785321e-06, + "loss": 0.0089, + "step": 3300 + }, + { + "epoch": 1.112587180338758, + "grad_norm": 0.0019856118597090244, + "learning_rate": 4.577451955230151e-06, + "loss": 0.0003, + "step": 3350 + }, + { + "epoch": 1.1291929591497842, + "grad_norm": 0.0003844090970233083, + "learning_rate": 4.564540994674981e-06, + "loss": 0.0, + "step": 3400 + }, + { + "epoch": 1.1457987379608103, + "grad_norm": 0.004796341527253389, + "learning_rate": 4.55163003411981e-06, + "loss": 0.0054, + "step": 3450 + }, + { + "epoch": 1.1624045167718366, + "grad_norm": 0.0021394495852291584, + "learning_rate": 4.538719073564639e-06, + "loss": 0.0001, + "step": 3500 + }, + { + "epoch": 1.1790102955828627, + "grad_norm": 0.00016287445032503456, + "learning_rate": 4.525808113009469e-06, + "loss": 0.0017, + "step": 3550 + }, + { + "epoch": 1.195616074393889, + "grad_norm": 0.005753168836236, + "learning_rate": 4.512897152454298e-06, + "loss": 0.0132, + "step": 3600 + }, + { + "epoch": 1.2122218532049154, + "grad_norm": 0.00012519631127361208, + "learning_rate": 4.499986191899128e-06, + "loss": 0.0, + "step": 3650 + }, + { + "epoch": 1.2288276320159415, + "grad_norm": 0.0009526669164188206, + "learning_rate": 4.487075231343957e-06, + "loss": 0.0083, + "step": 3700 + }, + { + "epoch": 1.2454334108269678, + "grad_norm": 6.90124070388265e-05, + "learning_rate": 4.474164270788787e-06, + "loss": 0.0114, + "step": 3750 + }, + { + "epoch": 1.2620391896379939, + "grad_norm": 0.0029422417283058167, + "learning_rate": 4.461253310233616e-06, + "loss": 0.0001, + "step": 3800 + }, + { + "epoch": 1.2786449684490202, + "grad_norm": 1.6564589738845825, + "learning_rate": 4.448342349678446e-06, + "loss": 0.0065, + "step": 3850 + }, + { + "epoch": 1.2952507472600465, + "grad_norm": 4.6906425268389285e-05, + "learning_rate": 4.435431389123275e-06, + "loss": 0.0, + "step": 3900 + }, + { + "epoch": 1.3118565260710726, + "grad_norm": 1.4456440112553537e-05, + "learning_rate": 4.4225204285681046e-06, + "loss": 0.0, + "step": 3950 + }, + { + "epoch": 1.328462304882099, + "grad_norm": 4.6707005822099745e-05, + "learning_rate": 4.409609468012934e-06, + "loss": 0.0227, + "step": 4000 + }, + { + "epoch": 1.3450680836931252, + "grad_norm": 4.7155015636235476e-05, + "learning_rate": 4.396698507457763e-06, + "loss": 0.0002, + "step": 4050 + }, + { + "epoch": 1.3616738625041513, + "grad_norm": 0.01696430891752243, + "learning_rate": 4.383787546902593e-06, + "loss": 0.0188, + "step": 4100 + }, + { + "epoch": 1.3782796413151777, + "grad_norm": 0.0008329456904903054, + "learning_rate": 4.370876586347423e-06, + "loss": 0.0178, + "step": 4150 + }, + { + "epoch": 1.394885420126204, + "grad_norm": 9.179511835100129e-05, + "learning_rate": 4.3579656257922525e-06, + "loss": 0.0, + "step": 4200 + }, + { + "epoch": 1.41149119893723, + "grad_norm": 2.924172622442711e-05, + "learning_rate": 4.3450546652370814e-06, + "loss": 0.0013, + "step": 4250 + }, + { + "epoch": 1.4280969777482564, + "grad_norm": 0.015076125971972942, + "learning_rate": 4.332143704681911e-06, + "loss": 0.0104, + "step": 4300 + }, + { + "epoch": 1.4447027565592827, + "grad_norm": 5.385762415244244e-05, + "learning_rate": 4.31923274412674e-06, + "loss": 0.014, + "step": 4350 + }, + { + "epoch": 1.4613085353703088, + "grad_norm": 0.0007110639126040041, + "learning_rate": 4.30632178357157e-06, + "loss": 0.0126, + "step": 4400 + }, + { + "epoch": 1.4779143141813351, + "grad_norm": 0.00014339391782414168, + "learning_rate": 4.2934108230164e-06, + "loss": 0.0003, + "step": 4450 + }, + { + "epoch": 1.4945200929923614, + "grad_norm": 0.0006024091853760183, + "learning_rate": 4.280499862461229e-06, + "loss": 0.0118, + "step": 4500 + }, + { + "epoch": 1.5111258718033875, + "grad_norm": 0.0002353072923142463, + "learning_rate": 4.267588901906058e-06, + "loss": 0.0086, + "step": 4550 + }, + { + "epoch": 1.5277316506144138, + "grad_norm": 0.0008946498855948448, + "learning_rate": 4.254677941350888e-06, + "loss": 0.0, + "step": 4600 + }, + { + "epoch": 1.5443374294254402, + "grad_norm": 7.315174298128113e-05, + "learning_rate": 4.241766980795717e-06, + "loss": 0.0003, + "step": 4650 + }, + { + "epoch": 1.5609432082364663, + "grad_norm": 9.232313459506258e-05, + "learning_rate": 4.228856020240547e-06, + "loss": 0.0001, + "step": 4700 + }, + { + "epoch": 1.5775489870474926, + "grad_norm": 1.4020029084349517e-05, + "learning_rate": 4.2159450596853765e-06, + "loss": 0.0, + "step": 4750 + }, + { + "epoch": 1.594154765858519, + "grad_norm": 4.0607475966680795e-05, + "learning_rate": 4.203034099130206e-06, + "loss": 0.0, + "step": 4800 + }, + { + "epoch": 1.610760544669545, + "grad_norm": 4.69290571345482e-05, + "learning_rate": 4.190123138575036e-06, + "loss": 0.0177, + "step": 4850 + }, + { + "epoch": 1.627366323480571, + "grad_norm": 0.14096687734127045, + "learning_rate": 4.177212178019865e-06, + "loss": 0.0115, + "step": 4900 + }, + { + "epoch": 1.6439721022915976, + "grad_norm": 0.00020342542848084122, + "learning_rate": 4.164301217464695e-06, + "loss": 0.0001, + "step": 4950 + }, + { + "epoch": 1.6605778811026237, + "grad_norm": 0.0002786288969218731, + "learning_rate": 4.151390256909524e-06, + "loss": 0.0, + "step": 5000 + }, + { + "epoch": 1.6771836599136498, + "grad_norm": 2.8438846129574813e-05, + "learning_rate": 4.138479296354353e-06, + "loss": 0.0032, + "step": 5050 + }, + { + "epoch": 1.6937894387246761, + "grad_norm": 5.944320037087891e-06, + "learning_rate": 4.125568335799183e-06, + "loss": 0.0001, + "step": 5100 + }, + { + "epoch": 1.7103952175357025, + "grad_norm": 0.005958211608231068, + "learning_rate": 4.112657375244013e-06, + "loss": 0.0, + "step": 5150 + }, + { + "epoch": 1.7270009963467285, + "grad_norm": 0.002004456939175725, + "learning_rate": 4.099746414688842e-06, + "loss": 0.0106, + "step": 5200 + }, + { + "epoch": 1.7436067751577549, + "grad_norm": 0.0008562383009120822, + "learning_rate": 4.086835454133672e-06, + "loss": 0.0081, + "step": 5250 + }, + { + "epoch": 1.7602125539687812, + "grad_norm": 0.03570560738444328, + "learning_rate": 4.0739244935785005e-06, + "loss": 0.025, + "step": 5300 + }, + { + "epoch": 1.7768183327798073, + "grad_norm": 0.001486024702899158, + "learning_rate": 4.06101353302333e-06, + "loss": 0.0145, + "step": 5350 + }, + { + "epoch": 1.7934241115908336, + "grad_norm": 0.0015331929316744208, + "learning_rate": 4.04810257246816e-06, + "loss": 0.0001, + "step": 5400 + }, + { + "epoch": 1.81002989040186, + "grad_norm": 0.004162834957242012, + "learning_rate": 4.03519161191299e-06, + "loss": 0.0005, + "step": 5450 + }, + { + "epoch": 1.826635669212886, + "grad_norm": 0.0003064811462536454, + "learning_rate": 4.022280651357819e-06, + "loss": 0.0, + "step": 5500 + }, + { + "epoch": 1.8432414480239123, + "grad_norm": 0.000830256671179086, + "learning_rate": 4.0093696908026485e-06, + "loss": 0.0034, + "step": 5550 + }, + { + "epoch": 1.8598472268349386, + "grad_norm": 0.001540405093692243, + "learning_rate": 3.996458730247478e-06, + "loss": 0.0, + "step": 5600 + }, + { + "epoch": 1.8764530056459647, + "grad_norm": 0.011221639811992645, + "learning_rate": 3.983547769692307e-06, + "loss": 0.0116, + "step": 5650 + }, + { + "epoch": 1.893058784456991, + "grad_norm": 0.0031693174969404936, + "learning_rate": 3.970636809137137e-06, + "loss": 0.0061, + "step": 5700 + }, + { + "epoch": 1.9096645632680174, + "grad_norm": 7.828649540897459e-05, + "learning_rate": 3.957725848581967e-06, + "loss": 0.0, + "step": 5750 + }, + { + "epoch": 1.9262703420790435, + "grad_norm": 0.00892726145684719, + "learning_rate": 3.9448148880267964e-06, + "loss": 0.0003, + "step": 5800 + }, + { + "epoch": 1.9428761208900698, + "grad_norm": 0.0033830904867500067, + "learning_rate": 3.931903927471625e-06, + "loss": 0.0007, + "step": 5850 + }, + { + "epoch": 1.959481899701096, + "grad_norm": 0.017441514879465103, + "learning_rate": 3.918992966916455e-06, + "loss": 0.0109, + "step": 5900 + }, + { + "epoch": 1.9760876785121222, + "grad_norm": 0.006790176033973694, + "learning_rate": 3.906082006361284e-06, + "loss": 0.0101, + "step": 5950 + }, + { + "epoch": 1.9926934573231485, + "grad_norm": 0.0004248483164701611, + "learning_rate": 3.893171045806114e-06, + "loss": 0.0103, + "step": 6000 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.9959446564885496, + "eval_f1": 0.9958827988724177, + "eval_loss": 0.031979888677597046, + "eval_precision": 0.9958978797187497, + "eval_recall": 0.9959446564885496, + "eval_runtime": 37.4063, + "eval_samples_per_second": 224.134, + "eval_steps_per_second": 14.008, + "step": 6022 + } + ], + "logging_steps": 50, + "max_steps": 21077, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.282861088518144e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/trial-2/checkpoint-6022/training_args.bin b/trial-2/checkpoint-6022/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0df248ab0a69e9e5a85d1cc73b799d697b96402c --- /dev/null +++ b/trial-2/checkpoint-6022/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9657a8731817c986f017540c64090098467c35e79328bfa7cab093c33da6a8e9 +size 5368 diff --git a/trial-3/checkpoint-1506/config.json b/trial-3/checkpoint-1506/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00 --- /dev/null +++ b/trial-3/checkpoint-1506/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "answerdotai/ModernBERT-base", + "architectures": [ + "ModernBertForSequenceClassification" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 50281, + "classifier_activation": "gelu", + "classifier_bias": false, + "classifier_dropout": 0.0, + "classifier_pooling": "mean", + "cls_token_id": 50281, + "decoder_bias": true, + "deterministic_flash_attn": false, + "embedding_dropout": 0.0, + "eos_token_id": 50282, + "global_attn_every_n_layers": 3, + "global_rope_theta": 160000.0, + "gradient_checkpointing": false, + "hidden_activation": "gelu", + "hidden_size": 768, + "initializer_cutoff_factor": 2.0, + "initializer_range": 0.02, + "intermediate_size": 1152, + "layer_norm_eps": 1e-05, + "local_attention": 128, + "local_rope_theta": 10000.0, + "max_position_embeddings": 8192, + "mlp_bias": false, + "mlp_dropout": 0.0, + "model_type": "modernbert", + "norm_bias": false, + "norm_eps": 1e-05, + "num_attention_heads": 12, + "num_hidden_layers": 22, + "pad_token_id": 50283, + "position_embedding_type": "absolute", + "problem_type": "single_label_classification", + "reference_compile": true, + "sep_token_id": 50282, + "sparse_pred_ignore_index": -100, + "sparse_prediction": false, + "torch_dtype": "float32", + "transformers_version": "4.48.0.dev0", + "vocab_size": 50368 +} diff --git a/trial-3/checkpoint-1506/model.safetensors b/trial-3/checkpoint-1506/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..215b630a963742d9d4bbfed6eb6e55d3b754920c --- /dev/null +++ b/trial-3/checkpoint-1506/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:577af3b8b0a6d7db7f2ff1054a5c4c43704103dd0ed797800f9d9582a3237033 +size 598439784 diff --git a/trial-3/checkpoint-1506/optimizer.pt b/trial-3/checkpoint-1506/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a6762b6abd952285de010b6b0370cd57b63be85 --- /dev/null +++ b/trial-3/checkpoint-1506/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:309810681fe0458054a9e76c6bfbb6fc2862ae83f89b084906874442e8913f57 +size 1196967418 diff --git a/trial-3/checkpoint-1506/rng_state.pth b/trial-3/checkpoint-1506/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf3d91c5392ca6b7d7e0880933b7830a896d7c9e --- /dev/null +++ b/trial-3/checkpoint-1506/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:568428d80a25211a390c359ca51b0b20b38ca0607fbc196f106c9841c02d3e59 +size 14244 diff --git a/trial-3/checkpoint-1506/scheduler.pt b/trial-3/checkpoint-1506/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f131c33e989612c23ac3cf1568fa31b1782a8ae8 --- /dev/null +++ b/trial-3/checkpoint-1506/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77511df67542c270c7a8ed9a3ae9f0a88d6822756582e31cb89e7ee9b503abfb +size 1064 diff --git a/trial-3/checkpoint-1506/trainer_state.json b/trial-3/checkpoint-1506/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..394b21d4c91185d069ba4c10fe805b462822da99 --- /dev/null +++ b/trial-3/checkpoint-1506/trainer_state.json @@ -0,0 +1,255 @@ +{ + "best_metric": 0.03509189188480377, + "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-3/checkpoint-1506", + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1506, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.033200531208499334, + "grad_norm": 6.976862907409668, + "learning_rate": 2.8972663455552343e-06, + "loss": 0.5378, + "step": 50 + }, + { + "epoch": 0.06640106241699867, + "grad_norm": 3.674832344055176, + "learning_rate": 2.8648439379281615e-06, + "loss": 0.3375, + "step": 100 + }, + { + "epoch": 0.099601593625498, + "grad_norm": 2.678229570388794, + "learning_rate": 2.8324215303010886e-06, + "loss": 0.2213, + "step": 150 + }, + { + "epoch": 0.13280212483399734, + "grad_norm": 6.4370551109313965, + "learning_rate": 2.7999991226740153e-06, + "loss": 0.1558, + "step": 200 + }, + { + "epoch": 0.16600265604249667, + "grad_norm": 6.4544525146484375, + "learning_rate": 2.767576715046943e-06, + "loss": 0.1457, + "step": 250 + }, + { + "epoch": 0.199203187250996, + "grad_norm": 2.4753177165985107, + "learning_rate": 2.7351543074198696e-06, + "loss": 0.1349, + "step": 300 + }, + { + "epoch": 0.23240371845949534, + "grad_norm": 3.116945743560791, + "learning_rate": 2.7027318997927968e-06, + "loss": 0.1144, + "step": 350 + }, + { + "epoch": 0.2656042496679947, + "grad_norm": 10.000889778137207, + "learning_rate": 2.670309492165724e-06, + "loss": 0.0942, + "step": 400 + }, + { + "epoch": 0.29880478087649404, + "grad_norm": 0.3915446996688843, + "learning_rate": 2.637887084538651e-06, + "loss": 0.0841, + "step": 450 + }, + { + "epoch": 0.33200531208499334, + "grad_norm": 0.7093335390090942, + "learning_rate": 2.605464676911578e-06, + "loss": 0.0815, + "step": 500 + }, + { + "epoch": 0.3652058432934927, + "grad_norm": 5.660763263702393, + "learning_rate": 2.5730422692845053e-06, + "loss": 0.058, + "step": 550 + }, + { + "epoch": 0.398406374501992, + "grad_norm": 9.372917175292969, + "learning_rate": 2.5406198616574325e-06, + "loss": 0.0521, + "step": 600 + }, + { + "epoch": 0.4316069057104914, + "grad_norm": 6.086747169494629, + "learning_rate": 2.5081974540303596e-06, + "loss": 0.0671, + "step": 650 + }, + { + "epoch": 0.4648074369189907, + "grad_norm": 5.661391735076904, + "learning_rate": 2.4757750464032863e-06, + "loss": 0.0354, + "step": 700 + }, + { + "epoch": 0.49800796812749004, + "grad_norm": 1.4707638025283813, + "learning_rate": 2.443352638776214e-06, + "loss": 0.0386, + "step": 750 + }, + { + "epoch": 0.5312084993359893, + "grad_norm": 7.550576686859131, + "learning_rate": 2.4109302311491406e-06, + "loss": 0.0363, + "step": 800 + }, + { + "epoch": 0.5644090305444888, + "grad_norm": 11.072442054748535, + "learning_rate": 2.3785078235220678e-06, + "loss": 0.0254, + "step": 850 + }, + { + "epoch": 0.5976095617529881, + "grad_norm": 0.3040500581264496, + "learning_rate": 2.346085415894995e-06, + "loss": 0.018, + "step": 900 + }, + { + "epoch": 0.6308100929614874, + "grad_norm": 11.503410339355469, + "learning_rate": 2.313663008267922e-06, + "loss": 0.0302, + "step": 950 + }, + { + "epoch": 0.6640106241699867, + "grad_norm": 0.7599239945411682, + "learning_rate": 2.281240600640849e-06, + "loss": 0.0267, + "step": 1000 + }, + { + "epoch": 0.6972111553784861, + "grad_norm": 0.21025581657886505, + "learning_rate": 2.2488181930137764e-06, + "loss": 0.0211, + "step": 1050 + }, + { + "epoch": 0.7304116865869854, + "grad_norm": 11.052717208862305, + "learning_rate": 2.2163957853867035e-06, + "loss": 0.0112, + "step": 1100 + }, + { + "epoch": 0.7636122177954847, + "grad_norm": 0.0778539627790451, + "learning_rate": 2.1839733777596302e-06, + "loss": 0.0212, + "step": 1150 + }, + { + "epoch": 0.796812749003984, + "grad_norm": 0.050592467188835144, + "learning_rate": 2.151550970132558e-06, + "loss": 0.0082, + "step": 1200 + }, + { + "epoch": 0.8300132802124834, + "grad_norm": 0.04680703952908516, + "learning_rate": 2.1191285625054845e-06, + "loss": 0.008, + "step": 1250 + }, + { + "epoch": 0.8632138114209827, + "grad_norm": 127.69743347167969, + "learning_rate": 2.0867061548784117e-06, + "loss": 0.0192, + "step": 1300 + }, + { + "epoch": 0.896414342629482, + "grad_norm": 0.013791153207421303, + "learning_rate": 2.0542837472513392e-06, + "loss": 0.0063, + "step": 1350 + }, + { + "epoch": 0.9296148738379814, + "grad_norm": 0.011688283644616604, + "learning_rate": 2.021861339624266e-06, + "loss": 0.0068, + "step": 1400 + }, + { + "epoch": 0.9628154050464808, + "grad_norm": 14.885448455810547, + "learning_rate": 1.989438931997193e-06, + "loss": 0.004, + "step": 1450 + }, + { + "epoch": 0.9960159362549801, + "grad_norm": 0.38216766715049744, + "learning_rate": 1.9570165243701202e-06, + "loss": 0.0069, + "step": 1500 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.992604961832061, + "eval_f1": 0.9926480803352735, + "eval_loss": 0.03509189188480377, + "eval_precision": 0.9927020529431649, + "eval_recall": 0.992604961832061, + "eval_runtime": 31.6693, + "eval_samples_per_second": 264.736, + "eval_steps_per_second": 8.273, + "step": 1506 + } + ], + "logging_steps": 50, + "max_steps": 4518, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.641430544259072e+16, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/trial-3/checkpoint-1506/training_args.bin b/trial-3/checkpoint-1506/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..007bd7f3de5345c09391c213b1b1e412ba04ab11 --- /dev/null +++ b/trial-3/checkpoint-1506/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ed06b7fefd178dad53ae3fef61fd304580c1d532a37d5010e58ca8f39e302fa +size 5368 diff --git a/trial-4/checkpoint-3011/config.json b/trial-4/checkpoint-3011/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00 --- /dev/null +++ b/trial-4/checkpoint-3011/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "answerdotai/ModernBERT-base", + "architectures": [ + "ModernBertForSequenceClassification" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 50281, + "classifier_activation": "gelu", + "classifier_bias": false, + "classifier_dropout": 0.0, + "classifier_pooling": "mean", + "cls_token_id": 50281, + "decoder_bias": true, + "deterministic_flash_attn": false, + "embedding_dropout": 0.0, + "eos_token_id": 50282, + "global_attn_every_n_layers": 3, + "global_rope_theta": 160000.0, + "gradient_checkpointing": false, + "hidden_activation": "gelu", + "hidden_size": 768, + "initializer_cutoff_factor": 2.0, + "initializer_range": 0.02, + "intermediate_size": 1152, + "layer_norm_eps": 1e-05, + "local_attention": 128, + "local_rope_theta": 10000.0, + "max_position_embeddings": 8192, + "mlp_bias": false, + "mlp_dropout": 0.0, + "model_type": "modernbert", + "norm_bias": false, + "norm_eps": 1e-05, + "num_attention_heads": 12, + "num_hidden_layers": 22, + "pad_token_id": 50283, + "position_embedding_type": "absolute", + "problem_type": "single_label_classification", + "reference_compile": true, + "sep_token_id": 50282, + "sparse_pred_ignore_index": -100, + "sparse_prediction": false, + "torch_dtype": "float32", + "transformers_version": "4.48.0.dev0", + "vocab_size": 50368 +} diff --git a/trial-4/checkpoint-3011/model.safetensors b/trial-4/checkpoint-3011/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..962e142faf8e034b148e7e4d0c0bbe22787b4c06 --- /dev/null +++ b/trial-4/checkpoint-3011/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6998cd19c83cb7aad4574fdf2f2d1d911f7f01e8d94fcb558dc40e5561e3d188 +size 598439784 diff --git a/trial-4/checkpoint-3011/optimizer.pt b/trial-4/checkpoint-3011/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7e34671467adb09e9efd41322ae8421a33e01bd --- /dev/null +++ b/trial-4/checkpoint-3011/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0517bca24af0d5ed5988e5100a9e9f6f59df1b0d3e7ca53764baa7878d5d5e3 +size 1196967418 diff --git a/trial-4/checkpoint-3011/rng_state.pth b/trial-4/checkpoint-3011/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf3d91c5392ca6b7d7e0880933b7830a896d7c9e --- /dev/null +++ b/trial-4/checkpoint-3011/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:568428d80a25211a390c359ca51b0b20b38ca0607fbc196f106c9841c02d3e59 +size 14244 diff --git a/trial-4/checkpoint-3011/scheduler.pt b/trial-4/checkpoint-3011/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..386cae275a4fe4ba708aad4344c553b29e37f764 --- /dev/null +++ b/trial-4/checkpoint-3011/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c0dbc7f9aff9e32282e3dcfb80127104b5c3d0089b59d9cb1b981e6af6f8c41 +size 1064 diff --git a/trial-4/checkpoint-3011/trainer_state.json b/trial-4/checkpoint-3011/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0823a4d65e30d5a08293e438db6f325203ea94f0 --- /dev/null +++ b/trial-4/checkpoint-3011/trainer_state.json @@ -0,0 +1,465 @@ +{ + "best_metric": 0.02325253002345562, + "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-4/checkpoint-3011", + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3011, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016605778811026237, + "grad_norm": 7.4845476150512695, + "learning_rate": 1.3209406688296726e-05, + "loss": 0.427, + "step": 50 + }, + { + "epoch": 0.033211557622052475, + "grad_norm": 8.739913940429688, + "learning_rate": 1.3184989137392264e-05, + "loss": 0.2079, + "step": 100 + }, + { + "epoch": 0.04981733643307871, + "grad_norm": 10.918631553649902, + "learning_rate": 1.31605715864878e-05, + "loss": 0.1374, + "step": 150 + }, + { + "epoch": 0.06642311524410495, + "grad_norm": 0.09207049757242203, + "learning_rate": 1.3136154035583336e-05, + "loss": 0.0971, + "step": 200 + }, + { + "epoch": 0.08302889405513118, + "grad_norm": 0.1270512193441391, + "learning_rate": 1.3111736484678873e-05, + "loss": 0.0431, + "step": 250 + }, + { + "epoch": 0.09963467286615742, + "grad_norm": 0.01078485231846571, + "learning_rate": 1.3087318933774408e-05, + "loss": 0.0679, + "step": 300 + }, + { + "epoch": 0.11624045167718366, + "grad_norm": 0.16803160309791565, + "learning_rate": 1.3062901382869945e-05, + "loss": 0.0364, + "step": 350 + }, + { + "epoch": 0.1328462304882099, + "grad_norm": 0.2863476872444153, + "learning_rate": 1.303848383196548e-05, + "loss": 0.0802, + "step": 400 + }, + { + "epoch": 0.14945200929923613, + "grad_norm": 0.018498318269848824, + "learning_rate": 1.3014066281061019e-05, + "loss": 0.0324, + "step": 450 + }, + { + "epoch": 0.16605778811026237, + "grad_norm": 12.099262237548828, + "learning_rate": 1.2989648730156554e-05, + "loss": 0.0567, + "step": 500 + }, + { + "epoch": 0.1826635669212886, + "grad_norm": 0.04201498255133629, + "learning_rate": 1.296523117925209e-05, + "loss": 0.0265, + "step": 550 + }, + { + "epoch": 0.19926934573231483, + "grad_norm": 13.225788116455078, + "learning_rate": 1.2940813628347628e-05, + "loss": 0.027, + "step": 600 + }, + { + "epoch": 0.2158751245433411, + "grad_norm": 2.1863136291503906, + "learning_rate": 1.2916396077443163e-05, + "loss": 0.0325, + "step": 650 + }, + { + "epoch": 0.23248090335436733, + "grad_norm": 0.0031948979012668133, + "learning_rate": 1.28919785265387e-05, + "loss": 0.0378, + "step": 700 + }, + { + "epoch": 0.24908668216539356, + "grad_norm": 0.0001850352855399251, + "learning_rate": 1.2867560975634237e-05, + "loss": 0.0242, + "step": 750 + }, + { + "epoch": 0.2656924609764198, + "grad_norm": 0.0007033672300167382, + "learning_rate": 1.2843143424729772e-05, + "loss": 0.0306, + "step": 800 + }, + { + "epoch": 0.282298239787446, + "grad_norm": 13.938993453979492, + "learning_rate": 1.2818725873825309e-05, + "loss": 0.0458, + "step": 850 + }, + { + "epoch": 0.29890401859847227, + "grad_norm": 0.02099405601620674, + "learning_rate": 1.2794308322920844e-05, + "loss": 0.0306, + "step": 900 + }, + { + "epoch": 0.3155097974094985, + "grad_norm": 0.024268606677651405, + "learning_rate": 1.2769890772016383e-05, + "loss": 0.0142, + "step": 950 + }, + { + "epoch": 0.33211557622052473, + "grad_norm": 0.004759958013892174, + "learning_rate": 1.2745473221111918e-05, + "loss": 0.0141, + "step": 1000 + }, + { + "epoch": 0.348721355031551, + "grad_norm": 0.0019629066810011864, + "learning_rate": 1.2721055670207453e-05, + "loss": 0.0345, + "step": 1050 + }, + { + "epoch": 0.3653271338425772, + "grad_norm": 0.00019358922145329416, + "learning_rate": 1.2696638119302992e-05, + "loss": 0.0089, + "step": 1100 + }, + { + "epoch": 0.38193291265360346, + "grad_norm": 0.0028237327933311462, + "learning_rate": 1.2672220568398527e-05, + "loss": 0.0239, + "step": 1150 + }, + { + "epoch": 0.39853869146462967, + "grad_norm": 0.00010467255196999758, + "learning_rate": 1.2647803017494064e-05, + "loss": 0.0094, + "step": 1200 + }, + { + "epoch": 0.41514447027565593, + "grad_norm": 0.05774892866611481, + "learning_rate": 1.26233854665896e-05, + "loss": 0.0246, + "step": 1250 + }, + { + "epoch": 0.4317502490866822, + "grad_norm": 0.024394717067480087, + "learning_rate": 1.2598967915685136e-05, + "loss": 0.0328, + "step": 1300 + }, + { + "epoch": 0.4483560278977084, + "grad_norm": 2.231964349746704, + "learning_rate": 1.2574550364780673e-05, + "loss": 0.0204, + "step": 1350 + }, + { + "epoch": 0.46496180670873466, + "grad_norm": 0.0014322358183562756, + "learning_rate": 1.2550132813876208e-05, + "loss": 0.0001, + "step": 1400 + }, + { + "epoch": 0.48156758551976087, + "grad_norm": 0.001744006876833737, + "learning_rate": 1.2525715262971747e-05, + "loss": 0.0392, + "step": 1450 + }, + { + "epoch": 0.4981733643307871, + "grad_norm": 0.027050139382481575, + "learning_rate": 1.2501297712067282e-05, + "loss": 0.0151, + "step": 1500 + }, + { + "epoch": 0.5147791431418134, + "grad_norm": 0.0001924823591252789, + "learning_rate": 1.2476880161162817e-05, + "loss": 0.0036, + "step": 1550 + }, + { + "epoch": 0.5313849219528396, + "grad_norm": 4.767300128936768, + "learning_rate": 1.2452462610258356e-05, + "loss": 0.0148, + "step": 1600 + }, + { + "epoch": 0.5479907007638658, + "grad_norm": 0.0022574588656425476, + "learning_rate": 1.242804505935389e-05, + "loss": 0.0384, + "step": 1650 + }, + { + "epoch": 0.564596479574892, + "grad_norm": 0.12995891273021698, + "learning_rate": 1.2403627508449428e-05, + "loss": 0.018, + "step": 1700 + }, + { + "epoch": 0.5812022583859183, + "grad_norm": 0.0005374422180466354, + "learning_rate": 1.2379209957544964e-05, + "loss": 0.0039, + "step": 1750 + }, + { + "epoch": 0.5978080371969445, + "grad_norm": 0.004592420998960733, + "learning_rate": 1.23547924066405e-05, + "loss": 0.0136, + "step": 1800 + }, + { + "epoch": 0.6144138160079707, + "grad_norm": 0.0008812470478005707, + "learning_rate": 1.2330374855736037e-05, + "loss": 0.0167, + "step": 1850 + }, + { + "epoch": 0.631019594818997, + "grad_norm": 28.337797164916992, + "learning_rate": 1.2305957304831572e-05, + "loss": 0.0098, + "step": 1900 + }, + { + "epoch": 0.6476253736300233, + "grad_norm": 0.0003208396374247968, + "learning_rate": 1.228153975392711e-05, + "loss": 0.0083, + "step": 1950 + }, + { + "epoch": 0.6642311524410495, + "grad_norm": 0.004917904268950224, + "learning_rate": 1.2257122203022646e-05, + "loss": 0.012, + "step": 2000 + }, + { + "epoch": 0.6808369312520757, + "grad_norm": 0.0006444657919928432, + "learning_rate": 1.2232704652118182e-05, + "loss": 0.0006, + "step": 2050 + }, + { + "epoch": 0.697442710063102, + "grad_norm": 0.00020880017837043852, + "learning_rate": 1.220828710121372e-05, + "loss": 0.0169, + "step": 2100 + }, + { + "epoch": 0.7140484888741282, + "grad_norm": 0.009818737395107746, + "learning_rate": 1.2183869550309254e-05, + "loss": 0.0143, + "step": 2150 + }, + { + "epoch": 0.7306542676851544, + "grad_norm": 0.0009041284793056548, + "learning_rate": 1.2159451999404791e-05, + "loss": 0.0026, + "step": 2200 + }, + { + "epoch": 0.7472600464961807, + "grad_norm": 2.3109569549560547, + "learning_rate": 1.2135034448500328e-05, + "loss": 0.0062, + "step": 2250 + }, + { + "epoch": 0.7638658253072069, + "grad_norm": 9.242107807949651e-06, + "learning_rate": 1.2110616897595863e-05, + "loss": 0.0029, + "step": 2300 + }, + { + "epoch": 0.7804716041182331, + "grad_norm": 0.00020709235104732215, + "learning_rate": 1.20861993466914e-05, + "loss": 0.0, + "step": 2350 + }, + { + "epoch": 0.7970773829292593, + "grad_norm": 0.0008476360817439854, + "learning_rate": 1.2061781795786937e-05, + "loss": 0.019, + "step": 2400 + }, + { + "epoch": 0.8136831617402857, + "grad_norm": 0.0002165739715564996, + "learning_rate": 1.2037364244882474e-05, + "loss": 0.0, + "step": 2450 + }, + { + "epoch": 0.8302889405513119, + "grad_norm": 0.029956847429275513, + "learning_rate": 1.201294669397801e-05, + "loss": 0.0012, + "step": 2500 + }, + { + "epoch": 0.8468947193623381, + "grad_norm": 0.0002400112134637311, + "learning_rate": 1.1988529143073546e-05, + "loss": 0.0191, + "step": 2550 + }, + { + "epoch": 0.8635004981733644, + "grad_norm": 0.0070993551053106785, + "learning_rate": 1.1964111592169083e-05, + "loss": 0.0155, + "step": 2600 + }, + { + "epoch": 0.8801062769843906, + "grad_norm": 5.127764234202914e-05, + "learning_rate": 1.1939694041264618e-05, + "loss": 0.0185, + "step": 2650 + }, + { + "epoch": 0.8967120557954168, + "grad_norm": 0.056577421724796295, + "learning_rate": 1.1915276490360155e-05, + "loss": 0.0063, + "step": 2700 + }, + { + "epoch": 0.913317834606443, + "grad_norm": 4.399678437039256e-05, + "learning_rate": 1.1890858939455692e-05, + "loss": 0.012, + "step": 2750 + }, + { + "epoch": 0.9299236134174693, + "grad_norm": 6.6589759626367595e-06, + "learning_rate": 1.1866441388551227e-05, + "loss": 0.0001, + "step": 2800 + }, + { + "epoch": 0.9465293922284955, + "grad_norm": 0.009270718321204185, + "learning_rate": 1.1842023837646764e-05, + "loss": 0.0001, + "step": 2850 + }, + { + "epoch": 0.9631351710395217, + "grad_norm": 6.743930339813232, + "learning_rate": 1.1817606286742301e-05, + "loss": 0.0019, + "step": 2900 + }, + { + "epoch": 0.9797409498505479, + "grad_norm": 10.679564476013184, + "learning_rate": 1.1793188735837838e-05, + "loss": 0.0258, + "step": 2950 + }, + { + "epoch": 0.9963467286615743, + "grad_norm": 0.0007653234642930329, + "learning_rate": 1.1768771184933373e-05, + "loss": 0.0018, + "step": 3000 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.997256679389313, + "eval_f1": 0.9972464717374746, + "eval_loss": 0.02325253002345562, + "eval_precision": 0.997240941740882, + "eval_recall": 0.997256679389313, + "eval_runtime": 36.6991, + "eval_samples_per_second": 228.453, + "eval_steps_per_second": 14.278, + "step": 3011 + } + ], + "logging_steps": 50, + "max_steps": 27099, + "num_input_tokens_seen": 0, + "num_train_epochs": 9, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.641430544259072e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/trial-4/checkpoint-3011/training_args.bin b/trial-4/checkpoint-3011/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c3d749965ab6105cf60004e03a98b462c58a9dda --- /dev/null +++ b/trial-4/checkpoint-3011/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89fb66224a4a1dbc68c030610c33a1d3f64ca676b2064b388b8e2a7385785f5d +size 5368 diff --git a/trial-5/checkpoint-3012/config.json b/trial-5/checkpoint-3012/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00 --- /dev/null +++ b/trial-5/checkpoint-3012/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "answerdotai/ModernBERT-base", + "architectures": [ + "ModernBertForSequenceClassification" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 50281, + "classifier_activation": "gelu", + "classifier_bias": false, + "classifier_dropout": 0.0, + "classifier_pooling": "mean", + "cls_token_id": 50281, + "decoder_bias": true, + "deterministic_flash_attn": false, + "embedding_dropout": 0.0, + "eos_token_id": 50282, + "global_attn_every_n_layers": 3, + "global_rope_theta": 160000.0, + "gradient_checkpointing": false, + "hidden_activation": "gelu", + "hidden_size": 768, + "initializer_cutoff_factor": 2.0, + "initializer_range": 0.02, + "intermediate_size": 1152, + "layer_norm_eps": 1e-05, + "local_attention": 128, + "local_rope_theta": 10000.0, + "max_position_embeddings": 8192, + "mlp_bias": false, + "mlp_dropout": 0.0, + "model_type": "modernbert", + "norm_bias": false, + "norm_eps": 1e-05, + "num_attention_heads": 12, + "num_hidden_layers": 22, + "pad_token_id": 50283, + "position_embedding_type": "absolute", + "problem_type": "single_label_classification", + "reference_compile": true, + "sep_token_id": 50282, + "sparse_pred_ignore_index": -100, + "sparse_prediction": false, + "torch_dtype": "float32", + "transformers_version": "4.48.0.dev0", + "vocab_size": 50368 +} diff --git a/trial-5/checkpoint-3012/model.safetensors b/trial-5/checkpoint-3012/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cd374aa8cef0f5a071b2d854957020e3677a43b8 --- /dev/null +++ b/trial-5/checkpoint-3012/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49ba330b843aca1a1d0454785b900ed96671619efb6df36ea614d0870f5ef2aa +size 598439784 diff --git a/trial-5/checkpoint-3012/optimizer.pt b/trial-5/checkpoint-3012/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb5b62653bc37e0704cf26972fd6718323668a1d --- /dev/null +++ b/trial-5/checkpoint-3012/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60fb304abd0c5b9d4e6de61faca1856b99e71865a5c592f8acaa47567b9139d9 +size 1196967418 diff --git a/trial-5/checkpoint-3012/rng_state.pth b/trial-5/checkpoint-3012/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b387de0c48181ec5812538ddf1fc60cfda1a89c1 --- /dev/null +++ b/trial-5/checkpoint-3012/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef +size 14244 diff --git a/trial-5/checkpoint-3012/scheduler.pt b/trial-5/checkpoint-3012/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..478a2e191ea69f4a9f4e75e30b6dd1e8c7827fa8 --- /dev/null +++ b/trial-5/checkpoint-3012/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c69ec29ae0867d661613f53dea74fb003b51f72db6450102f05c6dfa235171f +size 1064 diff --git a/trial-5/checkpoint-3012/trainer_state.json b/trial-5/checkpoint-3012/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c471c3d8a2ad97774aac1e6eac580c42f92cf28b --- /dev/null +++ b/trial-5/checkpoint-3012/trainer_state.json @@ -0,0 +1,477 @@ +{ + "best_metric": 0.0418265163898468, + "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-5/checkpoint-3012", + "epoch": 2.0, + "eval_steps": 500, + "global_step": 3012, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.033200531208499334, + "grad_norm": 6.311530113220215, + "learning_rate": 1.279094112727349e-06, + "loss": 0.7104, + "step": 50 + }, + { + "epoch": 0.06640106241699867, + "grad_norm": 17.497058868408203, + "learning_rate": 1.2748333062225943e-06, + "loss": 0.5729, + "step": 100 + }, + { + "epoch": 0.099601593625498, + "grad_norm": 7.590151309967041, + "learning_rate": 1.2705724997178397e-06, + "loss": 0.4714, + "step": 150 + }, + { + "epoch": 0.13280212483399734, + "grad_norm": 6.96728515625, + "learning_rate": 1.2663116932130851e-06, + "loss": 0.3881, + "step": 200 + }, + { + "epoch": 0.16600265604249667, + "grad_norm": 4.9838714599609375, + "learning_rate": 1.2620508867083303e-06, + "loss": 0.3194, + "step": 250 + }, + { + "epoch": 0.199203187250996, + "grad_norm": 6.317371368408203, + "learning_rate": 1.2577900802035758e-06, + "loss": 0.2976, + "step": 300 + }, + { + "epoch": 0.23240371845949534, + "grad_norm": 15.331583023071289, + "learning_rate": 1.2535292736988212e-06, + "loss": 0.2392, + "step": 350 + }, + { + "epoch": 0.2656042496679947, + "grad_norm": 15.493165016174316, + "learning_rate": 1.2492684671940664e-06, + "loss": 0.2337, + "step": 400 + }, + { + "epoch": 0.29880478087649404, + "grad_norm": 3.7081472873687744, + "learning_rate": 1.2450076606893118e-06, + "loss": 0.2037, + "step": 450 + }, + { + "epoch": 0.33200531208499334, + "grad_norm": 4.029483318328857, + "learning_rate": 1.240746854184557e-06, + "loss": 0.2054, + "step": 500 + }, + { + "epoch": 0.3652058432934927, + "grad_norm": 4.573270797729492, + "learning_rate": 1.2364860476798024e-06, + "loss": 0.1555, + "step": 550 + }, + { + "epoch": 0.398406374501992, + "grad_norm": 15.748998641967773, + "learning_rate": 1.2322252411750478e-06, + "loss": 0.1486, + "step": 600 + }, + { + "epoch": 0.4316069057104914, + "grad_norm": 12.240307807922363, + "learning_rate": 1.227964434670293e-06, + "loss": 0.1552, + "step": 650 + }, + { + "epoch": 0.4648074369189907, + "grad_norm": 17.192546844482422, + "learning_rate": 1.2237036281655385e-06, + "loss": 0.1234, + "step": 700 + }, + { + "epoch": 0.49800796812749004, + "grad_norm": 11.04953670501709, + "learning_rate": 1.2194428216607839e-06, + "loss": 0.1212, + "step": 750 + }, + { + "epoch": 0.5312084993359893, + "grad_norm": 4.883615016937256, + "learning_rate": 1.215182015156029e-06, + "loss": 0.1059, + "step": 800 + }, + { + "epoch": 0.5644090305444888, + "grad_norm": 4.633565425872803, + "learning_rate": 1.2109212086512745e-06, + "loss": 0.0788, + "step": 850 + }, + { + "epoch": 0.5976095617529881, + "grad_norm": 2.6228833198547363, + "learning_rate": 1.20666040214652e-06, + "loss": 0.087, + "step": 900 + }, + { + "epoch": 0.6308100929614874, + "grad_norm": 6.4782915115356445, + "learning_rate": 1.2023995956417651e-06, + "loss": 0.0802, + "step": 950 + }, + { + "epoch": 0.6640106241699867, + "grad_norm": 5.229304313659668, + "learning_rate": 1.1981387891370103e-06, + "loss": 0.077, + "step": 1000 + }, + { + "epoch": 0.6972111553784861, + "grad_norm": 6.034313201904297, + "learning_rate": 1.1938779826322558e-06, + "loss": 0.0703, + "step": 1050 + }, + { + "epoch": 0.7304116865869854, + "grad_norm": 9.29736614227295, + "learning_rate": 1.1896171761275012e-06, + "loss": 0.066, + "step": 1100 + }, + { + "epoch": 0.7636122177954847, + "grad_norm": 0.6172637343406677, + "learning_rate": 1.1853563696227464e-06, + "loss": 0.0692, + "step": 1150 + }, + { + "epoch": 0.796812749003984, + "grad_norm": 1.642548680305481, + "learning_rate": 1.1810955631179918e-06, + "loss": 0.0437, + "step": 1200 + }, + { + "epoch": 0.8300132802124834, + "grad_norm": 3.888737916946411, + "learning_rate": 1.176834756613237e-06, + "loss": 0.0474, + "step": 1250 + }, + { + "epoch": 0.8632138114209827, + "grad_norm": 14.787779808044434, + "learning_rate": 1.1725739501084824e-06, + "loss": 0.0501, + "step": 1300 + }, + { + "epoch": 0.896414342629482, + "grad_norm": 0.8571153283119202, + "learning_rate": 1.1683131436037278e-06, + "loss": 0.0439, + "step": 1350 + }, + { + "epoch": 0.9296148738379814, + "grad_norm": 0.6915457248687744, + "learning_rate": 1.164052337098973e-06, + "loss": 0.0455, + "step": 1400 + }, + { + "epoch": 0.9628154050464808, + "grad_norm": 8.8081636428833, + "learning_rate": 1.1597915305942185e-06, + "loss": 0.0347, + "step": 1450 + }, + { + "epoch": 0.9960159362549801, + "grad_norm": 8.551522254943848, + "learning_rate": 1.1555307240894639e-06, + "loss": 0.0346, + "step": 1500 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.982824427480916, + "eval_f1": 0.9838970307302017, + "eval_loss": 0.05475565418601036, + "eval_precision": 0.986134299459291, + "eval_recall": 0.982824427480916, + "eval_runtime": 31.8933, + "eval_samples_per_second": 262.877, + "eval_steps_per_second": 8.215, + "step": 1506 + }, + { + "epoch": 1.0292164674634794, + "grad_norm": 13.078969955444336, + "learning_rate": 1.151269917584709e-06, + "loss": 0.0379, + "step": 1550 + }, + { + "epoch": 1.0624169986719787, + "grad_norm": 1.906078815460205, + "learning_rate": 1.1470091110799545e-06, + "loss": 0.0338, + "step": 1600 + }, + { + "epoch": 1.095617529880478, + "grad_norm": 0.4020080864429474, + "learning_rate": 1.1427483045752e-06, + "loss": 0.0298, + "step": 1650 + }, + { + "epoch": 1.1288180610889773, + "grad_norm": 2.647258758544922, + "learning_rate": 1.1384874980704451e-06, + "loss": 0.023, + "step": 1700 + }, + { + "epoch": 1.1620185922974768, + "grad_norm": 2.046747922897339, + "learning_rate": 1.1342266915656906e-06, + "loss": 0.0253, + "step": 1750 + }, + { + "epoch": 1.1952191235059761, + "grad_norm": 13.14510726928711, + "learning_rate": 1.129965885060936e-06, + "loss": 0.0268, + "step": 1800 + }, + { + "epoch": 1.2284196547144755, + "grad_norm": 0.12764006853103638, + "learning_rate": 1.1257050785561812e-06, + "loss": 0.0099, + "step": 1850 + }, + { + "epoch": 1.2616201859229748, + "grad_norm": 1.6261545419692993, + "learning_rate": 1.1214442720514266e-06, + "loss": 0.0252, + "step": 1900 + }, + { + "epoch": 1.294820717131474, + "grad_norm": 5.552518844604492, + "learning_rate": 1.117183465546672e-06, + "loss": 0.036, + "step": 1950 + }, + { + "epoch": 1.3280212483399734, + "grad_norm": 24.064516067504883, + "learning_rate": 1.1129226590419172e-06, + "loss": 0.0169, + "step": 2000 + }, + { + "epoch": 1.361221779548473, + "grad_norm": 0.00925782322883606, + "learning_rate": 1.1086618525371626e-06, + "loss": 0.0184, + "step": 2050 + }, + { + "epoch": 1.3944223107569722, + "grad_norm": 16.54283905029297, + "learning_rate": 1.1044010460324078e-06, + "loss": 0.0139, + "step": 2100 + }, + { + "epoch": 1.4276228419654715, + "grad_norm": 0.24406713247299194, + "learning_rate": 1.1001402395276533e-06, + "loss": 0.0126, + "step": 2150 + }, + { + "epoch": 1.4608233731739708, + "grad_norm": 0.02731563337147236, + "learning_rate": 1.0958794330228987e-06, + "loss": 0.0198, + "step": 2200 + }, + { + "epoch": 1.4940239043824701, + "grad_norm": 17.53055191040039, + "learning_rate": 1.0916186265181439e-06, + "loss": 0.0303, + "step": 2250 + }, + { + "epoch": 1.5272244355909694, + "grad_norm": 0.07282107323408127, + "learning_rate": 1.0873578200133893e-06, + "loss": 0.0016, + "step": 2300 + }, + { + "epoch": 1.5604249667994687, + "grad_norm": 20.794416427612305, + "learning_rate": 1.0830970135086347e-06, + "loss": 0.0225, + "step": 2350 + }, + { + "epoch": 1.593625498007968, + "grad_norm": 0.052418053150177, + "learning_rate": 1.07883620700388e-06, + "loss": 0.0076, + "step": 2400 + }, + { + "epoch": 1.6268260292164674, + "grad_norm": 0.21063362061977386, + "learning_rate": 1.0745754004991254e-06, + "loss": 0.0159, + "step": 2450 + }, + { + "epoch": 1.6600265604249667, + "grad_norm": 10.455537796020508, + "learning_rate": 1.0703145939943708e-06, + "loss": 0.0105, + "step": 2500 + }, + { + "epoch": 1.6932270916334662, + "grad_norm": 6.205326557159424, + "learning_rate": 1.066053787489616e-06, + "loss": 0.0081, + "step": 2550 + }, + { + "epoch": 1.7264276228419655, + "grad_norm": 6.523694038391113, + "learning_rate": 1.0617929809848614e-06, + "loss": 0.0159, + "step": 2600 + }, + { + "epoch": 1.7596281540504648, + "grad_norm": 0.010043232701718807, + "learning_rate": 1.0575321744801068e-06, + "loss": 0.0113, + "step": 2650 + }, + { + "epoch": 1.792828685258964, + "grad_norm": 0.00458578672260046, + "learning_rate": 1.053271367975352e-06, + "loss": 0.0086, + "step": 2700 + }, + { + "epoch": 1.8260292164674636, + "grad_norm": 0.10986531525850296, + "learning_rate": 1.0490105614705974e-06, + "loss": 0.008, + "step": 2750 + }, + { + "epoch": 1.859229747675963, + "grad_norm": 0.12284637242555618, + "learning_rate": 1.0447497549658429e-06, + "loss": 0.0052, + "step": 2800 + }, + { + "epoch": 1.8924302788844622, + "grad_norm": 0.14606119692325592, + "learning_rate": 1.040488948461088e-06, + "loss": 0.0176, + "step": 2850 + }, + { + "epoch": 1.9256308100929616, + "grad_norm": 0.020491423085331917, + "learning_rate": 1.0362281419563333e-06, + "loss": 0.0102, + "step": 2900 + }, + { + "epoch": 1.9588313413014609, + "grad_norm": 0.05764462426304817, + "learning_rate": 1.0319673354515787e-06, + "loss": 0.0044, + "step": 2950 + }, + { + "epoch": 1.9920318725099602, + "grad_norm": 0.7329011559486389, + "learning_rate": 1.027706528946824e-06, + "loss": 0.0139, + "step": 3000 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.9924856870229007, + "eval_f1": 0.9924235722235019, + "eval_loss": 0.0418265163898468, + "eval_precision": 0.9923830636545329, + "eval_recall": 0.9924856870229007, + "eval_runtime": 31.6222, + "eval_samples_per_second": 265.131, + "eval_steps_per_second": 8.285, + "step": 3012 + } + ], + "logging_steps": 50, + "max_steps": 15060, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.282861088518144e+16, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/trial-5/checkpoint-3012/training_args.bin b/trial-5/checkpoint-3012/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4692f33409907d8d382ead52b8f9423cf80dd960 --- /dev/null +++ b/trial-5/checkpoint-3012/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b5a07ff58876babfad1d92462cc9e7062c8f5b0af8d8ba9142ab6f5e8880cf2 +size 5368 diff --git a/trial-6/checkpoint-6022/config.json b/trial-6/checkpoint-6022/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00 --- /dev/null +++ b/trial-6/checkpoint-6022/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "answerdotai/ModernBERT-base", + "architectures": [ + "ModernBertForSequenceClassification" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 50281, + "classifier_activation": "gelu", + "classifier_bias": false, + "classifier_dropout": 0.0, + "classifier_pooling": "mean", + "cls_token_id": 50281, + "decoder_bias": true, + "deterministic_flash_attn": false, + "embedding_dropout": 0.0, + "eos_token_id": 50282, + "global_attn_every_n_layers": 3, + "global_rope_theta": 160000.0, + "gradient_checkpointing": false, + "hidden_activation": "gelu", + "hidden_size": 768, + "initializer_cutoff_factor": 2.0, + "initializer_range": 0.02, + "intermediate_size": 1152, + "layer_norm_eps": 1e-05, + "local_attention": 128, + "local_rope_theta": 10000.0, + "max_position_embeddings": 8192, + "mlp_bias": false, + "mlp_dropout": 0.0, + "model_type": "modernbert", + "norm_bias": false, + "norm_eps": 1e-05, + "num_attention_heads": 12, + "num_hidden_layers": 22, + "pad_token_id": 50283, + "position_embedding_type": "absolute", + "problem_type": "single_label_classification", + "reference_compile": true, + "sep_token_id": 50282, + "sparse_pred_ignore_index": -100, + "sparse_prediction": false, + "torch_dtype": "float32", + "transformers_version": "4.48.0.dev0", + "vocab_size": 50368 +} diff --git a/trial-6/checkpoint-6022/model.safetensors b/trial-6/checkpoint-6022/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f3124a5206ff69a1d2328df7fd8330a3b17025d0 --- /dev/null +++ b/trial-6/checkpoint-6022/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a60e2fc558ad0e5a9a4825234c28006f4c14c02aab969b5ebf7cb43d8f890d9e +size 598439784 diff --git a/trial-6/checkpoint-6022/optimizer.pt b/trial-6/checkpoint-6022/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..df6eee38753d5beb9baf27e2f30a8ffe07b42512 --- /dev/null +++ b/trial-6/checkpoint-6022/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ccfa5cc878422afe6f38c7ea21cef7e9f532ec15d2d9169693197daa8b04fb0 +size 1196967418 diff --git a/trial-6/checkpoint-6022/rng_state.pth b/trial-6/checkpoint-6022/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b387de0c48181ec5812538ddf1fc60cfda1a89c1 --- /dev/null +++ b/trial-6/checkpoint-6022/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef +size 14244 diff --git a/trial-6/checkpoint-6022/scheduler.pt b/trial-6/checkpoint-6022/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e8e492dc07ae1b174c07babfc899bd22becedc0 --- /dev/null +++ b/trial-6/checkpoint-6022/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f20e2229466448de860622239acb9999c7ec64084a1decca7269c9cb3644988 +size 1064 diff --git a/trial-6/checkpoint-6022/trainer_state.json b/trial-6/checkpoint-6022/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..693b4b5a4df0a852da03bc416238e936836f6edb --- /dev/null +++ b/trial-6/checkpoint-6022/trainer_state.json @@ -0,0 +1,897 @@ +{ + "best_metric": 0.03524520993232727, + "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-6/checkpoint-6022", + "epoch": 2.0, + "eval_steps": 500, + "global_step": 6022, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016605778811026237, + "grad_norm": 7.985777378082275, + "learning_rate": 3.663949947127632e-06, + "loss": 0.6449, + "step": 50 + }, + { + "epoch": 0.033211557622052475, + "grad_norm": 16.946643829345703, + "learning_rate": 3.64867585196702e-06, + "loss": 0.4098, + "step": 100 + }, + { + "epoch": 0.04981733643307871, + "grad_norm": 15.02230167388916, + "learning_rate": 3.633401756806408e-06, + "loss": 0.2997, + "step": 150 + }, + { + "epoch": 0.06642311524410495, + "grad_norm": 2.651068925857544, + "learning_rate": 3.6181276616457957e-06, + "loss": 0.2322, + "step": 200 + }, + { + "epoch": 0.08302889405513118, + "grad_norm": 71.46488189697266, + "learning_rate": 3.602853566485183e-06, + "loss": 0.1922, + "step": 250 + }, + { + "epoch": 0.09963467286615742, + "grad_norm": 2.4328176975250244, + "learning_rate": 3.5875794713245715e-06, + "loss": 0.1731, + "step": 300 + }, + { + "epoch": 0.11624045167718366, + "grad_norm": 8.744805335998535, + "learning_rate": 3.5723053761639594e-06, + "loss": 0.1264, + "step": 350 + }, + { + "epoch": 0.1328462304882099, + "grad_norm": 10.860421180725098, + "learning_rate": 3.557031281003347e-06, + "loss": 0.1423, + "step": 400 + }, + { + "epoch": 0.14945200929923613, + "grad_norm": 1.3849588632583618, + "learning_rate": 3.5417571858427352e-06, + "loss": 0.1, + "step": 450 + }, + { + "epoch": 0.16605778811026237, + "grad_norm": 18.67996597290039, + "learning_rate": 3.526483090682123e-06, + "loss": 0.1297, + "step": 500 + }, + { + "epoch": 0.1826635669212886, + "grad_norm": 22.31239128112793, + "learning_rate": 3.5112089955215106e-06, + "loss": 0.1266, + "step": 550 + }, + { + "epoch": 0.19926934573231483, + "grad_norm": 7.551675319671631, + "learning_rate": 3.4959349003608985e-06, + "loss": 0.0872, + "step": 600 + }, + { + "epoch": 0.2158751245433411, + "grad_norm": 0.4732609987258911, + "learning_rate": 3.480660805200287e-06, + "loss": 0.0735, + "step": 650 + }, + { + "epoch": 0.23248090335436733, + "grad_norm": 0.4966350495815277, + "learning_rate": 3.4653867100396748e-06, + "loss": 0.1583, + "step": 700 + }, + { + "epoch": 0.24908668216539356, + "grad_norm": 0.5777727961540222, + "learning_rate": 3.4501126148790623e-06, + "loss": 0.0954, + "step": 750 + }, + { + "epoch": 0.2656924609764198, + "grad_norm": 3.709627389907837, + "learning_rate": 3.4348385197184506e-06, + "loss": 0.07, + "step": 800 + }, + { + "epoch": 0.282298239787446, + "grad_norm": 7.013435363769531, + "learning_rate": 3.4195644245578385e-06, + "loss": 0.1039, + "step": 850 + }, + { + "epoch": 0.29890401859847227, + "grad_norm": 0.41413068771362305, + "learning_rate": 3.404290329397226e-06, + "loss": 0.0699, + "step": 900 + }, + { + "epoch": 0.3155097974094985, + "grad_norm": 0.23823711276054382, + "learning_rate": 3.3890162342366143e-06, + "loss": 0.0836, + "step": 950 + }, + { + "epoch": 0.33211557622052473, + "grad_norm": 0.011693170294165611, + "learning_rate": 3.3737421390760022e-06, + "loss": 0.0571, + "step": 1000 + }, + { + "epoch": 0.348721355031551, + "grad_norm": 0.003961833659559488, + "learning_rate": 3.3584680439153897e-06, + "loss": 0.0516, + "step": 1050 + }, + { + "epoch": 0.3653271338425772, + "grad_norm": 0.007026594132184982, + "learning_rate": 3.3431939487547776e-06, + "loss": 0.0723, + "step": 1100 + }, + { + "epoch": 0.38193291265360346, + "grad_norm": 0.024607744067907333, + "learning_rate": 3.327919853594166e-06, + "loss": 0.0412, + "step": 1150 + }, + { + "epoch": 0.39853869146462967, + "grad_norm": 0.005524761509150267, + "learning_rate": 3.3126457584335534e-06, + "loss": 0.0246, + "step": 1200 + }, + { + "epoch": 0.41514447027565593, + "grad_norm": 0.13576415181159973, + "learning_rate": 3.2973716632729413e-06, + "loss": 0.0349, + "step": 1250 + }, + { + "epoch": 0.4317502490866822, + "grad_norm": 3.3155462741851807, + "learning_rate": 3.2820975681123297e-06, + "loss": 0.0547, + "step": 1300 + }, + { + "epoch": 0.4483560278977084, + "grad_norm": 0.3045770823955536, + "learning_rate": 3.266823472951717e-06, + "loss": 0.0406, + "step": 1350 + }, + { + "epoch": 0.46496180670873466, + "grad_norm": 0.003651317674666643, + "learning_rate": 3.251549377791105e-06, + "loss": 0.0064, + "step": 1400 + }, + { + "epoch": 0.48156758551976087, + "grad_norm": 0.06915970891714096, + "learning_rate": 3.236275282630493e-06, + "loss": 0.0259, + "step": 1450 + }, + { + "epoch": 0.4981733643307871, + "grad_norm": 0.006649952847510576, + "learning_rate": 3.221001187469881e-06, + "loss": 0.0209, + "step": 1500 + }, + { + "epoch": 0.5147791431418134, + "grad_norm": 0.0077498299069702625, + "learning_rate": 3.2057270923092688e-06, + "loss": 0.0128, + "step": 1550 + }, + { + "epoch": 0.5313849219528396, + "grad_norm": 17.972652435302734, + "learning_rate": 3.1904529971486567e-06, + "loss": 0.0522, + "step": 1600 + }, + { + "epoch": 0.5479907007638658, + "grad_norm": 0.02570178173482418, + "learning_rate": 3.1751789019880446e-06, + "loss": 0.0236, + "step": 1650 + }, + { + "epoch": 0.564596479574892, + "grad_norm": 0.01210557110607624, + "learning_rate": 3.1599048068274325e-06, + "loss": 0.0116, + "step": 1700 + }, + { + "epoch": 0.5812022583859183, + "grad_norm": 0.0021121352910995483, + "learning_rate": 3.1446307116668204e-06, + "loss": 0.004, + "step": 1750 + }, + { + "epoch": 0.5978080371969445, + "grad_norm": 0.0030424538999795914, + "learning_rate": 3.129356616506208e-06, + "loss": 0.0086, + "step": 1800 + }, + { + "epoch": 0.6144138160079707, + "grad_norm": 0.01042268518358469, + "learning_rate": 3.114082521345596e-06, + "loss": 0.0302, + "step": 1850 + }, + { + "epoch": 0.631019594818997, + "grad_norm": 1.67741858959198, + "learning_rate": 3.098808426184984e-06, + "loss": 0.0318, + "step": 1900 + }, + { + "epoch": 0.6476253736300233, + "grad_norm": 3.324981689453125, + "learning_rate": 3.083534331024372e-06, + "loss": 0.0308, + "step": 1950 + }, + { + "epoch": 0.6642311524410495, + "grad_norm": 0.05241026356816292, + "learning_rate": 3.06826023586376e-06, + "loss": 0.0229, + "step": 2000 + }, + { + "epoch": 0.6808369312520757, + "grad_norm": 0.09731736034154892, + "learning_rate": 3.052986140703148e-06, + "loss": 0.027, + "step": 2050 + }, + { + "epoch": 0.697442710063102, + "grad_norm": 0.09534373879432678, + "learning_rate": 3.0377120455425357e-06, + "loss": 0.0278, + "step": 2100 + }, + { + "epoch": 0.7140484888741282, + "grad_norm": 0.028926922008395195, + "learning_rate": 3.0224379503819232e-06, + "loss": 0.0195, + "step": 2150 + }, + { + "epoch": 0.7306542676851544, + "grad_norm": 0.003165798494592309, + "learning_rate": 3.0071638552213115e-06, + "loss": 0.0127, + "step": 2200 + }, + { + "epoch": 0.7472600464961807, + "grad_norm": 1.1501398086547852, + "learning_rate": 2.9918897600606995e-06, + "loss": 0.0228, + "step": 2250 + }, + { + "epoch": 0.7638658253072069, + "grad_norm": 0.015178106725215912, + "learning_rate": 2.976615664900087e-06, + "loss": 0.0211, + "step": 2300 + }, + { + "epoch": 0.7804716041182331, + "grad_norm": 0.043032608926296234, + "learning_rate": 2.9613415697394753e-06, + "loss": 0.0007, + "step": 2350 + }, + { + "epoch": 0.7970773829292593, + "grad_norm": 0.0029065206181257963, + "learning_rate": 2.946067474578863e-06, + "loss": 0.0083, + "step": 2400 + }, + { + "epoch": 0.8136831617402857, + "grad_norm": 0.006119361147284508, + "learning_rate": 2.9307933794182507e-06, + "loss": 0.0005, + "step": 2450 + }, + { + "epoch": 0.8302889405513119, + "grad_norm": 0.0004126826534047723, + "learning_rate": 2.915519284257639e-06, + "loss": 0.002, + "step": 2500 + }, + { + "epoch": 0.8468947193623381, + "grad_norm": 0.0028823954053223133, + "learning_rate": 2.900245189097027e-06, + "loss": 0.0105, + "step": 2550 + }, + { + "epoch": 0.8635004981733644, + "grad_norm": 0.03489808738231659, + "learning_rate": 2.8849710939364144e-06, + "loss": 0.0132, + "step": 2600 + }, + { + "epoch": 0.8801062769843906, + "grad_norm": 0.0013511483557522297, + "learning_rate": 2.8696969987758023e-06, + "loss": 0.0018, + "step": 2650 + }, + { + "epoch": 0.8967120557954168, + "grad_norm": 0.00025652506155893207, + "learning_rate": 2.8544229036151906e-06, + "loss": 0.0059, + "step": 2700 + }, + { + "epoch": 0.913317834606443, + "grad_norm": 0.005165331996977329, + "learning_rate": 2.839148808454578e-06, + "loss": 0.0063, + "step": 2750 + }, + { + "epoch": 0.9299236134174693, + "grad_norm": 0.00012050822988385335, + "learning_rate": 2.823874713293966e-06, + "loss": 0.0009, + "step": 2800 + }, + { + "epoch": 0.9465293922284955, + "grad_norm": 0.026708438992500305, + "learning_rate": 2.8086006181333543e-06, + "loss": 0.0, + "step": 2850 + }, + { + "epoch": 0.9631351710395217, + "grad_norm": 35.39009475708008, + "learning_rate": 2.793326522972742e-06, + "loss": 0.0154, + "step": 2900 + }, + { + "epoch": 0.9797409498505479, + "grad_norm": 143.70465087890625, + "learning_rate": 2.7780524278121297e-06, + "loss": 0.0299, + "step": 2950 + }, + { + "epoch": 0.9963467286615743, + "grad_norm": 0.00032583833672106266, + "learning_rate": 2.7627783326515176e-06, + "loss": 0.0058, + "step": 3000 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.9959446564885496, + "eval_f1": 0.9958699330259847, + "eval_loss": 0.03994645178318024, + "eval_precision": 0.9959073754230947, + "eval_recall": 0.9959446564885496, + "eval_runtime": 36.9535, + "eval_samples_per_second": 226.88, + "eval_steps_per_second": 14.18, + "step": 3011 + }, + { + "epoch": 1.0129525074726005, + "grad_norm": 0.00038893838063813746, + "learning_rate": 2.7475042374909055e-06, + "loss": 0.0094, + "step": 3050 + }, + { + "epoch": 1.0295582862836268, + "grad_norm": 0.0011424238327890635, + "learning_rate": 2.7322301423302934e-06, + "loss": 0.0001, + "step": 3100 + }, + { + "epoch": 1.0461640650946529, + "grad_norm": 0.01706782355904579, + "learning_rate": 2.7169560471696814e-06, + "loss": 0.0041, + "step": 3150 + }, + { + "epoch": 1.0627698439056792, + "grad_norm": 0.00026497532962821424, + "learning_rate": 2.7016819520090697e-06, + "loss": 0.0001, + "step": 3200 + }, + { + "epoch": 1.0793756227167055, + "grad_norm": 0.4866068363189697, + "learning_rate": 2.686407856848457e-06, + "loss": 0.026, + "step": 3250 + }, + { + "epoch": 1.0959814015277316, + "grad_norm": 8.705830987310037e-05, + "learning_rate": 2.671133761687845e-06, + "loss": 0.0156, + "step": 3300 + }, + { + "epoch": 1.112587180338758, + "grad_norm": 0.004105957690626383, + "learning_rate": 2.655859666527233e-06, + "loss": 0.0028, + "step": 3350 + }, + { + "epoch": 1.1291929591497842, + "grad_norm": 0.001343347830697894, + "learning_rate": 2.640585571366621e-06, + "loss": 0.0115, + "step": 3400 + }, + { + "epoch": 1.1457987379608103, + "grad_norm": 8.608686039224267e-05, + "learning_rate": 2.625311476206009e-06, + "loss": 0.0065, + "step": 3450 + }, + { + "epoch": 1.1624045167718366, + "grad_norm": 0.001792514231055975, + "learning_rate": 2.6100373810453967e-06, + "loss": 0.0078, + "step": 3500 + }, + { + "epoch": 1.1790102955828627, + "grad_norm": 0.0008498657844029367, + "learning_rate": 2.5947632858847846e-06, + "loss": 0.0, + "step": 3550 + }, + { + "epoch": 1.195616074393889, + "grad_norm": 0.012572677806019783, + "learning_rate": 2.5794891907241725e-06, + "loss": 0.0005, + "step": 3600 + }, + { + "epoch": 1.2122218532049154, + "grad_norm": 0.0010890236590057611, + "learning_rate": 2.5642150955635604e-06, + "loss": 0.0, + "step": 3650 + }, + { + "epoch": 1.2288276320159415, + "grad_norm": 0.0009271232993341982, + "learning_rate": 2.548941000402948e-06, + "loss": 0.0, + "step": 3700 + }, + { + "epoch": 1.2454334108269678, + "grad_norm": 0.0008255397551693022, + "learning_rate": 2.5336669052423362e-06, + "loss": 0.0155, + "step": 3750 + }, + { + "epoch": 1.2620391896379939, + "grad_norm": 0.0051245614886283875, + "learning_rate": 2.518392810081724e-06, + "loss": 0.0022, + "step": 3800 + }, + { + "epoch": 1.2786449684490202, + "grad_norm": 0.01625339686870575, + "learning_rate": 2.5031187149211116e-06, + "loss": 0.006, + "step": 3850 + }, + { + "epoch": 1.2952507472600465, + "grad_norm": 0.0009482129826210439, + "learning_rate": 2.4878446197605e-06, + "loss": 0.0001, + "step": 3900 + }, + { + "epoch": 1.3118565260710726, + "grad_norm": 0.00012260080256965011, + "learning_rate": 2.472570524599888e-06, + "loss": 0.0001, + "step": 3950 + }, + { + "epoch": 1.328462304882099, + "grad_norm": 0.0005531097413040698, + "learning_rate": 2.4572964294392753e-06, + "loss": 0.0206, + "step": 4000 + }, + { + "epoch": 1.3450680836931252, + "grad_norm": 0.00046819329145364463, + "learning_rate": 2.4420223342786637e-06, + "loss": 0.0015, + "step": 4050 + }, + { + "epoch": 1.3616738625041513, + "grad_norm": 0.0008780017960816622, + "learning_rate": 2.4267482391180516e-06, + "loss": 0.0004, + "step": 4100 + }, + { + "epoch": 1.3782796413151777, + "grad_norm": 0.0001749313232721761, + "learning_rate": 2.411474143957439e-06, + "loss": 0.0097, + "step": 4150 + }, + { + "epoch": 1.394885420126204, + "grad_norm": 0.0004841366899199784, + "learning_rate": 2.396200048796827e-06, + "loss": 0.0, + "step": 4200 + }, + { + "epoch": 1.41149119893723, + "grad_norm": 0.0015521385939791799, + "learning_rate": 2.3809259536362153e-06, + "loss": 0.0006, + "step": 4250 + }, + { + "epoch": 1.4280969777482564, + "grad_norm": 0.0003654654719866812, + "learning_rate": 2.3656518584756028e-06, + "loss": 0.0007, + "step": 4300 + }, + { + "epoch": 1.4447027565592827, + "grad_norm": 0.0011144432937726378, + "learning_rate": 2.3503777633149907e-06, + "loss": 0.0131, + "step": 4350 + }, + { + "epoch": 1.4613085353703088, + "grad_norm": 0.011592933908104897, + "learning_rate": 2.335103668154379e-06, + "loss": 0.0036, + "step": 4400 + }, + { + "epoch": 1.4779143141813351, + "grad_norm": 0.026564495638012886, + "learning_rate": 2.319829572993767e-06, + "loss": 0.0001, + "step": 4450 + }, + { + "epoch": 1.4945200929923614, + "grad_norm": 0.003402173984795809, + "learning_rate": 2.3045554778331544e-06, + "loss": 0.0052, + "step": 4500 + }, + { + "epoch": 1.5111258718033875, + "grad_norm": 0.0031449920497834682, + "learning_rate": 2.2892813826725423e-06, + "loss": 0.0, + "step": 4550 + }, + { + "epoch": 1.5277316506144138, + "grad_norm": 0.000741615192964673, + "learning_rate": 2.2740072875119306e-06, + "loss": 0.0, + "step": 4600 + }, + { + "epoch": 1.5443374294254402, + "grad_norm": 0.00041236402466893196, + "learning_rate": 2.258733192351318e-06, + "loss": 0.0038, + "step": 4650 + }, + { + "epoch": 1.5609432082364663, + "grad_norm": 0.02141823247075081, + "learning_rate": 2.243459097190706e-06, + "loss": 0.0, + "step": 4700 + }, + { + "epoch": 1.5775489870474926, + "grad_norm": 0.00028093060245737433, + "learning_rate": 2.2281850020300944e-06, + "loss": 0.0, + "step": 4750 + }, + { + "epoch": 1.594154765858519, + "grad_norm": 0.0016667908057570457, + "learning_rate": 2.212910906869482e-06, + "loss": 0.0, + "step": 4800 + }, + { + "epoch": 1.610760544669545, + "grad_norm": 0.0005989013588987291, + "learning_rate": 2.1976368117088698e-06, + "loss": 0.0, + "step": 4850 + }, + { + "epoch": 1.627366323480571, + "grad_norm": 0.0007902457728050649, + "learning_rate": 2.1823627165482577e-06, + "loss": 0.0023, + "step": 4900 + }, + { + "epoch": 1.6439721022915976, + "grad_norm": 6.165813829284161e-05, + "learning_rate": 2.1670886213876456e-06, + "loss": 0.0, + "step": 4950 + }, + { + "epoch": 1.6605778811026237, + "grad_norm": 0.0002811133745126426, + "learning_rate": 2.1518145262270335e-06, + "loss": 0.0, + "step": 5000 + }, + { + "epoch": 1.6771836599136498, + "grad_norm": 0.00117580930236727, + "learning_rate": 2.1365404310664214e-06, + "loss": 0.0, + "step": 5050 + }, + { + "epoch": 1.6937894387246761, + "grad_norm": 3.999446926172823e-05, + "learning_rate": 2.1212663359058093e-06, + "loss": 0.0, + "step": 5100 + }, + { + "epoch": 1.7103952175357025, + "grad_norm": 0.0005360537325032055, + "learning_rate": 2.105992240745197e-06, + "loss": 0.0, + "step": 5150 + }, + { + "epoch": 1.7270009963467285, + "grad_norm": 0.8813786506652832, + "learning_rate": 2.090718145584585e-06, + "loss": 0.0137, + "step": 5200 + }, + { + "epoch": 1.7436067751577549, + "grad_norm": 0.046743907034397125, + "learning_rate": 2.0754440504239726e-06, + "loss": 0.0071, + "step": 5250 + }, + { + "epoch": 1.7602125539687812, + "grad_norm": 0.00034072858397848904, + "learning_rate": 2.060169955263361e-06, + "loss": 0.0119, + "step": 5300 + }, + { + "epoch": 1.7768183327798073, + "grad_norm": 0.007919142954051495, + "learning_rate": 2.044895860102749e-06, + "loss": 0.0142, + "step": 5350 + }, + { + "epoch": 1.7934241115908336, + "grad_norm": 0.0014535001246258616, + "learning_rate": 2.0296217649421363e-06, + "loss": 0.0, + "step": 5400 + }, + { + "epoch": 1.81002989040186, + "grad_norm": 0.005964061710983515, + "learning_rate": 2.0143476697815246e-06, + "loss": 0.0006, + "step": 5450 + }, + { + "epoch": 1.826635669212886, + "grad_norm": 0.0018063083989545703, + "learning_rate": 1.9990735746209125e-06, + "loss": 0.0, + "step": 5500 + }, + { + "epoch": 1.8432414480239123, + "grad_norm": 0.00010997989738825709, + "learning_rate": 1.9837994794603e-06, + "loss": 0.0, + "step": 5550 + }, + { + "epoch": 1.8598472268349386, + "grad_norm": 0.004599976819008589, + "learning_rate": 1.968525384299688e-06, + "loss": 0.0, + "step": 5600 + }, + { + "epoch": 1.8764530056459647, + "grad_norm": 0.00086441938765347, + "learning_rate": 1.9532512891390763e-06, + "loss": 0.0066, + "step": 5650 + }, + { + "epoch": 1.893058784456991, + "grad_norm": 0.0016492039430886507, + "learning_rate": 1.937977193978464e-06, + "loss": 0.0019, + "step": 5700 + }, + { + "epoch": 1.9096645632680174, + "grad_norm": 4.4919357605976984e-05, + "learning_rate": 1.9227030988178516e-06, + "loss": 0.0, + "step": 5750 + }, + { + "epoch": 1.9262703420790435, + "grad_norm": 1.0092087984085083, + "learning_rate": 1.90742900365724e-06, + "loss": 0.0049, + "step": 5800 + }, + { + "epoch": 1.9428761208900698, + "grad_norm": 0.0003246455453336239, + "learning_rate": 1.8921549084966279e-06, + "loss": 0.0002, + "step": 5850 + }, + { + "epoch": 1.959481899701096, + "grad_norm": 0.0008708651876077056, + "learning_rate": 1.8768808133360154e-06, + "loss": 0.0215, + "step": 5900 + }, + { + "epoch": 1.9760876785121222, + "grad_norm": 0.003138365224003792, + "learning_rate": 1.8616067181754035e-06, + "loss": 0.0, + "step": 5950 + }, + { + "epoch": 1.9926934573231485, + "grad_norm": 10.618377685546875, + "learning_rate": 1.8463326230147916e-06, + "loss": 0.0306, + "step": 6000 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.9964217557251909, + "eval_f1": 0.9963727955130689, + "eval_loss": 0.03524520993232727, + "eval_precision": 0.9963861752950809, + "eval_recall": 0.9964217557251909, + "eval_runtime": 38.2329, + "eval_samples_per_second": 219.287, + "eval_steps_per_second": 13.705, + "step": 6022 + } + ], + "logging_steps": 50, + "max_steps": 12044, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.282861088518144e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/trial-6/checkpoint-6022/training_args.bin b/trial-6/checkpoint-6022/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8ff64b4b12daf1b9553500a47074a85378615d75 --- /dev/null +++ b/trial-6/checkpoint-6022/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd2e4fb9a115884edb87825aefbd32c53d93671f7c6430a41871a9ca795015e8 +size 5368 diff --git a/trial-7/checkpoint-6022/config.json b/trial-7/checkpoint-6022/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00 --- /dev/null +++ b/trial-7/checkpoint-6022/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "answerdotai/ModernBERT-base", + "architectures": [ + "ModernBertForSequenceClassification" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 50281, + "classifier_activation": "gelu", + "classifier_bias": false, + "classifier_dropout": 0.0, + "classifier_pooling": "mean", + "cls_token_id": 50281, + "decoder_bias": true, + "deterministic_flash_attn": false, + "embedding_dropout": 0.0, + "eos_token_id": 50282, + "global_attn_every_n_layers": 3, + "global_rope_theta": 160000.0, + "gradient_checkpointing": false, + "hidden_activation": "gelu", + "hidden_size": 768, + "initializer_cutoff_factor": 2.0, + "initializer_range": 0.02, + "intermediate_size": 1152, + "layer_norm_eps": 1e-05, + "local_attention": 128, + "local_rope_theta": 10000.0, + "max_position_embeddings": 8192, + "mlp_bias": false, + "mlp_dropout": 0.0, + "model_type": "modernbert", + "norm_bias": false, + "norm_eps": 1e-05, + "num_attention_heads": 12, + "num_hidden_layers": 22, + "pad_token_id": 50283, + "position_embedding_type": "absolute", + "problem_type": "single_label_classification", + "reference_compile": true, + "sep_token_id": 50282, + "sparse_pred_ignore_index": -100, + "sparse_prediction": false, + "torch_dtype": "float32", + "transformers_version": "4.48.0.dev0", + "vocab_size": 50368 +} diff --git a/trial-7/checkpoint-6022/model.safetensors b/trial-7/checkpoint-6022/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ba72213ea6cdd9fa00f45ebe5294757054d14ecb --- /dev/null +++ b/trial-7/checkpoint-6022/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:999491f80f5e7d65107af9b9c39856993ea382919338a256f506e02282154a73 +size 598439784 diff --git a/trial-7/checkpoint-6022/optimizer.pt b/trial-7/checkpoint-6022/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8556aa85feb6b7c96bcabe93d030e972640a17db --- /dev/null +++ b/trial-7/checkpoint-6022/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f58dae64786998ee14f679a5c11f25c6e8e826a9be7e1dc502a4960619f5b73d +size 1196967418 diff --git a/trial-7/checkpoint-6022/rng_state.pth b/trial-7/checkpoint-6022/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b387de0c48181ec5812538ddf1fc60cfda1a89c1 --- /dev/null +++ b/trial-7/checkpoint-6022/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef +size 14244 diff --git a/trial-7/checkpoint-6022/scheduler.pt b/trial-7/checkpoint-6022/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..feffa73e78d93c8ae57663715e45d3c3914fb368 --- /dev/null +++ b/trial-7/checkpoint-6022/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c7a8af5cc58476915589f4479c98012a6f77921d3365ad0d2b9efb681952de5 +size 1064 diff --git a/trial-7/checkpoint-6022/trainer_state.json b/trial-7/checkpoint-6022/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c437b8cf7dba6cc29f8d97e55828db555f44696f --- /dev/null +++ b/trial-7/checkpoint-6022/trainer_state.json @@ -0,0 +1,897 @@ +{ + "best_metric": 0.021904850378632545, + "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-7/checkpoint-6022", + "epoch": 2.0, + "eval_steps": 500, + "global_step": 6022, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016605778811026237, + "grad_norm": 4.708388328552246, + "learning_rate": 4.332282504686465e-05, + "loss": 0.3911, + "step": 50 + }, + { + "epoch": 0.033211557622052475, + "grad_norm": 3.693350315093994, + "learning_rate": 4.3142222974800016e-05, + "loss": 0.1288, + "step": 100 + }, + { + "epoch": 0.04981733643307871, + "grad_norm": 27.80695915222168, + "learning_rate": 4.2961620902735386e-05, + "loss": 0.1, + "step": 150 + }, + { + "epoch": 0.06642311524410495, + "grad_norm": 0.014199809171259403, + "learning_rate": 4.278101883067075e-05, + "loss": 0.0817, + "step": 200 + }, + { + "epoch": 0.08302889405513118, + "grad_norm": 0.024310972541570663, + "learning_rate": 4.260041675860611e-05, + "loss": 0.0234, + "step": 250 + }, + { + "epoch": 0.09963467286615742, + "grad_norm": 0.0038154239300638437, + "learning_rate": 4.2419814686541476e-05, + "loss": 0.0781, + "step": 300 + }, + { + "epoch": 0.11624045167718366, + "grad_norm": 0.002823322080075741, + "learning_rate": 4.2239212614476846e-05, + "loss": 0.0394, + "step": 350 + }, + { + "epoch": 0.1328462304882099, + "grad_norm": 12.188178062438965, + "learning_rate": 4.20586105424122e-05, + "loss": 0.0786, + "step": 400 + }, + { + "epoch": 0.14945200929923613, + "grad_norm": 0.02171366475522518, + "learning_rate": 4.187800847034757e-05, + "loss": 0.0337, + "step": 450 + }, + { + "epoch": 0.16605778811026237, + "grad_norm": 1.8111064434051514, + "learning_rate": 4.1697406398282937e-05, + "loss": 0.0427, + "step": 500 + }, + { + "epoch": 0.1826635669212886, + "grad_norm": 0.8548564910888672, + "learning_rate": 4.15168043262183e-05, + "loss": 0.0733, + "step": 550 + }, + { + "epoch": 0.19926934573231483, + "grad_norm": 0.03964327648282051, + "learning_rate": 4.133620225415367e-05, + "loss": 0.0411, + "step": 600 + }, + { + "epoch": 0.2158751245433411, + "grad_norm": 0.012529165484011173, + "learning_rate": 4.1155600182089034e-05, + "loss": 0.0453, + "step": 650 + }, + { + "epoch": 0.23248090335436733, + "grad_norm": 0.042264923453330994, + "learning_rate": 4.09749981100244e-05, + "loss": 0.0338, + "step": 700 + }, + { + "epoch": 0.24908668216539356, + "grad_norm": 0.004559572786092758, + "learning_rate": 4.079439603795976e-05, + "loss": 0.0467, + "step": 750 + }, + { + "epoch": 0.2656924609764198, + "grad_norm": 0.002454261528328061, + "learning_rate": 4.061379396589513e-05, + "loss": 0.0228, + "step": 800 + }, + { + "epoch": 0.282298239787446, + "grad_norm": 2.4341135025024414, + "learning_rate": 4.0433191893830494e-05, + "loss": 0.0572, + "step": 850 + }, + { + "epoch": 0.29890401859847227, + "grad_norm": 0.15002170205116272, + "learning_rate": 4.025258982176586e-05, + "loss": 0.0244, + "step": 900 + }, + { + "epoch": 0.3155097974094985, + "grad_norm": 0.00873472262173891, + "learning_rate": 4.007198774970122e-05, + "loss": 0.0207, + "step": 950 + }, + { + "epoch": 0.33211557622052473, + "grad_norm": 0.07583663612604141, + "learning_rate": 3.989138567763659e-05, + "loss": 0.0233, + "step": 1000 + }, + { + "epoch": 0.348721355031551, + "grad_norm": 0.002584233647212386, + "learning_rate": 3.9710783605571955e-05, + "loss": 0.0228, + "step": 1050 + }, + { + "epoch": 0.3653271338425772, + "grad_norm": 0.025093793869018555, + "learning_rate": 3.953018153350732e-05, + "loss": 0.0393, + "step": 1100 + }, + { + "epoch": 0.38193291265360346, + "grad_norm": 0.26810237765312195, + "learning_rate": 3.934957946144268e-05, + "loss": 0.0213, + "step": 1150 + }, + { + "epoch": 0.39853869146462967, + "grad_norm": 0.0005972657818347216, + "learning_rate": 3.9168977389378045e-05, + "loss": 0.0099, + "step": 1200 + }, + { + "epoch": 0.41514447027565593, + "grad_norm": 2.7341020107269287, + "learning_rate": 3.8988375317313415e-05, + "loss": 0.0165, + "step": 1250 + }, + { + "epoch": 0.4317502490866822, + "grad_norm": 30.906461715698242, + "learning_rate": 3.880777324524878e-05, + "loss": 0.0481, + "step": 1300 + }, + { + "epoch": 0.4483560278977084, + "grad_norm": 0.07481276988983154, + "learning_rate": 3.862717117318414e-05, + "loss": 0.0249, + "step": 1350 + }, + { + "epoch": 0.46496180670873466, + "grad_norm": 0.04692293331027031, + "learning_rate": 3.8446569101119505e-05, + "loss": 0.0047, + "step": 1400 + }, + { + "epoch": 0.48156758551976087, + "grad_norm": 0.008200657553970814, + "learning_rate": 3.8265967029054876e-05, + "loss": 0.0244, + "step": 1450 + }, + { + "epoch": 0.4981733643307871, + "grad_norm": 0.006684092804789543, + "learning_rate": 3.808536495699023e-05, + "loss": 0.0285, + "step": 1500 + }, + { + "epoch": 0.5147791431418134, + "grad_norm": 0.00020126289746258408, + "learning_rate": 3.79047628849256e-05, + "loss": 0.0187, + "step": 1550 + }, + { + "epoch": 0.5313849219528396, + "grad_norm": 0.5489906668663025, + "learning_rate": 3.7724160812860966e-05, + "loss": 0.0245, + "step": 1600 + }, + { + "epoch": 0.5479907007638658, + "grad_norm": 0.01115335151553154, + "learning_rate": 3.754355874079633e-05, + "loss": 0.0317, + "step": 1650 + }, + { + "epoch": 0.564596479574892, + "grad_norm": 0.0077936286106705666, + "learning_rate": 3.73629566687317e-05, + "loss": 0.0044, + "step": 1700 + }, + { + "epoch": 0.5812022583859183, + "grad_norm": 48.291107177734375, + "learning_rate": 3.718235459666706e-05, + "loss": 0.0039, + "step": 1750 + }, + { + "epoch": 0.5978080371969445, + "grad_norm": 0.005009625572711229, + "learning_rate": 3.7001752524602426e-05, + "loss": 0.0119, + "step": 1800 + }, + { + "epoch": 0.6144138160079707, + "grad_norm": 0.0016993529861792922, + "learning_rate": 3.682115045253779e-05, + "loss": 0.0111, + "step": 1850 + }, + { + "epoch": 0.631019594818997, + "grad_norm": 0.03398797661066055, + "learning_rate": 3.664054838047316e-05, + "loss": 0.0155, + "step": 1900 + }, + { + "epoch": 0.6476253736300233, + "grad_norm": 0.0077589512802660465, + "learning_rate": 3.6459946308408524e-05, + "loss": 0.0014, + "step": 1950 + }, + { + "epoch": 0.6642311524410495, + "grad_norm": 0.0004693228402175009, + "learning_rate": 3.627934423634389e-05, + "loss": 0.0209, + "step": 2000 + }, + { + "epoch": 0.6808369312520757, + "grad_norm": 0.0019584419205784798, + "learning_rate": 3.609874216427925e-05, + "loss": 0.0015, + "step": 2050 + }, + { + "epoch": 0.697442710063102, + "grad_norm": 0.0007614546921104193, + "learning_rate": 3.591814009221462e-05, + "loss": 0.0201, + "step": 2100 + }, + { + "epoch": 0.7140484888741282, + "grad_norm": 2.3300867080688477, + "learning_rate": 3.5737538020149984e-05, + "loss": 0.0249, + "step": 2150 + }, + { + "epoch": 0.7306542676851544, + "grad_norm": 0.00295511307194829, + "learning_rate": 3.555693594808535e-05, + "loss": 0.0177, + "step": 2200 + }, + { + "epoch": 0.7472600464961807, + "grad_norm": 0.0010525333927944303, + "learning_rate": 3.537633387602071e-05, + "loss": 0.0012, + "step": 2250 + }, + { + "epoch": 0.7638658253072069, + "grad_norm": 0.0007724681054241955, + "learning_rate": 3.5195731803956074e-05, + "loss": 0.0206, + "step": 2300 + }, + { + "epoch": 0.7804716041182331, + "grad_norm": 0.029788095504045486, + "learning_rate": 3.5015129731891445e-05, + "loss": 0.0077, + "step": 2350 + }, + { + "epoch": 0.7970773829292593, + "grad_norm": 0.0016215493669733405, + "learning_rate": 3.483452765982681e-05, + "loss": 0.0278, + "step": 2400 + }, + { + "epoch": 0.8136831617402857, + "grad_norm": 0.0008576202089898288, + "learning_rate": 3.465392558776217e-05, + "loss": 0.0001, + "step": 2450 + }, + { + "epoch": 0.8302889405513119, + "grad_norm": 0.0007036550086922944, + "learning_rate": 3.4473323515697535e-05, + "loss": 0.0064, + "step": 2500 + }, + { + "epoch": 0.8468947193623381, + "grad_norm": 0.007299103774130344, + "learning_rate": 3.4292721443632905e-05, + "loss": 0.0202, + "step": 2550 + }, + { + "epoch": 0.8635004981733644, + "grad_norm": 0.004318034276366234, + "learning_rate": 3.411211937156826e-05, + "loss": 0.0239, + "step": 2600 + }, + { + "epoch": 0.8801062769843906, + "grad_norm": 0.00030149793019518256, + "learning_rate": 3.393151729950363e-05, + "loss": 0.0011, + "step": 2650 + }, + { + "epoch": 0.8967120557954168, + "grad_norm": 0.00011602124141063541, + "learning_rate": 3.3750915227438995e-05, + "loss": 0.0003, + "step": 2700 + }, + { + "epoch": 0.913317834606443, + "grad_norm": 0.00016940826026257128, + "learning_rate": 3.357031315537436e-05, + "loss": 0.0001, + "step": 2750 + }, + { + "epoch": 0.9299236134174693, + "grad_norm": 0.00018533991533331573, + "learning_rate": 3.338971108330973e-05, + "loss": 0.0072, + "step": 2800 + }, + { + "epoch": 0.9465293922284955, + "grad_norm": 0.007014571689069271, + "learning_rate": 3.320910901124509e-05, + "loss": 0.0072, + "step": 2850 + }, + { + "epoch": 0.9631351710395217, + "grad_norm": 0.0949193611741066, + "learning_rate": 3.3028506939180456e-05, + "loss": 0.0021, + "step": 2900 + }, + { + "epoch": 0.9797409498505479, + "grad_norm": 9.85204792022705, + "learning_rate": 3.284790486711582e-05, + "loss": 0.0314, + "step": 2950 + }, + { + "epoch": 0.9963467286615743, + "grad_norm": 0.0009359152754768729, + "learning_rate": 3.266730279505119e-05, + "loss": 0.0169, + "step": 3000 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.997256679389313, + "eval_f1": 0.9972546509870224, + "eval_loss": 0.02280445024371147, + "eval_precision": 0.9972528075162768, + "eval_recall": 0.997256679389313, + "eval_runtime": 39.3237, + "eval_samples_per_second": 213.205, + "eval_steps_per_second": 13.325, + "step": 3011 + }, + { + "epoch": 1.0129525074726005, + "grad_norm": 0.001626566518098116, + "learning_rate": 3.2486700722986546e-05, + "loss": 0.0064, + "step": 3050 + }, + { + "epoch": 1.0295582862836268, + "grad_norm": 0.0007160088862292469, + "learning_rate": 3.2306098650921916e-05, + "loss": 0.0002, + "step": 3100 + }, + { + "epoch": 1.0461640650946529, + "grad_norm": 0.0008541371207684278, + "learning_rate": 3.212549657885728e-05, + "loss": 0.0009, + "step": 3150 + }, + { + "epoch": 1.0627698439056792, + "grad_norm": 0.0005321013741195202, + "learning_rate": 3.194489450679265e-05, + "loss": 0.0059, + "step": 3200 + }, + { + "epoch": 1.0793756227167055, + "grad_norm": 0.005570960231125355, + "learning_rate": 3.176429243472801e-05, + "loss": 0.0233, + "step": 3250 + }, + { + "epoch": 1.0959814015277316, + "grad_norm": 0.0008483761921525002, + "learning_rate": 3.158369036266338e-05, + "loss": 0.0051, + "step": 3300 + }, + { + "epoch": 1.112587180338758, + "grad_norm": 0.26837238669395447, + "learning_rate": 3.140308829059874e-05, + "loss": 0.0002, + "step": 3350 + }, + { + "epoch": 1.1291929591497842, + "grad_norm": 0.006303045898675919, + "learning_rate": 3.1222486218534104e-05, + "loss": 0.0009, + "step": 3400 + }, + { + "epoch": 1.1457987379608103, + "grad_norm": 0.0001649777841521427, + "learning_rate": 3.1041884146469474e-05, + "loss": 0.0145, + "step": 3450 + }, + { + "epoch": 1.1624045167718366, + "grad_norm": 0.00047482753871008754, + "learning_rate": 3.086128207440484e-05, + "loss": 0.0002, + "step": 3500 + }, + { + "epoch": 1.1790102955828627, + "grad_norm": 0.0005254672723822296, + "learning_rate": 3.06806800023402e-05, + "loss": 0.0033, + "step": 3550 + }, + { + "epoch": 1.195616074393889, + "grad_norm": 0.003435322782024741, + "learning_rate": 3.0500077930275568e-05, + "loss": 0.0121, + "step": 3600 + }, + { + "epoch": 1.2122218532049154, + "grad_norm": 0.0008208905346691608, + "learning_rate": 3.031947585821093e-05, + "loss": 0.0004, + "step": 3650 + }, + { + "epoch": 1.2288276320159415, + "grad_norm": 0.0008073259959928691, + "learning_rate": 3.0138873786146294e-05, + "loss": 0.0001, + "step": 3700 + }, + { + "epoch": 1.2454334108269678, + "grad_norm": 0.0022343341261148453, + "learning_rate": 2.995827171408166e-05, + "loss": 0.0132, + "step": 3750 + }, + { + "epoch": 1.2620391896379939, + "grad_norm": 0.003108004806563258, + "learning_rate": 2.9777669642017028e-05, + "loss": 0.0072, + "step": 3800 + }, + { + "epoch": 1.2786449684490202, + "grad_norm": 0.0003189484996255487, + "learning_rate": 2.9597067569952388e-05, + "loss": 0.0004, + "step": 3850 + }, + { + "epoch": 1.2952507472600465, + "grad_norm": 0.000715159869287163, + "learning_rate": 2.9416465497887755e-05, + "loss": 0.0001, + "step": 3900 + }, + { + "epoch": 1.3118565260710726, + "grad_norm": 0.00012114120909245685, + "learning_rate": 2.9235863425823122e-05, + "loss": 0.0046, + "step": 3950 + }, + { + "epoch": 1.328462304882099, + "grad_norm": 0.0006089384551160038, + "learning_rate": 2.9055261353758482e-05, + "loss": 0.0181, + "step": 4000 + }, + { + "epoch": 1.3450680836931252, + "grad_norm": 43.172584533691406, + "learning_rate": 2.887465928169385e-05, + "loss": 0.0177, + "step": 4050 + }, + { + "epoch": 1.3616738625041513, + "grad_norm": 0.000983994104899466, + "learning_rate": 2.8694057209629215e-05, + "loss": 0.0039, + "step": 4100 + }, + { + "epoch": 1.3782796413151777, + "grad_norm": 7.642904529348016e-05, + "learning_rate": 2.851345513756458e-05, + "loss": 0.0, + "step": 4150 + }, + { + "epoch": 1.394885420126204, + "grad_norm": 9.202577348332852e-05, + "learning_rate": 2.8332853065499946e-05, + "loss": 0.0018, + "step": 4200 + }, + { + "epoch": 1.41149119893723, + "grad_norm": 0.00020998790569137782, + "learning_rate": 2.8152250993435312e-05, + "loss": 0.0076, + "step": 4250 + }, + { + "epoch": 1.4280969777482564, + "grad_norm": 0.0004194548528175801, + "learning_rate": 2.7971648921370673e-05, + "loss": 0.0028, + "step": 4300 + }, + { + "epoch": 1.4447027565592827, + "grad_norm": 0.005269031506031752, + "learning_rate": 2.779104684930604e-05, + "loss": 0.0367, + "step": 4350 + }, + { + "epoch": 1.4613085353703088, + "grad_norm": 0.010580910369753838, + "learning_rate": 2.7610444777241406e-05, + "loss": 0.0058, + "step": 4400 + }, + { + "epoch": 1.4779143141813351, + "grad_norm": 0.019897054880857468, + "learning_rate": 2.7429842705176773e-05, + "loss": 0.0215, + "step": 4450 + }, + { + "epoch": 1.4945200929923614, + "grad_norm": 0.004522784613072872, + "learning_rate": 2.7249240633112133e-05, + "loss": 0.0048, + "step": 4500 + }, + { + "epoch": 1.5111258718033875, + "grad_norm": 0.0006406618049368262, + "learning_rate": 2.70686385610475e-05, + "loss": 0.0001, + "step": 4550 + }, + { + "epoch": 1.5277316506144138, + "grad_norm": 0.000602134910877794, + "learning_rate": 2.6888036488982867e-05, + "loss": 0.0035, + "step": 4600 + }, + { + "epoch": 1.5443374294254402, + "grad_norm": 0.000469192280434072, + "learning_rate": 2.670743441691823e-05, + "loss": 0.001, + "step": 4650 + }, + { + "epoch": 1.5609432082364663, + "grad_norm": 5.6851687986636534e-05, + "learning_rate": 2.6526832344853594e-05, + "loss": 0.0006, + "step": 4700 + }, + { + "epoch": 1.5775489870474926, + "grad_norm": 7.19124946044758e-05, + "learning_rate": 2.634623027278896e-05, + "loss": 0.0037, + "step": 4750 + }, + { + "epoch": 1.594154765858519, + "grad_norm": 7.235410885186866e-05, + "learning_rate": 2.6165628200724324e-05, + "loss": 0.0028, + "step": 4800 + }, + { + "epoch": 1.610760544669545, + "grad_norm": 0.000146635458804667, + "learning_rate": 2.598502612865969e-05, + "loss": 0.0055, + "step": 4850 + }, + { + "epoch": 1.627366323480571, + "grad_norm": 0.01404090877622366, + "learning_rate": 2.5804424056595057e-05, + "loss": 0.0186, + "step": 4900 + }, + { + "epoch": 1.6439721022915976, + "grad_norm": 0.00503704184666276, + "learning_rate": 2.5623821984530417e-05, + "loss": 0.0108, + "step": 4950 + }, + { + "epoch": 1.6605778811026237, + "grad_norm": 0.0004921660874970257, + "learning_rate": 2.5443219912465784e-05, + "loss": 0.0009, + "step": 5000 + }, + { + "epoch": 1.6771836599136498, + "grad_norm": 0.000255432038102299, + "learning_rate": 2.526261784040115e-05, + "loss": 0.0002, + "step": 5050 + }, + { + "epoch": 1.6937894387246761, + "grad_norm": 0.0001737813145155087, + "learning_rate": 2.508201576833651e-05, + "loss": 0.0007, + "step": 5100 + }, + { + "epoch": 1.7103952175357025, + "grad_norm": 4.374636773718521e-05, + "learning_rate": 2.4901413696271878e-05, + "loss": 0.0, + "step": 5150 + }, + { + "epoch": 1.7270009963467285, + "grad_norm": 0.00044321315363049507, + "learning_rate": 2.4720811624207245e-05, + "loss": 0.0095, + "step": 5200 + }, + { + "epoch": 1.7436067751577549, + "grad_norm": 0.00015154728316701949, + "learning_rate": 2.4540209552142608e-05, + "loss": 0.0165, + "step": 5250 + }, + { + "epoch": 1.7602125539687812, + "grad_norm": 6.615974416490644e-05, + "learning_rate": 2.4359607480077975e-05, + "loss": 0.0, + "step": 5300 + }, + { + "epoch": 1.7768183327798073, + "grad_norm": 0.0006762578268535435, + "learning_rate": 2.4179005408013342e-05, + "loss": 0.0137, + "step": 5350 + }, + { + "epoch": 1.7934241115908336, + "grad_norm": 0.00047573362826369703, + "learning_rate": 2.3998403335948702e-05, + "loss": 0.0, + "step": 5400 + }, + { + "epoch": 1.81002989040186, + "grad_norm": 0.0024170074611902237, + "learning_rate": 2.381780126388407e-05, + "loss": 0.0001, + "step": 5450 + }, + { + "epoch": 1.826635669212886, + "grad_norm": 0.00010628774907672778, + "learning_rate": 2.3637199191819436e-05, + "loss": 0.0061, + "step": 5500 + }, + { + "epoch": 1.8432414480239123, + "grad_norm": 0.0007334867841564119, + "learning_rate": 2.3456597119754796e-05, + "loss": 0.0, + "step": 5550 + }, + { + "epoch": 1.8598472268349386, + "grad_norm": 0.00021514961554203182, + "learning_rate": 2.3275995047690162e-05, + "loss": 0.0, + "step": 5600 + }, + { + "epoch": 1.8764530056459647, + "grad_norm": 0.0011384404497221112, + "learning_rate": 2.309539297562553e-05, + "loss": 0.0146, + "step": 5650 + }, + { + "epoch": 1.893058784456991, + "grad_norm": 0.00022749503841623664, + "learning_rate": 2.2914790903560896e-05, + "loss": 0.0001, + "step": 5700 + }, + { + "epoch": 1.9096645632680174, + "grad_norm": 0.00016585957200732082, + "learning_rate": 2.273418883149626e-05, + "loss": 0.0, + "step": 5750 + }, + { + "epoch": 1.9262703420790435, + "grad_norm": 0.0008972834912128747, + "learning_rate": 2.2553586759431623e-05, + "loss": 0.0052, + "step": 5800 + }, + { + "epoch": 1.9428761208900698, + "grad_norm": 0.00017760394257493317, + "learning_rate": 2.237298468736699e-05, + "loss": 0.0043, + "step": 5850 + }, + { + "epoch": 1.959481899701096, + "grad_norm": 0.2404995709657669, + "learning_rate": 2.2192382615302353e-05, + "loss": 0.0114, + "step": 5900 + }, + { + "epoch": 1.9760876785121222, + "grad_norm": 0.0012148089008405805, + "learning_rate": 2.201178054323772e-05, + "loss": 0.0001, + "step": 5950 + }, + { + "epoch": 1.9926934573231485, + "grad_norm": 0.04914182797074318, + "learning_rate": 2.1831178471173087e-05, + "loss": 0.0113, + "step": 6000 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.9973759541984732, + "eval_f1": 0.997344144336412, + "eval_loss": 0.021904850378632545, + "eval_precision": 0.997361916418535, + "eval_recall": 0.9973759541984732, + "eval_runtime": 37.5313, + "eval_samples_per_second": 223.387, + "eval_steps_per_second": 13.962, + "step": 6022 + } + ], + "logging_steps": 50, + "max_steps": 12044, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.282861088518144e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/trial-7/checkpoint-6022/training_args.bin b/trial-7/checkpoint-6022/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fefe87215650a4566f77c3438918ef8ad7d881d --- /dev/null +++ b/trial-7/checkpoint-6022/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac8ed1456bca015299da067993fb69b9ff68148b43f14eb2eec1cb64894fdc05 +size 5368 diff --git a/trial-8/checkpoint-6022/config.json b/trial-8/checkpoint-6022/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00 --- /dev/null +++ b/trial-8/checkpoint-6022/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "answerdotai/ModernBERT-base", + "architectures": [ + "ModernBertForSequenceClassification" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 50281, + "classifier_activation": "gelu", + "classifier_bias": false, + "classifier_dropout": 0.0, + "classifier_pooling": "mean", + "cls_token_id": 50281, + "decoder_bias": true, + "deterministic_flash_attn": false, + "embedding_dropout": 0.0, + "eos_token_id": 50282, + "global_attn_every_n_layers": 3, + "global_rope_theta": 160000.0, + "gradient_checkpointing": false, + "hidden_activation": "gelu", + "hidden_size": 768, + "initializer_cutoff_factor": 2.0, + "initializer_range": 0.02, + "intermediate_size": 1152, + "layer_norm_eps": 1e-05, + "local_attention": 128, + "local_rope_theta": 10000.0, + "max_position_embeddings": 8192, + "mlp_bias": false, + "mlp_dropout": 0.0, + "model_type": "modernbert", + "norm_bias": false, + "norm_eps": 1e-05, + "num_attention_heads": 12, + "num_hidden_layers": 22, + "pad_token_id": 50283, + "position_embedding_type": "absolute", + "problem_type": "single_label_classification", + "reference_compile": true, + "sep_token_id": 50282, + "sparse_pred_ignore_index": -100, + "sparse_prediction": false, + "torch_dtype": "float32", + "transformers_version": "4.48.0.dev0", + "vocab_size": 50368 +} diff --git a/trial-8/checkpoint-6022/model.safetensors b/trial-8/checkpoint-6022/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..87796ccd0552c0f6a6f4c976e67fd85cf382603d --- /dev/null +++ b/trial-8/checkpoint-6022/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7f3420381a8242245db48bb0ab8abf24564fb2045fe804c8ad857e91a85c91b +size 598439784 diff --git a/trial-8/checkpoint-6022/optimizer.pt b/trial-8/checkpoint-6022/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5525387755445c53176210db5136c0125908cc2 --- /dev/null +++ b/trial-8/checkpoint-6022/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dd6138aab64b3d396432c35ce4db532bfba1306b4f99159aaa0c6d362152374 +size 1196967418 diff --git a/trial-8/checkpoint-6022/rng_state.pth b/trial-8/checkpoint-6022/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b387de0c48181ec5812538ddf1fc60cfda1a89c1 --- /dev/null +++ b/trial-8/checkpoint-6022/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef +size 14244 diff --git a/trial-8/checkpoint-6022/scheduler.pt b/trial-8/checkpoint-6022/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf460763ee7d22e479fe60bc5a6cf8fc3d9894b6 --- /dev/null +++ b/trial-8/checkpoint-6022/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbc92a16e2b5e4d8e3fa3b973f2760dc5f362ac0ce5d49f2d1336359c23db225 +size 1064 diff --git a/trial-8/checkpoint-6022/trainer_state.json b/trial-8/checkpoint-6022/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..02f21018346f9318a8b2121adaa18a08f0f4acb4 --- /dev/null +++ b/trial-8/checkpoint-6022/trainer_state.json @@ -0,0 +1,897 @@ +{ + "best_metric": 0.016700224950909615, + "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-8/checkpoint-6022", + "epoch": 2.0, + "eval_steps": 500, + "global_step": 6022, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016605778811026237, + "grad_norm": 7.783297538757324, + "learning_rate": 1.75347410056435e-05, + "loss": 0.3917, + "step": 50 + }, + { + "epoch": 0.033211557622052475, + "grad_norm": 9.493297576904297, + "learning_rate": 1.74763113455114e-05, + "loss": 0.2198, + "step": 100 + }, + { + "epoch": 0.04981733643307871, + "grad_norm": 14.9419527053833, + "learning_rate": 1.7417881685379297e-05, + "loss": 0.1591, + "step": 150 + }, + { + "epoch": 0.06642311524410495, + "grad_norm": 12.10505199432373, + "learning_rate": 1.73594520252472e-05, + "loss": 0.1272, + "step": 200 + }, + { + "epoch": 0.08302889405513118, + "grad_norm": 0.3676553964614868, + "learning_rate": 1.7301022365115096e-05, + "loss": 0.0714, + "step": 250 + }, + { + "epoch": 0.09963467286615742, + "grad_norm": 0.059806231409311295, + "learning_rate": 1.7242592704982996e-05, + "loss": 0.0855, + "step": 300 + }, + { + "epoch": 0.11624045167718366, + "grad_norm": 0.017794443294405937, + "learning_rate": 1.7184163044850895e-05, + "loss": 0.0436, + "step": 350 + }, + { + "epoch": 0.1328462304882099, + "grad_norm": 5.694874286651611, + "learning_rate": 1.7125733384718795e-05, + "loss": 0.0805, + "step": 400 + }, + { + "epoch": 0.14945200929923613, + "grad_norm": 0.31493690609931946, + "learning_rate": 1.706730372458669e-05, + "loss": 0.0266, + "step": 450 + }, + { + "epoch": 0.16605778811026237, + "grad_norm": 3.3532514572143555, + "learning_rate": 1.700887406445459e-05, + "loss": 0.0457, + "step": 500 + }, + { + "epoch": 0.1826635669212886, + "grad_norm": 0.006790875922888517, + "learning_rate": 1.6950444404322493e-05, + "loss": 0.0178, + "step": 550 + }, + { + "epoch": 0.19926934573231483, + "grad_norm": 0.23334099352359772, + "learning_rate": 1.689201474419039e-05, + "loss": 0.0218, + "step": 600 + }, + { + "epoch": 0.2158751245433411, + "grad_norm": 0.07603476941585541, + "learning_rate": 1.683358508405829e-05, + "loss": 0.0273, + "step": 650 + }, + { + "epoch": 0.23248090335436733, + "grad_norm": 30.228376388549805, + "learning_rate": 1.677515542392619e-05, + "loss": 0.045, + "step": 700 + }, + { + "epoch": 0.24908668216539356, + "grad_norm": 0.00013399416639003903, + "learning_rate": 1.6716725763794088e-05, + "loss": 0.0442, + "step": 750 + }, + { + "epoch": 0.2656924609764198, + "grad_norm": 0.0007570835296064615, + "learning_rate": 1.6658296103661984e-05, + "loss": 0.0352, + "step": 800 + }, + { + "epoch": 0.282298239787446, + "grad_norm": 6.466372013092041, + "learning_rate": 1.6599866443529884e-05, + "loss": 0.057, + "step": 850 + }, + { + "epoch": 0.29890401859847227, + "grad_norm": 0.04902864992618561, + "learning_rate": 1.6541436783397787e-05, + "loss": 0.0282, + "step": 900 + }, + { + "epoch": 0.3155097974094985, + "grad_norm": 1.3140225410461426, + "learning_rate": 1.6483007123265683e-05, + "loss": 0.0128, + "step": 950 + }, + { + "epoch": 0.33211557622052473, + "grad_norm": 0.00038628041511401534, + "learning_rate": 1.6424577463133583e-05, + "loss": 0.0373, + "step": 1000 + }, + { + "epoch": 0.348721355031551, + "grad_norm": 0.0020163152366876602, + "learning_rate": 1.6366147803001482e-05, + "loss": 0.0281, + "step": 1050 + }, + { + "epoch": 0.3653271338425772, + "grad_norm": 0.005716539453715086, + "learning_rate": 1.630771814286938e-05, + "loss": 0.0178, + "step": 1100 + }, + { + "epoch": 0.38193291265360346, + "grad_norm": 0.0029005147516727448, + "learning_rate": 1.6249288482737278e-05, + "loss": 0.0028, + "step": 1150 + }, + { + "epoch": 0.39853869146462967, + "grad_norm": 0.0008354082820005715, + "learning_rate": 1.6190858822605177e-05, + "loss": 0.0128, + "step": 1200 + }, + { + "epoch": 0.41514447027565593, + "grad_norm": 0.04189394786953926, + "learning_rate": 1.613242916247308e-05, + "loss": 0.0358, + "step": 1250 + }, + { + "epoch": 0.4317502490866822, + "grad_norm": 0.30329668521881104, + "learning_rate": 1.6073999502340976e-05, + "loss": 0.0496, + "step": 1300 + }, + { + "epoch": 0.4483560278977084, + "grad_norm": 2.6213114261627197, + "learning_rate": 1.6015569842208876e-05, + "loss": 0.0296, + "step": 1350 + }, + { + "epoch": 0.46496180670873466, + "grad_norm": 0.005221163388341665, + "learning_rate": 1.5957140182076775e-05, + "loss": 0.0057, + "step": 1400 + }, + { + "epoch": 0.48156758551976087, + "grad_norm": 0.0011331464629620314, + "learning_rate": 1.5898710521944675e-05, + "loss": 0.0182, + "step": 1450 + }, + { + "epoch": 0.4981733643307871, + "grad_norm": 0.003077897010371089, + "learning_rate": 1.584028086181257e-05, + "loss": 0.0163, + "step": 1500 + }, + { + "epoch": 0.5147791431418134, + "grad_norm": 0.001127161318436265, + "learning_rate": 1.578185120168047e-05, + "loss": 0.008, + "step": 1550 + }, + { + "epoch": 0.5313849219528396, + "grad_norm": 20.000713348388672, + "learning_rate": 1.572342154154837e-05, + "loss": 0.0277, + "step": 1600 + }, + { + "epoch": 0.5479907007638658, + "grad_norm": 0.0031673621851950884, + "learning_rate": 1.566499188141627e-05, + "loss": 0.0295, + "step": 1650 + }, + { + "epoch": 0.564596479574892, + "grad_norm": 0.1044340580701828, + "learning_rate": 1.560656222128417e-05, + "loss": 0.0162, + "step": 1700 + }, + { + "epoch": 0.5812022583859183, + "grad_norm": 0.002000702079385519, + "learning_rate": 1.554813256115207e-05, + "loss": 0.0047, + "step": 1750 + }, + { + "epoch": 0.5978080371969445, + "grad_norm": 0.014721410349011421, + "learning_rate": 1.548970290101997e-05, + "loss": 0.0164, + "step": 1800 + }, + { + "epoch": 0.6144138160079707, + "grad_norm": 0.00020889069128315896, + "learning_rate": 1.5431273240887864e-05, + "loss": 0.0065, + "step": 1850 + }, + { + "epoch": 0.631019594818997, + "grad_norm": 0.000981863122433424, + "learning_rate": 1.5372843580755764e-05, + "loss": 0.0104, + "step": 1900 + }, + { + "epoch": 0.6476253736300233, + "grad_norm": 0.00036494643427431583, + "learning_rate": 1.5314413920623664e-05, + "loss": 0.0078, + "step": 1950 + }, + { + "epoch": 0.6642311524410495, + "grad_norm": 0.00018712542077992111, + "learning_rate": 1.5255984260491563e-05, + "loss": 0.0147, + "step": 2000 + }, + { + "epoch": 0.6808369312520757, + "grad_norm": 0.00041754008270800114, + "learning_rate": 1.5197554600359463e-05, + "loss": 0.0056, + "step": 2050 + }, + { + "epoch": 0.697442710063102, + "grad_norm": 0.0007413614075630903, + "learning_rate": 1.513912494022736e-05, + "loss": 0.0218, + "step": 2100 + }, + { + "epoch": 0.7140484888741282, + "grad_norm": 0.17359164357185364, + "learning_rate": 1.508069528009526e-05, + "loss": 0.0087, + "step": 2150 + }, + { + "epoch": 0.7306542676851544, + "grad_norm": 0.0031391121447086334, + "learning_rate": 1.5022265619963158e-05, + "loss": 0.0091, + "step": 2200 + }, + { + "epoch": 0.7472600464961807, + "grad_norm": 0.0002511175407562405, + "learning_rate": 1.4963835959831057e-05, + "loss": 0.0085, + "step": 2250 + }, + { + "epoch": 0.7638658253072069, + "grad_norm": 0.0005632131360471249, + "learning_rate": 1.4905406299698959e-05, + "loss": 0.0134, + "step": 2300 + }, + { + "epoch": 0.7804716041182331, + "grad_norm": 0.020430119708180428, + "learning_rate": 1.4846976639566856e-05, + "loss": 0.0123, + "step": 2350 + }, + { + "epoch": 0.7970773829292593, + "grad_norm": 0.002410025568678975, + "learning_rate": 1.4788546979434756e-05, + "loss": 0.0165, + "step": 2400 + }, + { + "epoch": 0.8136831617402857, + "grad_norm": 0.0005655758432112634, + "learning_rate": 1.4730117319302654e-05, + "loss": 0.0001, + "step": 2450 + }, + { + "epoch": 0.8302889405513119, + "grad_norm": 0.0001908275589812547, + "learning_rate": 1.4671687659170553e-05, + "loss": 0.0031, + "step": 2500 + }, + { + "epoch": 0.8468947193623381, + "grad_norm": 0.007557071279734373, + "learning_rate": 1.4613257999038451e-05, + "loss": 0.011, + "step": 2550 + }, + { + "epoch": 0.8635004981733644, + "grad_norm": 0.003107853000983596, + "learning_rate": 1.455482833890635e-05, + "loss": 0.0036, + "step": 2600 + }, + { + "epoch": 0.8801062769843906, + "grad_norm": 0.00015464833995793015, + "learning_rate": 1.4496398678774252e-05, + "loss": 0.0187, + "step": 2650 + }, + { + "epoch": 0.8967120557954168, + "grad_norm": 0.0004067471018061042, + "learning_rate": 1.443796901864215e-05, + "loss": 0.0003, + "step": 2700 + }, + { + "epoch": 0.913317834606443, + "grad_norm": 0.002043587388470769, + "learning_rate": 1.437953935851005e-05, + "loss": 0.0067, + "step": 2750 + }, + { + "epoch": 0.9299236134174693, + "grad_norm": 4.698836164607201e-06, + "learning_rate": 1.4321109698377947e-05, + "loss": 0.0027, + "step": 2800 + }, + { + "epoch": 0.9465293922284955, + "grad_norm": 0.004311679396778345, + "learning_rate": 1.4262680038245847e-05, + "loss": 0.0099, + "step": 2850 + }, + { + "epoch": 0.9631351710395217, + "grad_norm": 16.798845291137695, + "learning_rate": 1.4204250378113745e-05, + "loss": 0.0056, + "step": 2900 + }, + { + "epoch": 0.9797409498505479, + "grad_norm": 9.97236156463623, + "learning_rate": 1.4145820717981646e-05, + "loss": 0.0408, + "step": 2950 + }, + { + "epoch": 0.9963467286615743, + "grad_norm": 0.0034861546009778976, + "learning_rate": 1.4087391057849545e-05, + "loss": 0.0111, + "step": 3000 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.9958253816793893, + "eval_f1": 0.9958527220108943, + "eval_loss": 0.02235870435833931, + "eval_precision": 0.9958943559410505, + "eval_recall": 0.9958253816793893, + "eval_runtime": 37.1995, + "eval_samples_per_second": 225.379, + "eval_steps_per_second": 14.086, + "step": 3011 + }, + { + "epoch": 1.0129525074726005, + "grad_norm": 0.0027454651426523924, + "learning_rate": 1.4028961397717443e-05, + "loss": 0.0031, + "step": 3050 + }, + { + "epoch": 1.0295582862836268, + "grad_norm": 0.00027677303296513855, + "learning_rate": 1.3970531737585343e-05, + "loss": 0.0, + "step": 3100 + }, + { + "epoch": 1.0461640650946529, + "grad_norm": 0.00034389185020700097, + "learning_rate": 1.391210207745324e-05, + "loss": 0.0, + "step": 3150 + }, + { + "epoch": 1.0627698439056792, + "grad_norm": 0.0010943470988422632, + "learning_rate": 1.385367241732114e-05, + "loss": 0.0119, + "step": 3200 + }, + { + "epoch": 1.0793756227167055, + "grad_norm": 0.007589911110699177, + "learning_rate": 1.3795242757189038e-05, + "loss": 0.0177, + "step": 3250 + }, + { + "epoch": 1.0959814015277316, + "grad_norm": 0.00021936999110039324, + "learning_rate": 1.3736813097056939e-05, + "loss": 0.0058, + "step": 3300 + }, + { + "epoch": 1.112587180338758, + "grad_norm": 0.005090941209346056, + "learning_rate": 1.3678383436924837e-05, + "loss": 0.0147, + "step": 3350 + }, + { + "epoch": 1.1291929591497842, + "grad_norm": 0.0033587052021175623, + "learning_rate": 1.3619953776792737e-05, + "loss": 0.0022, + "step": 3400 + }, + { + "epoch": 1.1457987379608103, + "grad_norm": 3.696617568493821e-05, + "learning_rate": 1.3561524116660634e-05, + "loss": 0.0001, + "step": 3450 + }, + { + "epoch": 1.1624045167718366, + "grad_norm": 0.00012033848179271445, + "learning_rate": 1.3503094456528534e-05, + "loss": 0.0158, + "step": 3500 + }, + { + "epoch": 1.1790102955828627, + "grad_norm": 0.0007672170177102089, + "learning_rate": 1.3444664796396433e-05, + "loss": 0.0001, + "step": 3550 + }, + { + "epoch": 1.195616074393889, + "grad_norm": 0.0013132853200659156, + "learning_rate": 1.3386235136264331e-05, + "loss": 0.0052, + "step": 3600 + }, + { + "epoch": 1.2122218532049154, + "grad_norm": 0.0014574166852980852, + "learning_rate": 1.3327805476132233e-05, + "loss": 0.0007, + "step": 3650 + }, + { + "epoch": 1.2288276320159415, + "grad_norm": 0.0005620878073386848, + "learning_rate": 1.326937581600013e-05, + "loss": 0.0, + "step": 3700 + }, + { + "epoch": 1.2454334108269678, + "grad_norm": 0.000902441912330687, + "learning_rate": 1.321094615586803e-05, + "loss": 0.0118, + "step": 3750 + }, + { + "epoch": 1.2620391896379939, + "grad_norm": 0.0004086974367965013, + "learning_rate": 1.3152516495735928e-05, + "loss": 0.0, + "step": 3800 + }, + { + "epoch": 1.2786449684490202, + "grad_norm": 0.00640166224911809, + "learning_rate": 1.3094086835603827e-05, + "loss": 0.0, + "step": 3850 + }, + { + "epoch": 1.2952507472600465, + "grad_norm": 0.00021929937065578997, + "learning_rate": 1.3035657175471725e-05, + "loss": 0.0045, + "step": 3900 + }, + { + "epoch": 1.3118565260710726, + "grad_norm": 0.0003949702368117869, + "learning_rate": 1.2977227515339625e-05, + "loss": 0.0, + "step": 3950 + }, + { + "epoch": 1.328462304882099, + "grad_norm": 0.00032811236451379955, + "learning_rate": 1.2918797855207526e-05, + "loss": 0.0004, + "step": 4000 + }, + { + "epoch": 1.3450680836931252, + "grad_norm": 0.0013841086765751243, + "learning_rate": 1.2860368195075424e-05, + "loss": 0.0, + "step": 4050 + }, + { + "epoch": 1.3616738625041513, + "grad_norm": 0.002687544096261263, + "learning_rate": 1.2801938534943323e-05, + "loss": 0.0061, + "step": 4100 + }, + { + "epoch": 1.3782796413151777, + "grad_norm": 0.0017038496444001794, + "learning_rate": 1.2743508874811221e-05, + "loss": 0.0163, + "step": 4150 + }, + { + "epoch": 1.394885420126204, + "grad_norm": 3.813268995145336e-05, + "learning_rate": 1.268507921467912e-05, + "loss": 0.0001, + "step": 4200 + }, + { + "epoch": 1.41149119893723, + "grad_norm": 5.391587546910159e-05, + "learning_rate": 1.2626649554547018e-05, + "loss": 0.0, + "step": 4250 + }, + { + "epoch": 1.4280969777482564, + "grad_norm": 0.0006178281037136912, + "learning_rate": 1.2568219894414918e-05, + "loss": 0.0069, + "step": 4300 + }, + { + "epoch": 1.4447027565592827, + "grad_norm": 0.0003027453494723886, + "learning_rate": 1.250979023428282e-05, + "loss": 0.0084, + "step": 4350 + }, + { + "epoch": 1.4613085353703088, + "grad_norm": 0.0006302434485405684, + "learning_rate": 1.2451360574150717e-05, + "loss": 0.0006, + "step": 4400 + }, + { + "epoch": 1.4779143141813351, + "grad_norm": 0.0009224305395036936, + "learning_rate": 1.2392930914018617e-05, + "loss": 0.0128, + "step": 4450 + }, + { + "epoch": 1.4945200929923614, + "grad_norm": 0.006426838226616383, + "learning_rate": 1.2334501253886514e-05, + "loss": 0.0001, + "step": 4500 + }, + { + "epoch": 1.5111258718033875, + "grad_norm": 0.00010298648703610525, + "learning_rate": 1.2276071593754414e-05, + "loss": 0.0, + "step": 4550 + }, + { + "epoch": 1.5277316506144138, + "grad_norm": 0.00034292592317797244, + "learning_rate": 1.2217641933622312e-05, + "loss": 0.0003, + "step": 4600 + }, + { + "epoch": 1.5443374294254402, + "grad_norm": 0.0020624478347599506, + "learning_rate": 1.2159212273490211e-05, + "loss": 0.0001, + "step": 4650 + }, + { + "epoch": 1.5609432082364663, + "grad_norm": 0.0019984941463917494, + "learning_rate": 1.2100782613358111e-05, + "loss": 0.0, + "step": 4700 + }, + { + "epoch": 1.5775489870474926, + "grad_norm": 0.00010821606701938435, + "learning_rate": 1.204235295322601e-05, + "loss": 0.0004, + "step": 4750 + }, + { + "epoch": 1.594154765858519, + "grad_norm": 2.993629277625587e-05, + "learning_rate": 1.198392329309391e-05, + "loss": 0.0001, + "step": 4800 + }, + { + "epoch": 1.610760544669545, + "grad_norm": 1.737935235723853e-05, + "learning_rate": 1.1925493632961808e-05, + "loss": 0.0045, + "step": 4850 + }, + { + "epoch": 1.627366323480571, + "grad_norm": 0.0006885113543830812, + "learning_rate": 1.1867063972829707e-05, + "loss": 0.0243, + "step": 4900 + }, + { + "epoch": 1.6439721022915976, + "grad_norm": 0.0001697670086286962, + "learning_rate": 1.1808634312697605e-05, + "loss": 0.0009, + "step": 4950 + }, + { + "epoch": 1.6605778811026237, + "grad_norm": 0.00014473537157755345, + "learning_rate": 1.1750204652565505e-05, + "loss": 0.0155, + "step": 5000 + }, + { + "epoch": 1.6771836599136498, + "grad_norm": 2.826682793966029e-05, + "learning_rate": 1.1691774992433404e-05, + "loss": 0.0, + "step": 5050 + }, + { + "epoch": 1.6937894387246761, + "grad_norm": 4.407271626405418e-05, + "learning_rate": 1.1633345332301304e-05, + "loss": 0.0, + "step": 5100 + }, + { + "epoch": 1.7103952175357025, + "grad_norm": 0.0044469875283539295, + "learning_rate": 1.1574915672169202e-05, + "loss": 0.003, + "step": 5150 + }, + { + "epoch": 1.7270009963467285, + "grad_norm": 2.422453326289542e-05, + "learning_rate": 1.1516486012037101e-05, + "loss": 0.0114, + "step": 5200 + }, + { + "epoch": 1.7436067751577549, + "grad_norm": 2.4912773369578645e-05, + "learning_rate": 1.1458056351905e-05, + "loss": 0.0, + "step": 5250 + }, + { + "epoch": 1.7602125539687812, + "grad_norm": 0.0004292759404052049, + "learning_rate": 1.1399626691772899e-05, + "loss": 0.0041, + "step": 5300 + }, + { + "epoch": 1.7768183327798073, + "grad_norm": 7.934037208557129, + "learning_rate": 1.1341197031640798e-05, + "loss": 0.0265, + "step": 5350 + }, + { + "epoch": 1.7934241115908336, + "grad_norm": 0.0004573618352878839, + "learning_rate": 1.1282767371508698e-05, + "loss": 0.0, + "step": 5400 + }, + { + "epoch": 1.81002989040186, + "grad_norm": 0.00449636485427618, + "learning_rate": 1.1224337711376597e-05, + "loss": 0.0, + "step": 5450 + }, + { + "epoch": 1.826635669212886, + "grad_norm": 0.00016635288193356246, + "learning_rate": 1.1165908051244495e-05, + "loss": 0.0001, + "step": 5500 + }, + { + "epoch": 1.8432414480239123, + "grad_norm": 0.0066105956211686134, + "learning_rate": 1.1107478391112395e-05, + "loss": 0.0046, + "step": 5550 + }, + { + "epoch": 1.8598472268349386, + "grad_norm": 6.935372948646545e-05, + "learning_rate": 1.1049048730980292e-05, + "loss": 0.0, + "step": 5600 + }, + { + "epoch": 1.8764530056459647, + "grad_norm": 0.0036935850512236357, + "learning_rate": 1.0990619070848192e-05, + "loss": 0.0135, + "step": 5650 + }, + { + "epoch": 1.893058784456991, + "grad_norm": 0.002138146897777915, + "learning_rate": 1.0932189410716091e-05, + "loss": 0.0087, + "step": 5700 + }, + { + "epoch": 1.9096645632680174, + "grad_norm": 0.00018446841568220407, + "learning_rate": 1.0873759750583991e-05, + "loss": 0.0, + "step": 5750 + }, + { + "epoch": 1.9262703420790435, + "grad_norm": 0.0010917658219113946, + "learning_rate": 1.081533009045189e-05, + "loss": 0.0, + "step": 5800 + }, + { + "epoch": 1.9428761208900698, + "grad_norm": 0.0001402223715558648, + "learning_rate": 1.0756900430319788e-05, + "loss": 0.0009, + "step": 5850 + }, + { + "epoch": 1.959481899701096, + "grad_norm": 0.0169499684125185, + "learning_rate": 1.0698470770187688e-05, + "loss": 0.0388, + "step": 5900 + }, + { + "epoch": 1.9760876785121222, + "grad_norm": 0.0036119220312684774, + "learning_rate": 1.0640041110055586e-05, + "loss": 0.0071, + "step": 5950 + }, + { + "epoch": 1.9926934573231485, + "grad_norm": 0.0004526925040408969, + "learning_rate": 1.0581611449923485e-05, + "loss": 0.008, + "step": 6000 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.9970181297709924, + "eval_f1": 0.9970025459787301, + "eval_loss": 0.016700224950909615, + "eval_precision": 0.9969961560397819, + "eval_recall": 0.9970181297709924, + "eval_runtime": 36.5623, + "eval_samples_per_second": 229.307, + "eval_steps_per_second": 14.332, + "step": 6022 + } + ], + "logging_steps": 50, + "max_steps": 15055, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.282861088518144e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/trial-8/checkpoint-6022/training_args.bin b/trial-8/checkpoint-6022/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..38b919e408e37c8659bd156ac7debfa744d1306f --- /dev/null +++ b/trial-8/checkpoint-6022/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67c4a841d9791e131313053f2277167e82e39e63ce8d395fd78c60884646dc0f +size 5368 diff --git a/trial-9/checkpoint-3012/config.json b/trial-9/checkpoint-3012/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7350b831846f83521af7b47b1ce300123c145b00 --- /dev/null +++ b/trial-9/checkpoint-3012/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "answerdotai/ModernBERT-base", + "architectures": [ + "ModernBertForSequenceClassification" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 50281, + "classifier_activation": "gelu", + "classifier_bias": false, + "classifier_dropout": 0.0, + "classifier_pooling": "mean", + "cls_token_id": 50281, + "decoder_bias": true, + "deterministic_flash_attn": false, + "embedding_dropout": 0.0, + "eos_token_id": 50282, + "global_attn_every_n_layers": 3, + "global_rope_theta": 160000.0, + "gradient_checkpointing": false, + "hidden_activation": "gelu", + "hidden_size": 768, + "initializer_cutoff_factor": 2.0, + "initializer_range": 0.02, + "intermediate_size": 1152, + "layer_norm_eps": 1e-05, + "local_attention": 128, + "local_rope_theta": 10000.0, + "max_position_embeddings": 8192, + "mlp_bias": false, + "mlp_dropout": 0.0, + "model_type": "modernbert", + "norm_bias": false, + "norm_eps": 1e-05, + "num_attention_heads": 12, + "num_hidden_layers": 22, + "pad_token_id": 50283, + "position_embedding_type": "absolute", + "problem_type": "single_label_classification", + "reference_compile": true, + "sep_token_id": 50282, + "sparse_pred_ignore_index": -100, + "sparse_prediction": false, + "torch_dtype": "float32", + "transformers_version": "4.48.0.dev0", + "vocab_size": 50368 +} diff --git a/trial-9/checkpoint-3012/model.safetensors b/trial-9/checkpoint-3012/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..344849614bcd8ca99397f5076f96a4d3a5861441 --- /dev/null +++ b/trial-9/checkpoint-3012/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbdaa19d83b8a2e040bb399bf1c9efb459a15e8429ba43cbf85ed958325c8a8a +size 598439784 diff --git a/trial-9/checkpoint-3012/optimizer.pt b/trial-9/checkpoint-3012/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..411534b735c0ef326b49041bee5352a531831675 --- /dev/null +++ b/trial-9/checkpoint-3012/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c0340932c9aad23f7fcb0392b8fde450fb7ced7808a6df9095d7314b759b59d +size 1196967418 diff --git a/trial-9/checkpoint-3012/rng_state.pth b/trial-9/checkpoint-3012/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b387de0c48181ec5812538ddf1fc60cfda1a89c1 --- /dev/null +++ b/trial-9/checkpoint-3012/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef +size 14244 diff --git a/trial-9/checkpoint-3012/scheduler.pt b/trial-9/checkpoint-3012/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..83a710614683b29a7a852f855b46ce0871810214 --- /dev/null +++ b/trial-9/checkpoint-3012/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0a01c4c544a9920a83cf6474f63ae7bb57f7d77551e95df7c0ed175573c04db +size 1064 diff --git a/trial-9/checkpoint-3012/trainer_state.json b/trial-9/checkpoint-3012/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7a8719291430b7e3922103d2d4fdfe4c33385420 --- /dev/null +++ b/trial-9/checkpoint-3012/trainer_state.json @@ -0,0 +1,477 @@ +{ + "best_metric": 0.013810121454298496, + "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-9/checkpoint-3012", + "epoch": 2.0, + "eval_steps": 500, + "global_step": 3012, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.033200531208499334, + "grad_norm": 3.6277902126312256, + "learning_rate": 4.223876761987849e-05, + "loss": 0.2681, + "step": 50 + }, + { + "epoch": 0.06640106241699867, + "grad_norm": 0.3113747537136078, + "learning_rate": 4.2037477267134116e-05, + "loss": 0.0958, + "step": 100 + }, + { + "epoch": 0.099601593625498, + "grad_norm": 0.18985196948051453, + "learning_rate": 4.183618691438975e-05, + "loss": 0.0636, + "step": 150 + }, + { + "epoch": 0.13280212483399734, + "grad_norm": 0.8797281980514526, + "learning_rate": 4.163489656164538e-05, + "loss": 0.0562, + "step": 200 + }, + { + "epoch": 0.16600265604249667, + "grad_norm": 1.9831819534301758, + "learning_rate": 4.143360620890101e-05, + "loss": 0.047, + "step": 250 + }, + { + "epoch": 0.199203187250996, + "grad_norm": 3.341094732284546, + "learning_rate": 4.123231585615664e-05, + "loss": 0.0346, + "step": 300 + }, + { + "epoch": 0.23240371845949534, + "grad_norm": 0.048162225633859634, + "learning_rate": 4.103102550341227e-05, + "loss": 0.0457, + "step": 350 + }, + { + "epoch": 0.2656042496679947, + "grad_norm": 0.0644642785191536, + "learning_rate": 4.08297351506679e-05, + "loss": 0.0391, + "step": 400 + }, + { + "epoch": 0.29880478087649404, + "grad_norm": 6.679907321929932, + "learning_rate": 4.0628444797923535e-05, + "loss": 0.0346, + "step": 450 + }, + { + "epoch": 0.33200531208499334, + "grad_norm": 0.01181253232061863, + "learning_rate": 4.0427154445179164e-05, + "loss": 0.0103, + "step": 500 + }, + { + "epoch": 0.3652058432934927, + "grad_norm": 0.06453288346529007, + "learning_rate": 4.022586409243479e-05, + "loss": 0.0292, + "step": 550 + }, + { + "epoch": 0.398406374501992, + "grad_norm": 0.011014764197170734, + "learning_rate": 4.002457373969042e-05, + "loss": 0.0101, + "step": 600 + }, + { + "epoch": 0.4316069057104914, + "grad_norm": 0.44575539231300354, + "learning_rate": 3.982328338694605e-05, + "loss": 0.0187, + "step": 650 + }, + { + "epoch": 0.4648074369189907, + "grad_norm": 0.27992862462997437, + "learning_rate": 3.962199303420168e-05, + "loss": 0.0196, + "step": 700 + }, + { + "epoch": 0.49800796812749004, + "grad_norm": 0.003195864148437977, + "learning_rate": 3.942070268145732e-05, + "loss": 0.0266, + "step": 750 + }, + { + "epoch": 0.5312084993359893, + "grad_norm": 5.236836910247803, + "learning_rate": 3.921941232871295e-05, + "loss": 0.0128, + "step": 800 + }, + { + "epoch": 0.5644090305444888, + "grad_norm": 0.6897503137588501, + "learning_rate": 3.9018121975968576e-05, + "loss": 0.0163, + "step": 850 + }, + { + "epoch": 0.5976095617529881, + "grad_norm": 0.07702745497226715, + "learning_rate": 3.881683162322421e-05, + "loss": 0.0196, + "step": 900 + }, + { + "epoch": 0.6308100929614874, + "grad_norm": 0.00853784941136837, + "learning_rate": 3.8615541270479835e-05, + "loss": 0.0207, + "step": 950 + }, + { + "epoch": 0.6640106241699867, + "grad_norm": 0.1736297905445099, + "learning_rate": 3.841425091773547e-05, + "loss": 0.0219, + "step": 1000 + }, + { + "epoch": 0.6972111553784861, + "grad_norm": 0.07749740034341812, + "learning_rate": 3.82129605649911e-05, + "loss": 0.0158, + "step": 1050 + }, + { + "epoch": 0.7304116865869854, + "grad_norm": 0.0033312628511339426, + "learning_rate": 3.801167021224673e-05, + "loss": 0.0133, + "step": 1100 + }, + { + "epoch": 0.7636122177954847, + "grad_norm": 0.003007786348462105, + "learning_rate": 3.781037985950236e-05, + "loss": 0.0143, + "step": 1150 + }, + { + "epoch": 0.796812749003984, + "grad_norm": 3.926494598388672, + "learning_rate": 3.760908950675799e-05, + "loss": 0.0113, + "step": 1200 + }, + { + "epoch": 0.8300132802124834, + "grad_norm": 0.0027299339417368174, + "learning_rate": 3.740779915401362e-05, + "loss": 0.0013, + "step": 1250 + }, + { + "epoch": 0.8632138114209827, + "grad_norm": 0.013163665309548378, + "learning_rate": 3.7206508801269253e-05, + "loss": 0.0109, + "step": 1300 + }, + { + "epoch": 0.896414342629482, + "grad_norm": 0.0006267031421884894, + "learning_rate": 3.700521844852488e-05, + "loss": 0.0034, + "step": 1350 + }, + { + "epoch": 0.9296148738379814, + "grad_norm": 0.00010993422620231286, + "learning_rate": 3.680392809578051e-05, + "loss": 0.0013, + "step": 1400 + }, + { + "epoch": 0.9628154050464808, + "grad_norm": 1.569828987121582, + "learning_rate": 3.660263774303614e-05, + "loss": 0.0077, + "step": 1450 + }, + { + "epoch": 0.9960159362549801, + "grad_norm": 0.01317554246634245, + "learning_rate": 3.640134739029177e-05, + "loss": 0.0144, + "step": 1500 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.9965410305343512, + "eval_f1": 0.9965177127714778, + "eval_loss": 0.025706786662340164, + "eval_precision": 0.9965095878718089, + "eval_recall": 0.9965410305343512, + "eval_runtime": 31.4201, + "eval_samples_per_second": 266.836, + "eval_steps_per_second": 8.339, + "step": 1506 + }, + { + "epoch": 1.0292164674634794, + "grad_norm": 0.40110042691230774, + "learning_rate": 3.620005703754741e-05, + "loss": 0.0002, + "step": 1550 + }, + { + "epoch": 1.0624169986719787, + "grad_norm": 0.0003720091190189123, + "learning_rate": 3.5998766684803036e-05, + "loss": 0.0035, + "step": 1600 + }, + { + "epoch": 1.095617529880478, + "grad_norm": 0.0027410192415118217, + "learning_rate": 3.5797476332058666e-05, + "loss": 0.0122, + "step": 1650 + }, + { + "epoch": 1.1288180610889773, + "grad_norm": 0.010054959915578365, + "learning_rate": 3.5596185979314295e-05, + "loss": 0.0034, + "step": 1700 + }, + { + "epoch": 1.1620185922974768, + "grad_norm": 0.013796687126159668, + "learning_rate": 3.539489562656993e-05, + "loss": 0.0067, + "step": 1750 + }, + { + "epoch": 1.1952191235059761, + "grad_norm": 1.8211485147476196, + "learning_rate": 3.5193605273825553e-05, + "loss": 0.006, + "step": 1800 + }, + { + "epoch": 1.2284196547144755, + "grad_norm": 0.016155727207660675, + "learning_rate": 3.499231492108119e-05, + "loss": 0.0002, + "step": 1850 + }, + { + "epoch": 1.2616201859229748, + "grad_norm": 0.003053726628422737, + "learning_rate": 3.479102456833682e-05, + "loss": 0.0071, + "step": 1900 + }, + { + "epoch": 1.294820717131474, + "grad_norm": 0.0003582706267479807, + "learning_rate": 3.458973421559245e-05, + "loss": 0.0001, + "step": 1950 + }, + { + "epoch": 1.3280212483399734, + "grad_norm": 0.03127170354127884, + "learning_rate": 3.438844386284808e-05, + "loss": 0.0071, + "step": 2000 + }, + { + "epoch": 1.361221779548473, + "grad_norm": 0.0018665710231289268, + "learning_rate": 3.418715351010371e-05, + "loss": 0.0112, + "step": 2050 + }, + { + "epoch": 1.3944223107569722, + "grad_norm": 0.05576420947909355, + "learning_rate": 3.3985863157359336e-05, + "loss": 0.0071, + "step": 2100 + }, + { + "epoch": 1.4276228419654715, + "grad_norm": 4.577602863311768, + "learning_rate": 3.378457280461497e-05, + "loss": 0.0114, + "step": 2150 + }, + { + "epoch": 1.4608233731739708, + "grad_norm": 0.00251931045204401, + "learning_rate": 3.35832824518706e-05, + "loss": 0.0136, + "step": 2200 + }, + { + "epoch": 1.4940239043824701, + "grad_norm": 0.00033850205363705754, + "learning_rate": 3.338199209912623e-05, + "loss": 0.001, + "step": 2250 + }, + { + "epoch": 1.5272244355909694, + "grad_norm": 0.0019836120773106813, + "learning_rate": 3.318070174638187e-05, + "loss": 0.0001, + "step": 2300 + }, + { + "epoch": 1.5604249667994687, + "grad_norm": 0.00020935946668032557, + "learning_rate": 3.297941139363749e-05, + "loss": 0.0, + "step": 2350 + }, + { + "epoch": 1.593625498007968, + "grad_norm": 0.000308250222587958, + "learning_rate": 3.2778121040893126e-05, + "loss": 0.0001, + "step": 2400 + }, + { + "epoch": 1.6268260292164674, + "grad_norm": 0.020645378157496452, + "learning_rate": 3.2576830688148755e-05, + "loss": 0.0092, + "step": 2450 + }, + { + "epoch": 1.6600265604249667, + "grad_norm": 0.000331960734911263, + "learning_rate": 3.2375540335404384e-05, + "loss": 0.004, + "step": 2500 + }, + { + "epoch": 1.6932270916334662, + "grad_norm": 0.0023610808420926332, + "learning_rate": 3.2174249982660014e-05, + "loss": 0.0062, + "step": 2550 + }, + { + "epoch": 1.7264276228419655, + "grad_norm": 0.0006301538087427616, + "learning_rate": 3.197295962991565e-05, + "loss": 0.0023, + "step": 2600 + }, + { + "epoch": 1.7596281540504648, + "grad_norm": 0.00027294279425404966, + "learning_rate": 3.177166927717127e-05, + "loss": 0.0078, + "step": 2650 + }, + { + "epoch": 1.792828685258964, + "grad_norm": 0.012537718750536442, + "learning_rate": 3.157037892442691e-05, + "loss": 0.0069, + "step": 2700 + }, + { + "epoch": 1.8260292164674636, + "grad_norm": 0.0029420643113553524, + "learning_rate": 3.136908857168254e-05, + "loss": 0.0086, + "step": 2750 + }, + { + "epoch": 1.859229747675963, + "grad_norm": 0.023261522874236107, + "learning_rate": 3.116779821893817e-05, + "loss": 0.0099, + "step": 2800 + }, + { + "epoch": 1.8924302788844622, + "grad_norm": 0.0013812623219564557, + "learning_rate": 3.0966507866193796e-05, + "loss": 0.007, + "step": 2850 + }, + { + "epoch": 1.9256308100929616, + "grad_norm": 0.0014662343310192227, + "learning_rate": 3.0765217513449426e-05, + "loss": 0.0013, + "step": 2900 + }, + { + "epoch": 1.9588313413014609, + "grad_norm": 0.014669010415673256, + "learning_rate": 3.0563927160705055e-05, + "loss": 0.0134, + "step": 2950 + }, + { + "epoch": 1.9920318725099602, + "grad_norm": 0.014950312674045563, + "learning_rate": 3.036263680796069e-05, + "loss": 0.0061, + "step": 3000 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.9977337786259542, + "eval_f1": 0.9977150455912147, + "eval_loss": 0.013810121454298496, + "eval_precision": 0.9977195036627051, + "eval_recall": 0.9977337786259542, + "eval_runtime": 31.5085, + "eval_samples_per_second": 266.087, + "eval_steps_per_second": 8.315, + "step": 3012 + } + ], + "logging_steps": 50, + "max_steps": 10542, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.282861088518144e+16, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/trial-9/checkpoint-3012/training_args.bin b/trial-9/checkpoint-3012/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f1454f457da94bd2c196f1046d24d373d571f40 --- /dev/null +++ b/trial-9/checkpoint-3012/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72c66611e9dfe0c92b05ede60995e52f447a378ac3e2dd77b9ffae1fd950a0d4 +size 5368