Model save
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- README.md +73 -0
- config.json +47 -0
- model.safetensors +3 -0
- training_args.bin +3 -0
- trial-0/checkpoint-1506/config.json +47 -0
- trial-0/checkpoint-1506/model.safetensors +3 -0
- trial-0/checkpoint-1506/optimizer.pt +3 -0
- trial-0/checkpoint-1506/rng_state.pth +3 -0
- trial-0/checkpoint-1506/scheduler.pt +3 -0
- trial-0/checkpoint-1506/trainer_state.json +255 -0
- trial-0/checkpoint-1506/training_args.bin +3 -0
- trial-1/checkpoint-6022/config.json +47 -0
- trial-1/checkpoint-6022/model.safetensors +3 -0
- trial-1/checkpoint-6022/optimizer.pt +3 -0
- trial-1/checkpoint-6022/rng_state.pth +3 -0
- trial-1/checkpoint-6022/scheduler.pt +3 -0
- trial-1/checkpoint-6022/trainer_state.json +897 -0
- trial-1/checkpoint-6022/training_args.bin +3 -0
- trial-2/checkpoint-6022/config.json +47 -0
- trial-2/checkpoint-6022/model.safetensors +3 -0
- trial-2/checkpoint-6022/optimizer.pt +3 -0
- trial-2/checkpoint-6022/rng_state.pth +3 -0
- trial-2/checkpoint-6022/scheduler.pt +3 -0
- trial-2/checkpoint-6022/trainer_state.json +897 -0
- trial-2/checkpoint-6022/training_args.bin +3 -0
- trial-3/checkpoint-1506/config.json +47 -0
- trial-3/checkpoint-1506/model.safetensors +3 -0
- trial-3/checkpoint-1506/optimizer.pt +3 -0
- trial-3/checkpoint-1506/rng_state.pth +3 -0
- trial-3/checkpoint-1506/scheduler.pt +3 -0
- trial-3/checkpoint-1506/trainer_state.json +255 -0
- trial-3/checkpoint-1506/training_args.bin +3 -0
- trial-4/checkpoint-3011/config.json +47 -0
- trial-4/checkpoint-3011/model.safetensors +3 -0
- trial-4/checkpoint-3011/optimizer.pt +3 -0
- trial-4/checkpoint-3011/rng_state.pth +3 -0
- trial-4/checkpoint-3011/scheduler.pt +3 -0
- trial-4/checkpoint-3011/trainer_state.json +465 -0
- trial-4/checkpoint-3011/training_args.bin +3 -0
- trial-5/checkpoint-3012/config.json +47 -0
- trial-5/checkpoint-3012/model.safetensors +3 -0
- trial-5/checkpoint-3012/optimizer.pt +3 -0
- trial-5/checkpoint-3012/rng_state.pth +3 -0
- trial-5/checkpoint-3012/scheduler.pt +3 -0
- trial-5/checkpoint-3012/trainer_state.json +477 -0
- trial-5/checkpoint-3012/training_args.bin +3 -0
- trial-6/checkpoint-6022/config.json +47 -0
- trial-6/checkpoint-6022/model.safetensors +3 -0
- trial-6/checkpoint-6022/optimizer.pt +3 -0
- trial-6/checkpoint-6022/rng_state.pth +3 -0
README.md
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
library_name: transformers
|
3 |
+
license: apache-2.0
|
4 |
+
base_model: answerdotai/ModernBERT-base
|
5 |
+
tags:
|
6 |
+
- generated_from_trainer
|
7 |
+
metrics:
|
8 |
+
- accuracy
|
9 |
+
- precision
|
10 |
+
- recall
|
11 |
+
- f1
|
12 |
+
model-index:
|
13 |
+
- name: answerdotai-ModernBERT-base-finetuned
|
14 |
+
results: []
|
15 |
+
---
|
16 |
+
|
17 |
+
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
18 |
+
should probably proofread and complete it, then remove this comment. -->
|
19 |
+
|
20 |
+
# answerdotai-ModernBERT-base-finetuned
|
21 |
+
|
22 |
+
This model is a fine-tuned version of [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base) on an unspecified dataset (the Trainer was not given a dataset name).
|
23 |
+
It achieves the following results on the evaluation set:
|
24 |
+
- Loss: 0.0116
|
25 |
+
- Accuracy: 0.9976
|
26 |
+
- Precision: 0.9977
|
27 |
+
- Recall: 0.9976
|
28 |
+
- F1: 0.9976
|
29 |
+
|
30 |
+
## Model description
|
31 |
+
|
32 |
+
More information needed
|
33 |
+
|
34 |
+
## Intended uses & limitations
|
35 |
+
|
36 |
+
More information needed
|
37 |
+
|
38 |
+
## Training and evaluation data
|
39 |
+
|
40 |
+
More information needed
|
41 |
+
|
42 |
+
## Training procedure
|
43 |
+
|
44 |
+
### Training hyperparameters
|
45 |
+
|
46 |
+
The following hyperparameters were used during training:
|
47 |
+
- learning_rate: 4.244005797262286e-05
|
48 |
+
- train_batch_size: 32
|
49 |
+
- eval_batch_size: 32
|
50 |
+
- seed: 42
|
51 |
+
- optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9,0.999) and epsilon=1e-08; no additional optimizer arguments
|
52 |
+
- lr_scheduler_type: linear
|
53 |
+
- num_epochs: 7
|
54 |
+
|
55 |
+
### Training results
|
56 |
+
|
57 |
+
| Training Loss | Epoch | Step | Validation Loss | Accuracy | Precision | Recall | F1 |
|
58 |
+
|:-------------:|:-----:|:-----:|:---------------:|:--------:|:---------:|:------:|:------:|
|
59 |
+
| 0.0175 | 1.0 | 1506 | 0.0195 | 0.9971 | 0.9971 | 0.9971 | 0.9971 |
|
60 |
+
| 0.0134 | 2.0 | 3012 | 0.0153 | 0.9970 | 0.9970 | 0.9970 | 0.9970 |
|
61 |
+
| 0.0 | 3.0 | 4518 | 0.0228 | 0.9976 | 0.9976 | 0.9976 | 0.9976 |
|
62 |
+
| 0.0 | 4.0 | 6024 | 0.0270 | 0.9976 | 0.9976 | 0.9976 | 0.9976 |
|
63 |
+
| 0.0 | 5.0 | 7530 | 0.0272 | 0.9976 | 0.9976 | 0.9976 | 0.9976 |
|
64 |
+
| 0.0 | 6.0 | 9036 | 0.0279 | 0.9975 | 0.9975 | 0.9975 | 0.9975 |
|
65 |
+
| 0.0 | 7.0 | 10542 | 0.0283 | 0.9975 | 0.9975 | 0.9975 | 0.9975 |
|
66 |
+
|
67 |
+
|
68 |
+
### Framework versions
|
69 |
+
|
70 |
+
- Transformers 4.48.0.dev0
|
71 |
+
- PyTorch 2.5.1+cu124
|
72 |
+
- Datasets 3.2.0
|
73 |
+
- Tokenizers 0.21.0
|
config.json
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "answerdotai/ModernBERT-base",
|
3 |
+
"architectures": [
|
4 |
+
"ModernBertForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_bias": false,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 50281,
|
9 |
+
"classifier_activation": "gelu",
|
10 |
+
"classifier_bias": false,
|
11 |
+
"classifier_dropout": 0.0,
|
12 |
+
"classifier_pooling": "mean",
|
13 |
+
"cls_token_id": 50281,
|
14 |
+
"decoder_bias": true,
|
15 |
+
"deterministic_flash_attn": false,
|
16 |
+
"embedding_dropout": 0.0,
|
17 |
+
"eos_token_id": 50282,
|
18 |
+
"global_attn_every_n_layers": 3,
|
19 |
+
"global_rope_theta": 160000.0,
|
20 |
+
"gradient_checkpointing": false,
|
21 |
+
"hidden_activation": "gelu",
|
22 |
+
"hidden_size": 768,
|
23 |
+
"initializer_cutoff_factor": 2.0,
|
24 |
+
"initializer_range": 0.02,
|
25 |
+
"intermediate_size": 1152,
|
26 |
+
"layer_norm_eps": 1e-05,
|
27 |
+
"local_attention": 128,
|
28 |
+
"local_rope_theta": 10000.0,
|
29 |
+
"max_position_embeddings": 8192,
|
30 |
+
"mlp_bias": false,
|
31 |
+
"mlp_dropout": 0.0,
|
32 |
+
"model_type": "modernbert",
|
33 |
+
"norm_bias": false,
|
34 |
+
"norm_eps": 1e-05,
|
35 |
+
"num_attention_heads": 12,
|
36 |
+
"num_hidden_layers": 22,
|
37 |
+
"pad_token_id": 50283,
|
38 |
+
"position_embedding_type": "absolute",
|
39 |
+
"problem_type": "single_label_classification",
|
40 |
+
"reference_compile": true,
|
41 |
+
"sep_token_id": 50282,
|
42 |
+
"sparse_pred_ignore_index": -100,
|
43 |
+
"sparse_prediction": false,
|
44 |
+
"torch_dtype": "float32",
|
45 |
+
"transformers_version": "4.48.0.dev0",
|
46 |
+
"vocab_size": 50368
|
47 |
+
}
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bd2c8555404b25095196f950baad8216db0404ff16448d62a6d453105d7bd0c7
|
3 |
+
size 598439784
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:33b0c987e99ad21c3b9517dc831f21fd66bcbcd55d62a62f0a28008a0e8674e2
|
3 |
+
size 5432
|
trial-0/checkpoint-1506/config.json
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "answerdotai/ModernBERT-base",
|
3 |
+
"architectures": [
|
4 |
+
"ModernBertForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_bias": false,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 50281,
|
9 |
+
"classifier_activation": "gelu",
|
10 |
+
"classifier_bias": false,
|
11 |
+
"classifier_dropout": 0.0,
|
12 |
+
"classifier_pooling": "mean",
|
13 |
+
"cls_token_id": 50281,
|
14 |
+
"decoder_bias": true,
|
15 |
+
"deterministic_flash_attn": false,
|
16 |
+
"embedding_dropout": 0.0,
|
17 |
+
"eos_token_id": 50282,
|
18 |
+
"global_attn_every_n_layers": 3,
|
19 |
+
"global_rope_theta": 160000.0,
|
20 |
+
"gradient_checkpointing": false,
|
21 |
+
"hidden_activation": "gelu",
|
22 |
+
"hidden_size": 768,
|
23 |
+
"initializer_cutoff_factor": 2.0,
|
24 |
+
"initializer_range": 0.02,
|
25 |
+
"intermediate_size": 1152,
|
26 |
+
"layer_norm_eps": 1e-05,
|
27 |
+
"local_attention": 128,
|
28 |
+
"local_rope_theta": 10000.0,
|
29 |
+
"max_position_embeddings": 8192,
|
30 |
+
"mlp_bias": false,
|
31 |
+
"mlp_dropout": 0.0,
|
32 |
+
"model_type": "modernbert",
|
33 |
+
"norm_bias": false,
|
34 |
+
"norm_eps": 1e-05,
|
35 |
+
"num_attention_heads": 12,
|
36 |
+
"num_hidden_layers": 22,
|
37 |
+
"pad_token_id": 50283,
|
38 |
+
"position_embedding_type": "absolute",
|
39 |
+
"problem_type": "single_label_classification",
|
40 |
+
"reference_compile": true,
|
41 |
+
"sep_token_id": 50282,
|
42 |
+
"sparse_pred_ignore_index": -100,
|
43 |
+
"sparse_prediction": false,
|
44 |
+
"torch_dtype": "float32",
|
45 |
+
"transformers_version": "4.48.0.dev0",
|
46 |
+
"vocab_size": 50368
|
47 |
+
}
|
trial-0/checkpoint-1506/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:68eefa4a9be7b2db68618e1cb44c2cdf2163fb53cc3380fc52767266b121ddd2
|
3 |
+
size 598439784
|
trial-0/checkpoint-1506/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:08a1a4cc69805f73befa2723d41c1d97c0a2f799125f15e25de8295d6c23580c
|
3 |
+
size 1196967418
|
trial-0/checkpoint-1506/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:568428d80a25211a390c359ca51b0b20b38ca0607fbc196f106c9841c02d3e59
|
3 |
+
size 14244
|
trial-0/checkpoint-1506/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c5bddebb63f2196cebff07c6da8f9e668e8379463981f8be40fb7e151e6c09ff
|
3 |
+
size 1064
|
trial-0/checkpoint-1506/trainer_state.json
ADDED
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.02135350927710533,
|
3 |
+
"best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-0/checkpoint-1506",
|
4 |
+
"epoch": 1.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 1506,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.033200531208499334,
|
13 |
+
"grad_norm": 11.822611808776855,
|
14 |
+
"learning_rate": 4.4935320035267014e-05,
|
15 |
+
"loss": 0.295,
|
16 |
+
"step": 50
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.06640106241699867,
|
20 |
+
"grad_norm": 0.11557121574878693,
|
21 |
+
"learning_rate": 4.463495024893502e-05,
|
22 |
+
"loss": 0.0808,
|
23 |
+
"step": 100
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.099601593625498,
|
27 |
+
"grad_norm": 0.01743650808930397,
|
28 |
+
"learning_rate": 4.433458046260302e-05,
|
29 |
+
"loss": 0.052,
|
30 |
+
"step": 150
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.13280212483399734,
|
34 |
+
"grad_norm": 4.474731922149658,
|
35 |
+
"learning_rate": 4.4034210676271024e-05,
|
36 |
+
"loss": 0.0491,
|
37 |
+
"step": 200
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.16600265604249667,
|
41 |
+
"grad_norm": 4.205756664276123,
|
42 |
+
"learning_rate": 4.373384088993902e-05,
|
43 |
+
"loss": 0.0344,
|
44 |
+
"step": 250
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.199203187250996,
|
48 |
+
"grad_norm": 4.239188194274902,
|
49 |
+
"learning_rate": 4.343347110360703e-05,
|
50 |
+
"loss": 0.0295,
|
51 |
+
"step": 300
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.23240371845949534,
|
55 |
+
"grad_norm": 0.19662700593471527,
|
56 |
+
"learning_rate": 4.3133101317275027e-05,
|
57 |
+
"loss": 0.0342,
|
58 |
+
"step": 350
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"epoch": 0.2656042496679947,
|
62 |
+
"grad_norm": 0.008393031544983387,
|
63 |
+
"learning_rate": 4.2832731530943025e-05,
|
64 |
+
"loss": 0.0245,
|
65 |
+
"step": 400
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"epoch": 0.29880478087649404,
|
69 |
+
"grad_norm": 0.06995929777622223,
|
70 |
+
"learning_rate": 4.253236174461103e-05,
|
71 |
+
"loss": 0.0281,
|
72 |
+
"step": 450
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"epoch": 0.33200531208499334,
|
76 |
+
"grad_norm": 0.010315222665667534,
|
77 |
+
"learning_rate": 4.223199195827902e-05,
|
78 |
+
"loss": 0.0188,
|
79 |
+
"step": 500
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"epoch": 0.3652058432934927,
|
83 |
+
"grad_norm": 3.1021769046783447,
|
84 |
+
"learning_rate": 4.193162217194703e-05,
|
85 |
+
"loss": 0.018,
|
86 |
+
"step": 550
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"epoch": 0.398406374501992,
|
90 |
+
"grad_norm": 0.00041495164623484015,
|
91 |
+
"learning_rate": 4.1631252385615027e-05,
|
92 |
+
"loss": 0.0053,
|
93 |
+
"step": 600
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.4316069057104914,
|
97 |
+
"grad_norm": 0.19596342742443085,
|
98 |
+
"learning_rate": 4.133088259928303e-05,
|
99 |
+
"loss": 0.0178,
|
100 |
+
"step": 650
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"epoch": 0.4648074369189907,
|
104 |
+
"grad_norm": 0.0566418319940567,
|
105 |
+
"learning_rate": 4.103051281295103e-05,
|
106 |
+
"loss": 0.0101,
|
107 |
+
"step": 700
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"epoch": 0.49800796812749004,
|
111 |
+
"grad_norm": 0.005816417746245861,
|
112 |
+
"learning_rate": 4.0730143026619036e-05,
|
113 |
+
"loss": 0.0166,
|
114 |
+
"step": 750
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"epoch": 0.5312084993359893,
|
118 |
+
"grad_norm": 2.2474324703216553,
|
119 |
+
"learning_rate": 4.0429773240287035e-05,
|
120 |
+
"loss": 0.0156,
|
121 |
+
"step": 800
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"epoch": 0.5644090305444888,
|
125 |
+
"grad_norm": 0.06311876326799393,
|
126 |
+
"learning_rate": 4.0129403453955033e-05,
|
127 |
+
"loss": 0.0166,
|
128 |
+
"step": 850
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"epoch": 0.5976095617529881,
|
132 |
+
"grad_norm": 0.012764506973326206,
|
133 |
+
"learning_rate": 3.982903366762304e-05,
|
134 |
+
"loss": 0.0175,
|
135 |
+
"step": 900
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 0.6308100929614874,
|
139 |
+
"grad_norm": 0.00253055221401155,
|
140 |
+
"learning_rate": 3.952866388129104e-05,
|
141 |
+
"loss": 0.0047,
|
142 |
+
"step": 950
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"epoch": 0.6640106241699867,
|
146 |
+
"grad_norm": 0.03604559600353241,
|
147 |
+
"learning_rate": 3.922829409495904e-05,
|
148 |
+
"loss": 0.016,
|
149 |
+
"step": 1000
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"epoch": 0.6972111553784861,
|
153 |
+
"grad_norm": 0.006498202681541443,
|
154 |
+
"learning_rate": 3.892792430862704e-05,
|
155 |
+
"loss": 0.0055,
|
156 |
+
"step": 1050
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"epoch": 0.7304116865869854,
|
160 |
+
"grad_norm": 0.11296769976615906,
|
161 |
+
"learning_rate": 3.862755452229504e-05,
|
162 |
+
"loss": 0.0122,
|
163 |
+
"step": 1100
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"epoch": 0.7636122177954847,
|
167 |
+
"grad_norm": 0.0005851402529515326,
|
168 |
+
"learning_rate": 3.8327184735963046e-05,
|
169 |
+
"loss": 0.01,
|
170 |
+
"step": 1150
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"epoch": 0.796812749003984,
|
174 |
+
"grad_norm": 0.018440622836351395,
|
175 |
+
"learning_rate": 3.8026814949631044e-05,
|
176 |
+
"loss": 0.0064,
|
177 |
+
"step": 1200
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"epoch": 0.8300132802124834,
|
181 |
+
"grad_norm": 0.0023099363315850496,
|
182 |
+
"learning_rate": 3.772644516329905e-05,
|
183 |
+
"loss": 0.0011,
|
184 |
+
"step": 1250
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"epoch": 0.8632138114209827,
|
188 |
+
"grad_norm": 0.07595626264810562,
|
189 |
+
"learning_rate": 3.742607537696705e-05,
|
190 |
+
"loss": 0.0156,
|
191 |
+
"step": 1300
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"epoch": 0.896414342629482,
|
195 |
+
"grad_norm": 0.0008996099350042641,
|
196 |
+
"learning_rate": 3.7125705590635054e-05,
|
197 |
+
"loss": 0.0103,
|
198 |
+
"step": 1350
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"epoch": 0.9296148738379814,
|
202 |
+
"grad_norm": 3.656134504126385e-05,
|
203 |
+
"learning_rate": 3.682533580430305e-05,
|
204 |
+
"loss": 0.0027,
|
205 |
+
"step": 1400
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"epoch": 0.9628154050464808,
|
209 |
+
"grad_norm": 0.2666904032230377,
|
210 |
+
"learning_rate": 3.652496601797105e-05,
|
211 |
+
"loss": 0.0152,
|
212 |
+
"step": 1450
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"epoch": 0.9960159362549801,
|
216 |
+
"grad_norm": 0.011590929701924324,
|
217 |
+
"learning_rate": 3.622459623163905e-05,
|
218 |
+
"loss": 0.0115,
|
219 |
+
"step": 1500
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"epoch": 1.0,
|
223 |
+
"eval_accuracy": 0.9963024809160306,
|
224 |
+
"eval_f1": 0.9962997469825083,
|
225 |
+
"eval_loss": 0.02135350927710533,
|
226 |
+
"eval_precision": 0.9962971957079396,
|
227 |
+
"eval_recall": 0.9963024809160306,
|
228 |
+
"eval_runtime": 34.0647,
|
229 |
+
"eval_samples_per_second": 246.12,
|
230 |
+
"eval_steps_per_second": 7.691,
|
231 |
+
"step": 1506
|
232 |
+
}
|
233 |
+
],
|
234 |
+
"logging_steps": 50,
|
235 |
+
"max_steps": 7530,
|
236 |
+
"num_input_tokens_seen": 0,
|
237 |
+
"num_train_epochs": 5,
|
238 |
+
"save_steps": 500,
|
239 |
+
"stateful_callbacks": {
|
240 |
+
"TrainerControl": {
|
241 |
+
"args": {
|
242 |
+
"should_epoch_stop": false,
|
243 |
+
"should_evaluate": false,
|
244 |
+
"should_log": false,
|
245 |
+
"should_save": true,
|
246 |
+
"should_training_stop": false
|
247 |
+
},
|
248 |
+
"attributes": {}
|
249 |
+
}
|
250 |
+
},
|
251 |
+
"total_flos": 1.641430544259072e+16,
|
252 |
+
"train_batch_size": 32,
|
253 |
+
"trial_name": null,
|
254 |
+
"trial_params": null
|
255 |
+
}
|
trial-0/checkpoint-1506/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f87e0989b8aabc63686d8b1c4f4f6463501f9b534fd10b5dda472e02e5c6d200
|
3 |
+
size 5368
|
trial-1/checkpoint-6022/config.json
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "answerdotai/ModernBERT-base",
|
3 |
+
"architectures": [
|
4 |
+
"ModernBertForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_bias": false,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 50281,
|
9 |
+
"classifier_activation": "gelu",
|
10 |
+
"classifier_bias": false,
|
11 |
+
"classifier_dropout": 0.0,
|
12 |
+
"classifier_pooling": "mean",
|
13 |
+
"cls_token_id": 50281,
|
14 |
+
"decoder_bias": true,
|
15 |
+
"deterministic_flash_attn": false,
|
16 |
+
"embedding_dropout": 0.0,
|
17 |
+
"eos_token_id": 50282,
|
18 |
+
"global_attn_every_n_layers": 3,
|
19 |
+
"global_rope_theta": 160000.0,
|
20 |
+
"gradient_checkpointing": false,
|
21 |
+
"hidden_activation": "gelu",
|
22 |
+
"hidden_size": 768,
|
23 |
+
"initializer_cutoff_factor": 2.0,
|
24 |
+
"initializer_range": 0.02,
|
25 |
+
"intermediate_size": 1152,
|
26 |
+
"layer_norm_eps": 1e-05,
|
27 |
+
"local_attention": 128,
|
28 |
+
"local_rope_theta": 10000.0,
|
29 |
+
"max_position_embeddings": 8192,
|
30 |
+
"mlp_bias": false,
|
31 |
+
"mlp_dropout": 0.0,
|
32 |
+
"model_type": "modernbert",
|
33 |
+
"norm_bias": false,
|
34 |
+
"norm_eps": 1e-05,
|
35 |
+
"num_attention_heads": 12,
|
36 |
+
"num_hidden_layers": 22,
|
37 |
+
"pad_token_id": 50283,
|
38 |
+
"position_embedding_type": "absolute",
|
39 |
+
"problem_type": "single_label_classification",
|
40 |
+
"reference_compile": true,
|
41 |
+
"sep_token_id": 50282,
|
42 |
+
"sparse_pred_ignore_index": -100,
|
43 |
+
"sparse_prediction": false,
|
44 |
+
"torch_dtype": "float32",
|
45 |
+
"transformers_version": "4.48.0.dev0",
|
46 |
+
"vocab_size": 50368
|
47 |
+
}
|
trial-1/checkpoint-6022/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9376e02caf20a3536db5adaec49e89c8583378974c975bdfa4e4fa72bb7ed87c
|
3 |
+
size 598439784
|
trial-1/checkpoint-6022/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3f989a18c3b9f0cb969ade19c78b7d7d4405053c69000081f12d16f8076c4691
|
3 |
+
size 1196967418
|
trial-1/checkpoint-6022/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
|
3 |
+
size 14244
|
trial-1/checkpoint-6022/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:04bd594b0cd8e46cee28cfc34b0ba6a02854df28789c81eb4c180d9356f4de00
|
3 |
+
size 1064
|
trial-1/checkpoint-6022/trainer_state.json
ADDED
@@ -0,0 +1,897 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.0445549376308918,
|
3 |
+
"best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-1/checkpoint-6022",
|
4 |
+
"epoch": 2.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 6022,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.016605778811026237,
|
13 |
+
"grad_norm": 15.757351875305176,
|
14 |
+
"learning_rate": 2.4306427769118723e-06,
|
15 |
+
"loss": 0.6703,
|
16 |
+
"step": 50
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.033211557622052475,
|
20 |
+
"grad_norm": 14.056926727294922,
|
21 |
+
"learning_rate": 2.425586942863882e-06,
|
22 |
+
"loss": 0.4736,
|
23 |
+
"step": 100
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.04981733643307871,
|
27 |
+
"grad_norm": 15.678231239318848,
|
28 |
+
"learning_rate": 2.4205311088158915e-06,
|
29 |
+
"loss": 0.338,
|
30 |
+
"step": 150
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.06642311524410495,
|
34 |
+
"grad_norm": 4.84220552444458,
|
35 |
+
"learning_rate": 2.4154752747679013e-06,
|
36 |
+
"loss": 0.2931,
|
37 |
+
"step": 200
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.08302889405513118,
|
41 |
+
"grad_norm": 5.182389736175537,
|
42 |
+
"learning_rate": 2.4104194407199107e-06,
|
43 |
+
"loss": 0.251,
|
44 |
+
"step": 250
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.09963467286615742,
|
48 |
+
"grad_norm": 1.5187151432037354,
|
49 |
+
"learning_rate": 2.4053636066719205e-06,
|
50 |
+
"loss": 0.2133,
|
51 |
+
"step": 300
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.11624045167718366,
|
55 |
+
"grad_norm": 16.253589630126953,
|
56 |
+
"learning_rate": 2.40030777262393e-06,
|
57 |
+
"loss": 0.1518,
|
58 |
+
"step": 350
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"epoch": 0.1328462304882099,
|
62 |
+
"grad_norm": 6.757865905761719,
|
63 |
+
"learning_rate": 2.3952519385759397e-06,
|
64 |
+
"loss": 0.1508,
|
65 |
+
"step": 400
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"epoch": 0.14945200929923613,
|
69 |
+
"grad_norm": 2.119438886642456,
|
70 |
+
"learning_rate": 2.390196104527949e-06,
|
71 |
+
"loss": 0.1175,
|
72 |
+
"step": 450
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"epoch": 0.16605778811026237,
|
76 |
+
"grad_norm": 15.932334899902344,
|
77 |
+
"learning_rate": 2.3851402704799585e-06,
|
78 |
+
"loss": 0.1401,
|
79 |
+
"step": 500
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"epoch": 0.1826635669212886,
|
83 |
+
"grad_norm": 22.459735870361328,
|
84 |
+
"learning_rate": 2.3800844364319683e-06,
|
85 |
+
"loss": 0.1384,
|
86 |
+
"step": 550
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"epoch": 0.19926934573231483,
|
90 |
+
"grad_norm": 10.65778923034668,
|
91 |
+
"learning_rate": 2.3750286023839777e-06,
|
92 |
+
"loss": 0.1179,
|
93 |
+
"step": 600
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.2158751245433411,
|
97 |
+
"grad_norm": 6.71965217590332,
|
98 |
+
"learning_rate": 2.3699727683359876e-06,
|
99 |
+
"loss": 0.0782,
|
100 |
+
"step": 650
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"epoch": 0.23248090335436733,
|
104 |
+
"grad_norm": 3.6098344326019287,
|
105 |
+
"learning_rate": 2.364916934287997e-06,
|
106 |
+
"loss": 0.138,
|
107 |
+
"step": 700
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"epoch": 0.24908668216539356,
|
111 |
+
"grad_norm": 2.3249447345733643,
|
112 |
+
"learning_rate": 2.3598611002400068e-06,
|
113 |
+
"loss": 0.1087,
|
114 |
+
"step": 750
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"epoch": 0.2656924609764198,
|
118 |
+
"grad_norm": 15.047837257385254,
|
119 |
+
"learning_rate": 2.354805266192016e-06,
|
120 |
+
"loss": 0.0868,
|
121 |
+
"step": 800
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"epoch": 0.282298239787446,
|
125 |
+
"grad_norm": 6.7322773933410645,
|
126 |
+
"learning_rate": 2.349749432144026e-06,
|
127 |
+
"loss": 0.0954,
|
128 |
+
"step": 850
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"epoch": 0.29890401859847227,
|
132 |
+
"grad_norm": 12.954623222351074,
|
133 |
+
"learning_rate": 2.3446935980960354e-06,
|
134 |
+
"loss": 0.0689,
|
135 |
+
"step": 900
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 0.3155097974094985,
|
139 |
+
"grad_norm": 1.4312756061553955,
|
140 |
+
"learning_rate": 2.3396377640480448e-06,
|
141 |
+
"loss": 0.0908,
|
142 |
+
"step": 950
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"epoch": 0.33211557622052473,
|
146 |
+
"grad_norm": 0.21316280961036682,
|
147 |
+
"learning_rate": 2.3345819300000546e-06,
|
148 |
+
"loss": 0.0766,
|
149 |
+
"step": 1000
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"epoch": 0.348721355031551,
|
153 |
+
"grad_norm": 13.642809867858887,
|
154 |
+
"learning_rate": 2.329526095952064e-06,
|
155 |
+
"loss": 0.0533,
|
156 |
+
"step": 1050
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"epoch": 0.3653271338425772,
|
160 |
+
"grad_norm": 14.525202751159668,
|
161 |
+
"learning_rate": 2.324470261904074e-06,
|
162 |
+
"loss": 0.0745,
|
163 |
+
"step": 1100
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"epoch": 0.38193291265360346,
|
167 |
+
"grad_norm": 0.5210687518119812,
|
168 |
+
"learning_rate": 2.319414427856083e-06,
|
169 |
+
"loss": 0.0618,
|
170 |
+
"step": 1150
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"epoch": 0.39853869146462967,
|
174 |
+
"grad_norm": 0.07292640954256058,
|
175 |
+
"learning_rate": 2.314358593808093e-06,
|
176 |
+
"loss": 0.0307,
|
177 |
+
"step": 1200
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"epoch": 0.41514447027565593,
|
181 |
+
"grad_norm": 0.08236780017614365,
|
182 |
+
"learning_rate": 2.309302759760103e-06,
|
183 |
+
"loss": 0.0321,
|
184 |
+
"step": 1250
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"epoch": 0.4317502490866822,
|
188 |
+
"grad_norm": 28.97471809387207,
|
189 |
+
"learning_rate": 2.304246925712112e-06,
|
190 |
+
"loss": 0.0748,
|
191 |
+
"step": 1300
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"epoch": 0.4483560278977084,
|
195 |
+
"grad_norm": 0.4781515896320343,
|
196 |
+
"learning_rate": 2.2991910916641216e-06,
|
197 |
+
"loss": 0.0733,
|
198 |
+
"step": 1350
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"epoch": 0.46496180670873466,
|
202 |
+
"grad_norm": 3.214794397354126,
|
203 |
+
"learning_rate": 2.2941352576161314e-06,
|
204 |
+
"loss": 0.0149,
|
205 |
+
"step": 1400
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"epoch": 0.48156758551976087,
|
209 |
+
"grad_norm": 0.3289443850517273,
|
210 |
+
"learning_rate": 2.289079423568141e-06,
|
211 |
+
"loss": 0.0401,
|
212 |
+
"step": 1450
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"epoch": 0.4981733643307871,
|
216 |
+
"grad_norm": 0.12368986010551453,
|
217 |
+
"learning_rate": 2.28402358952015e-06,
|
218 |
+
"loss": 0.0334,
|
219 |
+
"step": 1500
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"epoch": 0.5147791431418134,
|
223 |
+
"grad_norm": 0.08283340185880661,
|
224 |
+
"learning_rate": 2.27896775547216e-06,
|
225 |
+
"loss": 0.0331,
|
226 |
+
"step": 1550
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"epoch": 0.5313849219528396,
|
230 |
+
"grad_norm": 2.650063991546631,
|
231 |
+
"learning_rate": 2.2739119214241694e-06,
|
232 |
+
"loss": 0.0496,
|
233 |
+
"step": 1600
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"epoch": 0.5479907007638658,
|
237 |
+
"grad_norm": 3.296297311782837,
|
238 |
+
"learning_rate": 2.2688560873761792e-06,
|
239 |
+
"loss": 0.0365,
|
240 |
+
"step": 1650
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"epoch": 0.564596479574892,
|
244 |
+
"grad_norm": 0.032304324209690094,
|
245 |
+
"learning_rate": 2.263800253328189e-06,
|
246 |
+
"loss": 0.005,
|
247 |
+
"step": 1700
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"epoch": 0.5812022583859183,
|
251 |
+
"grad_norm": 0.003552216337993741,
|
252 |
+
"learning_rate": 2.2587444192801985e-06,
|
253 |
+
"loss": 0.0183,
|
254 |
+
"step": 1750
|
255 |
+
},
|
256 |
+
{
|
257 |
+
"epoch": 0.5978080371969445,
|
258 |
+
"grad_norm": 0.0315885953605175,
|
259 |
+
"learning_rate": 2.253688585232208e-06,
|
260 |
+
"loss": 0.0184,
|
261 |
+
"step": 1800
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"epoch": 0.6144138160079707,
|
265 |
+
"grad_norm": 0.004702410195022821,
|
266 |
+
"learning_rate": 2.2486327511842177e-06,
|
267 |
+
"loss": 0.0346,
|
268 |
+
"step": 1850
|
269 |
+
},
|
270 |
+
{
|
271 |
+
"epoch": 0.631019594818997,
|
272 |
+
"grad_norm": 0.07862639427185059,
|
273 |
+
"learning_rate": 2.243576917136227e-06,
|
274 |
+
"loss": 0.0296,
|
275 |
+
"step": 1900
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"epoch": 0.6476253736300233,
|
279 |
+
"grad_norm": 0.3578585982322693,
|
280 |
+
"learning_rate": 2.2385210830882364e-06,
|
281 |
+
"loss": 0.0266,
|
282 |
+
"step": 1950
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"epoch": 0.6642311524410495,
|
286 |
+
"grad_norm": 0.045335959643125534,
|
287 |
+
"learning_rate": 2.2334652490402463e-06,
|
288 |
+
"loss": 0.032,
|
289 |
+
"step": 2000
|
290 |
+
},
|
291 |
+
{
|
292 |
+
"epoch": 0.6808369312520757,
|
293 |
+
"grad_norm": 1.6869137287139893,
|
294 |
+
"learning_rate": 2.2284094149922557e-06,
|
295 |
+
"loss": 0.0297,
|
296 |
+
"step": 2050
|
297 |
+
},
|
298 |
+
{
|
299 |
+
"epoch": 0.697442710063102,
|
300 |
+
"grad_norm": 0.6017621755599976,
|
301 |
+
"learning_rate": 2.2233535809442655e-06,
|
302 |
+
"loss": 0.0119,
|
303 |
+
"step": 2100
|
304 |
+
},
|
305 |
+
{
|
306 |
+
"epoch": 0.7140484888741282,
|
307 |
+
"grad_norm": 0.13145552575588226,
|
308 |
+
"learning_rate": 2.2182977468962753e-06,
|
309 |
+
"loss": 0.0157,
|
310 |
+
"step": 2150
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"epoch": 0.7306542676851544,
|
314 |
+
"grad_norm": 0.00971242692321539,
|
315 |
+
"learning_rate": 2.2132419128482847e-06,
|
316 |
+
"loss": 0.0099,
|
317 |
+
"step": 2200
|
318 |
+
},
|
319 |
+
{
|
320 |
+
"epoch": 0.7472600464961807,
|
321 |
+
"grad_norm": 0.5801131725311279,
|
322 |
+
"learning_rate": 2.208186078800294e-06,
|
323 |
+
"loss": 0.0235,
|
324 |
+
"step": 2250
|
325 |
+
},
|
326 |
+
{
|
327 |
+
"epoch": 0.7638658253072069,
|
328 |
+
"grad_norm": 0.008363746106624603,
|
329 |
+
"learning_rate": 2.203130244752304e-06,
|
330 |
+
"loss": 0.0275,
|
331 |
+
"step": 2300
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"epoch": 0.7804716041182331,
|
335 |
+
"grad_norm": 0.23013177514076233,
|
336 |
+
"learning_rate": 2.1980744107043133e-06,
|
337 |
+
"loss": 0.0022,
|
338 |
+
"step": 2350
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"epoch": 0.7970773829292593,
|
342 |
+
"grad_norm": 0.044313572347164154,
|
343 |
+
"learning_rate": 2.1930185766563227e-06,
|
344 |
+
"loss": 0.0185,
|
345 |
+
"step": 2400
|
346 |
+
},
|
347 |
+
{
|
348 |
+
"epoch": 0.8136831617402857,
|
349 |
+
"grad_norm": 0.008519169874489307,
|
350 |
+
"learning_rate": 2.1879627426083325e-06,
|
351 |
+
"loss": 0.0023,
|
352 |
+
"step": 2450
|
353 |
+
},
|
354 |
+
{
|
355 |
+
"epoch": 0.8302889405513119,
|
356 |
+
"grad_norm": 0.0008576350519433618,
|
357 |
+
"learning_rate": 2.182906908560342e-06,
|
358 |
+
"loss": 0.0062,
|
359 |
+
"step": 2500
|
360 |
+
},
|
361 |
+
{
|
362 |
+
"epoch": 0.8468947193623381,
|
363 |
+
"grad_norm": 0.56068354845047,
|
364 |
+
"learning_rate": 2.1778510745123517e-06,
|
365 |
+
"loss": 0.0106,
|
366 |
+
"step": 2550
|
367 |
+
},
|
368 |
+
{
|
369 |
+
"epoch": 0.8635004981733644,
|
370 |
+
"grad_norm": 33.770652770996094,
|
371 |
+
"learning_rate": 2.1727952404643615e-06,
|
372 |
+
"loss": 0.0298,
|
373 |
+
"step": 2600
|
374 |
+
},
|
375 |
+
{
|
376 |
+
"epoch": 0.8801062769843906,
|
377 |
+
"grad_norm": 0.0006891911034472287,
|
378 |
+
"learning_rate": 2.167739406416371e-06,
|
379 |
+
"loss": 0.0046,
|
380 |
+
"step": 2650
|
381 |
+
},
|
382 |
+
{
|
383 |
+
"epoch": 0.8967120557954168,
|
384 |
+
"grad_norm": 0.000691475928761065,
|
385 |
+
"learning_rate": 2.1626835723683803e-06,
|
386 |
+
"loss": 0.0014,
|
387 |
+
"step": 2700
|
388 |
+
},
|
389 |
+
{
|
390 |
+
"epoch": 0.913317834606443,
|
391 |
+
"grad_norm": 0.022216275334358215,
|
392 |
+
"learning_rate": 2.15762773832039e-06,
|
393 |
+
"loss": 0.0152,
|
394 |
+
"step": 2750
|
395 |
+
},
|
396 |
+
{
|
397 |
+
"epoch": 0.9299236134174693,
|
398 |
+
"grad_norm": 0.0004267705953679979,
|
399 |
+
"learning_rate": 2.1525719042723995e-06,
|
400 |
+
"loss": 0.0117,
|
401 |
+
"step": 2800
|
402 |
+
},
|
403 |
+
{
|
404 |
+
"epoch": 0.9465293922284955,
|
405 |
+
"grad_norm": 0.016712836921215057,
|
406 |
+
"learning_rate": 2.147516070224409e-06,
|
407 |
+
"loss": 0.0009,
|
408 |
+
"step": 2850
|
409 |
+
},
|
410 |
+
{
|
411 |
+
"epoch": 0.9631351710395217,
|
412 |
+
"grad_norm": 23.74860382080078,
|
413 |
+
"learning_rate": 2.1424602361764187e-06,
|
414 |
+
"loss": 0.0233,
|
415 |
+
"step": 2900
|
416 |
+
},
|
417 |
+
{
|
418 |
+
"epoch": 0.9797409498505479,
|
419 |
+
"grad_norm": 0.0039037028327584267,
|
420 |
+
"learning_rate": 2.137404402128428e-06,
|
421 |
+
"loss": 0.0193,
|
422 |
+
"step": 2950
|
423 |
+
},
|
424 |
+
{
|
425 |
+
"epoch": 0.9963467286615743,
|
426 |
+
"grad_norm": 0.0023961260449141264,
|
427 |
+
"learning_rate": 2.132348568080438e-06,
|
428 |
+
"loss": 0.0068,
|
429 |
+
"step": 3000
|
430 |
+
},
|
431 |
+
{
|
432 |
+
"epoch": 1.0,
|
433 |
+
"eval_accuracy": 0.9921278625954199,
|
434 |
+
"eval_f1": 0.9921278625954199,
|
435 |
+
"eval_loss": 0.046909503638744354,
|
436 |
+
"eval_precision": 0.9921278625954199,
|
437 |
+
"eval_recall": 0.9921278625954199,
|
438 |
+
"eval_runtime": 36.762,
|
439 |
+
"eval_samples_per_second": 228.061,
|
440 |
+
"eval_steps_per_second": 14.254,
|
441 |
+
"step": 3011
|
442 |
+
},
|
443 |
+
{
|
444 |
+
"epoch": 1.0129525074726005,
|
445 |
+
"grad_norm": 0.0033601378090679646,
|
446 |
+
"learning_rate": 2.1272927340324478e-06,
|
447 |
+
"loss": 0.0005,
|
448 |
+
"step": 3050
|
449 |
+
},
|
450 |
+
{
|
451 |
+
"epoch": 1.0295582862836268,
|
452 |
+
"grad_norm": 0.038166940212249756,
|
453 |
+
"learning_rate": 2.122236899984457e-06,
|
454 |
+
"loss": 0.0002,
|
455 |
+
"step": 3100
|
456 |
+
},
|
457 |
+
{
|
458 |
+
"epoch": 1.0461640650946529,
|
459 |
+
"grad_norm": 0.0003456630220171064,
|
460 |
+
"learning_rate": 2.1171810659364666e-06,
|
461 |
+
"loss": 0.0139,
|
462 |
+
"step": 3150
|
463 |
+
},
|
464 |
+
{
|
465 |
+
"epoch": 1.0627698439056792,
|
466 |
+
"grad_norm": 0.004587268922477961,
|
467 |
+
"learning_rate": 2.1121252318884764e-06,
|
468 |
+
"loss": 0.0001,
|
469 |
+
"step": 3200
|
470 |
+
},
|
471 |
+
{
|
472 |
+
"epoch": 1.0793756227167055,
|
473 |
+
"grad_norm": 0.08502045273780823,
|
474 |
+
"learning_rate": 2.1070693978404858e-06,
|
475 |
+
"loss": 0.0216,
|
476 |
+
"step": 3250
|
477 |
+
},
|
478 |
+
{
|
479 |
+
"epoch": 1.0959814015277316,
|
480 |
+
"grad_norm": 0.10945820808410645,
|
481 |
+
"learning_rate": 2.102013563792495e-06,
|
482 |
+
"loss": 0.0256,
|
483 |
+
"step": 3300
|
484 |
+
},
|
485 |
+
{
|
486 |
+
"epoch": 1.112587180338758,
|
487 |
+
"grad_norm": 0.03236968442797661,
|
488 |
+
"learning_rate": 2.096957729744505e-06,
|
489 |
+
"loss": 0.005,
|
490 |
+
"step": 3350
|
491 |
+
},
|
492 |
+
{
|
493 |
+
"epoch": 1.1291929591497842,
|
494 |
+
"grad_norm": 0.007731316145509481,
|
495 |
+
"learning_rate": 2.0919018956965144e-06,
|
496 |
+
"loss": 0.0101,
|
497 |
+
"step": 3400
|
498 |
+
},
|
499 |
+
{
|
500 |
+
"epoch": 1.1457987379608103,
|
501 |
+
"grad_norm": 0.00674546230584383,
|
502 |
+
"learning_rate": 2.086846061648524e-06,
|
503 |
+
"loss": 0.0051,
|
504 |
+
"step": 3450
|
505 |
+
},
|
506 |
+
{
|
507 |
+
"epoch": 1.1624045167718366,
|
508 |
+
"grad_norm": 0.004380326252430677,
|
509 |
+
"learning_rate": 2.081790227600534e-06,
|
510 |
+
"loss": 0.0039,
|
511 |
+
"step": 3500
|
512 |
+
},
|
513 |
+
{
|
514 |
+
"epoch": 1.1790102955828627,
|
515 |
+
"grad_norm": 0.031456008553504944,
|
516 |
+
"learning_rate": 2.0767343935525434e-06,
|
517 |
+
"loss": 0.0001,
|
518 |
+
"step": 3550
|
519 |
+
},
|
520 |
+
{
|
521 |
+
"epoch": 1.195616074393889,
|
522 |
+
"grad_norm": 0.017602458596229553,
|
523 |
+
"learning_rate": 2.071678559504553e-06,
|
524 |
+
"loss": 0.006,
|
525 |
+
"step": 3600
|
526 |
+
},
|
527 |
+
{
|
528 |
+
"epoch": 1.2122218532049154,
|
529 |
+
"grad_norm": 0.009589639492332935,
|
530 |
+
"learning_rate": 2.0666227254565626e-06,
|
531 |
+
"loss": 0.001,
|
532 |
+
"step": 3650
|
533 |
+
},
|
534 |
+
{
|
535 |
+
"epoch": 1.2288276320159415,
|
536 |
+
"grad_norm": 0.003254746785387397,
|
537 |
+
"learning_rate": 2.061566891408572e-06,
|
538 |
+
"loss": 0.0,
|
539 |
+
"step": 3700
|
540 |
+
},
|
541 |
+
{
|
542 |
+
"epoch": 1.2454334108269678,
|
543 |
+
"grad_norm": 0.0011986729223281145,
|
544 |
+
"learning_rate": 2.056511057360582e-06,
|
545 |
+
"loss": 0.0126,
|
546 |
+
"step": 3750
|
547 |
+
},
|
548 |
+
{
|
549 |
+
"epoch": 1.2620391896379939,
|
550 |
+
"grad_norm": 0.006293583195656538,
|
551 |
+
"learning_rate": 2.0514552233125912e-06,
|
552 |
+
"loss": 0.0006,
|
553 |
+
"step": 3800
|
554 |
+
},
|
555 |
+
{
|
556 |
+
"epoch": 1.2786449684490202,
|
557 |
+
"grad_norm": 0.11370380967855453,
|
558 |
+
"learning_rate": 2.0463993892646006e-06,
|
559 |
+
"loss": 0.0252,
|
560 |
+
"step": 3850
|
561 |
+
},
|
562 |
+
{
|
563 |
+
"epoch": 1.2952507472600465,
|
564 |
+
"grad_norm": 0.0018469190690666437,
|
565 |
+
"learning_rate": 2.0413435552166104e-06,
|
566 |
+
"loss": 0.0004,
|
567 |
+
"step": 3900
|
568 |
+
},
|
569 |
+
{
|
570 |
+
"epoch": 1.3118565260710726,
|
571 |
+
"grad_norm": 0.0002411604655208066,
|
572 |
+
"learning_rate": 2.0362877211686202e-06,
|
573 |
+
"loss": 0.003,
|
574 |
+
"step": 3950
|
575 |
+
},
|
576 |
+
{
|
577 |
+
"epoch": 1.328462304882099,
|
578 |
+
"grad_norm": 4.065009852638468e-05,
|
579 |
+
"learning_rate": 2.0312318871206296e-06,
|
580 |
+
"loss": 0.0165,
|
581 |
+
"step": 4000
|
582 |
+
},
|
583 |
+
{
|
584 |
+
"epoch": 1.3450680836931252,
|
585 |
+
"grad_norm": 0.005062599666416645,
|
586 |
+
"learning_rate": 2.0261760530726395e-06,
|
587 |
+
"loss": 0.0028,
|
588 |
+
"step": 4050
|
589 |
+
},
|
590 |
+
{
|
591 |
+
"epoch": 1.3616738625041513,
|
592 |
+
"grad_norm": 0.017400013282895088,
|
593 |
+
"learning_rate": 2.021120219024649e-06,
|
594 |
+
"loss": 0.001,
|
595 |
+
"step": 4100
|
596 |
+
},
|
597 |
+
{
|
598 |
+
"epoch": 1.3782796413151777,
|
599 |
+
"grad_norm": 0.05683843046426773,
|
600 |
+
"learning_rate": 2.0160643849766582e-06,
|
601 |
+
"loss": 0.0124,
|
602 |
+
"step": 4150
|
603 |
+
},
|
604 |
+
{
|
605 |
+
"epoch": 1.394885420126204,
|
606 |
+
"grad_norm": 0.0027029893826693296,
|
607 |
+
"learning_rate": 2.011008550928668e-06,
|
608 |
+
"loss": 0.0003,
|
609 |
+
"step": 4200
|
610 |
+
},
|
611 |
+
{
|
612 |
+
"epoch": 1.41149119893723,
|
613 |
+
"grad_norm": 0.002034110017120838,
|
614 |
+
"learning_rate": 2.0059527168806775e-06,
|
615 |
+
"loss": 0.0073,
|
616 |
+
"step": 4250
|
617 |
+
},
|
618 |
+
{
|
619 |
+
"epoch": 1.4280969777482564,
|
620 |
+
"grad_norm": 0.001398180378600955,
|
621 |
+
"learning_rate": 2.000896882832687e-06,
|
622 |
+
"loss": 0.0044,
|
623 |
+
"step": 4300
|
624 |
+
},
|
625 |
+
{
|
626 |
+
"epoch": 1.4447027565592827,
|
627 |
+
"grad_norm": 0.00037716259248554707,
|
628 |
+
"learning_rate": 1.9958410487846967e-06,
|
629 |
+
"loss": 0.0228,
|
630 |
+
"step": 4350
|
631 |
+
},
|
632 |
+
{
|
633 |
+
"epoch": 1.4613085353703088,
|
634 |
+
"grad_norm": 0.015627387911081314,
|
635 |
+
"learning_rate": 1.9907852147367065e-06,
|
636 |
+
"loss": 0.0114,
|
637 |
+
"step": 4400
|
638 |
+
},
|
639 |
+
{
|
640 |
+
"epoch": 1.4779143141813351,
|
641 |
+
"grad_norm": 0.008964600041508675,
|
642 |
+
"learning_rate": 1.985729380688716e-06,
|
643 |
+
"loss": 0.0032,
|
644 |
+
"step": 4450
|
645 |
+
},
|
646 |
+
{
|
647 |
+
"epoch": 1.4945200929923614,
|
648 |
+
"grad_norm": 0.003252738853916526,
|
649 |
+
"learning_rate": 1.9806735466407257e-06,
|
650 |
+
"loss": 0.0082,
|
651 |
+
"step": 4500
|
652 |
+
},
|
653 |
+
{
|
654 |
+
"epoch": 1.5111258718033875,
|
655 |
+
"grad_norm": 0.00012037971464451402,
|
656 |
+
"learning_rate": 1.975617712592735e-06,
|
657 |
+
"loss": 0.0001,
|
658 |
+
"step": 4550
|
659 |
+
},
|
660 |
+
{
|
661 |
+
"epoch": 1.5277316506144138,
|
662 |
+
"grad_norm": 0.010974590666592121,
|
663 |
+
"learning_rate": 1.9705618785447445e-06,
|
664 |
+
"loss": 0.0,
|
665 |
+
"step": 4600
|
666 |
+
},
|
667 |
+
{
|
668 |
+
"epoch": 1.5443374294254402,
|
669 |
+
"grad_norm": 0.08398176729679108,
|
670 |
+
"learning_rate": 1.9655060444967543e-06,
|
671 |
+
"loss": 0.0002,
|
672 |
+
"step": 4650
|
673 |
+
},
|
674 |
+
{
|
675 |
+
"epoch": 1.5609432082364663,
|
676 |
+
"grad_norm": 0.03629281371831894,
|
677 |
+
"learning_rate": 1.9604502104487637e-06,
|
678 |
+
"loss": 0.006,
|
679 |
+
"step": 4700
|
680 |
+
},
|
681 |
+
{
|
682 |
+
"epoch": 1.5775489870474926,
|
683 |
+
"grad_norm": 0.00034110501292161644,
|
684 |
+
"learning_rate": 1.955394376400773e-06,
|
685 |
+
"loss": 0.0003,
|
686 |
+
"step": 4750
|
687 |
+
},
|
688 |
+
{
|
689 |
+
"epoch": 1.594154765858519,
|
690 |
+
"grad_norm": 0.0027959852013736963,
|
691 |
+
"learning_rate": 1.950338542352783e-06,
|
692 |
+
"loss": 0.0,
|
693 |
+
"step": 4800
|
694 |
+
},
|
695 |
+
{
|
696 |
+
"epoch": 1.610760544669545,
|
697 |
+
"grad_norm": 0.0001677741383900866,
|
698 |
+
"learning_rate": 1.9452827083047927e-06,
|
699 |
+
"loss": 0.0023,
|
700 |
+
"step": 4850
|
701 |
+
},
|
702 |
+
{
|
703 |
+
"epoch": 1.627366323480571,
|
704 |
+
"grad_norm": 0.055583104491233826,
|
705 |
+
"learning_rate": 1.940226874256802e-06,
|
706 |
+
"loss": 0.0225,
|
707 |
+
"step": 4900
|
708 |
+
},
|
709 |
+
{
|
710 |
+
"epoch": 1.6439721022915976,
|
711 |
+
"grad_norm": 8.664117194712162e-05,
|
712 |
+
"learning_rate": 1.935171040208812e-06,
|
713 |
+
"loss": 0.0009,
|
714 |
+
"step": 4950
|
715 |
+
},
|
716 |
+
{
|
717 |
+
"epoch": 1.6605778811026237,
|
718 |
+
"grad_norm": 0.0017323939828202128,
|
719 |
+
"learning_rate": 1.9301152061608213e-06,
|
720 |
+
"loss": 0.008,
|
721 |
+
"step": 5000
|
722 |
+
},
|
723 |
+
{
|
724 |
+
"epoch": 1.6771836599136498,
|
725 |
+
"grad_norm": 0.0034425491467118263,
|
726 |
+
"learning_rate": 1.9250593721128307e-06,
|
727 |
+
"loss": 0.0,
|
728 |
+
"step": 5050
|
729 |
+
},
|
730 |
+
{
|
731 |
+
"epoch": 1.6937894387246761,
|
732 |
+
"grad_norm": 6.076216959627345e-05,
|
733 |
+
"learning_rate": 1.9200035380648405e-06,
|
734 |
+
"loss": 0.0041,
|
735 |
+
"step": 5100
|
736 |
+
},
|
737 |
+
{
|
738 |
+
"epoch": 1.7103952175357025,
|
739 |
+
"grad_norm": 0.0018082900205627084,
|
740 |
+
"learning_rate": 1.91494770401685e-06,
|
741 |
+
"loss": 0.0017,
|
742 |
+
"step": 5150
|
743 |
+
},
|
744 |
+
{
|
745 |
+
"epoch": 1.7270009963467285,
|
746 |
+
"grad_norm": 0.008552160114049911,
|
747 |
+
"learning_rate": 1.9098918699688593e-06,
|
748 |
+
"loss": 0.0137,
|
749 |
+
"step": 5200
|
750 |
+
},
|
751 |
+
{
|
752 |
+
"epoch": 1.7436067751577549,
|
753 |
+
"grad_norm": 0.08908296376466751,
|
754 |
+
"learning_rate": 1.9048360359208694e-06,
|
755 |
+
"loss": 0.0092,
|
756 |
+
"step": 5250
|
757 |
+
},
|
758 |
+
{
|
759 |
+
"epoch": 1.7602125539687812,
|
760 |
+
"grad_norm": 0.002973488997668028,
|
761 |
+
"learning_rate": 1.8997802018728788e-06,
|
762 |
+
"loss": 0.0002,
|
763 |
+
"step": 5300
|
764 |
+
},
|
765 |
+
{
|
766 |
+
"epoch": 1.7768183327798073,
|
767 |
+
"grad_norm": 0.005116044543683529,
|
768 |
+
"learning_rate": 1.8947243678248884e-06,
|
769 |
+
"loss": 0.0079,
|
770 |
+
"step": 5350
|
771 |
+
},
|
772 |
+
{
|
773 |
+
"epoch": 1.7934241115908336,
|
774 |
+
"grad_norm": 0.002092874376103282,
|
775 |
+
"learning_rate": 1.889668533776898e-06,
|
776 |
+
"loss": 0.0,
|
777 |
+
"step": 5400
|
778 |
+
},
|
779 |
+
{
|
780 |
+
"epoch": 1.81002989040186,
|
781 |
+
"grad_norm": 0.0070649790577590466,
|
782 |
+
"learning_rate": 1.8846126997289076e-06,
|
783 |
+
"loss": 0.0,
|
784 |
+
"step": 5450
|
785 |
+
},
|
786 |
+
{
|
787 |
+
"epoch": 1.826635669212886,
|
788 |
+
"grad_norm": 0.001974167302250862,
|
789 |
+
"learning_rate": 1.879556865680917e-06,
|
790 |
+
"loss": 0.016,
|
791 |
+
"step": 5500
|
792 |
+
},
|
793 |
+
{
|
794 |
+
"epoch": 1.8432414480239123,
|
795 |
+
"grad_norm": 0.0012006360339000821,
|
796 |
+
"learning_rate": 1.8745010316329268e-06,
|
797 |
+
"loss": 0.0,
|
798 |
+
"step": 5550
|
799 |
+
},
|
800 |
+
{
|
801 |
+
"epoch": 1.8598472268349386,
|
802 |
+
"grad_norm": 0.006318301893770695,
|
803 |
+
"learning_rate": 1.8694451975849362e-06,
|
804 |
+
"loss": 0.0,
|
805 |
+
"step": 5600
|
806 |
+
},
|
807 |
+
{
|
808 |
+
"epoch": 1.8764530056459647,
|
809 |
+
"grad_norm": 0.0020722977351397276,
|
810 |
+
"learning_rate": 1.8643893635369458e-06,
|
811 |
+
"loss": 0.0104,
|
812 |
+
"step": 5650
|
813 |
+
},
|
814 |
+
{
|
815 |
+
"epoch": 1.893058784456991,
|
816 |
+
"grad_norm": 0.0874456912279129,
|
817 |
+
"learning_rate": 1.8593335294889556e-06,
|
818 |
+
"loss": 0.0023,
|
819 |
+
"step": 5700
|
820 |
+
},
|
821 |
+
{
|
822 |
+
"epoch": 1.9096645632680174,
|
823 |
+
"grad_norm": 0.00042386740096844733,
|
824 |
+
"learning_rate": 1.854277695440965e-06,
|
825 |
+
"loss": 0.0105,
|
826 |
+
"step": 5750
|
827 |
+
},
|
828 |
+
{
|
829 |
+
"epoch": 1.9262703420790435,
|
830 |
+
"grad_norm": 0.05140538513660431,
|
831 |
+
"learning_rate": 1.8492218613929746e-06,
|
832 |
+
"loss": 0.0008,
|
833 |
+
"step": 5800
|
834 |
+
},
|
835 |
+
{
|
836 |
+
"epoch": 1.9428761208900698,
|
837 |
+
"grad_norm": 0.00046465068589895964,
|
838 |
+
"learning_rate": 1.8441660273449842e-06,
|
839 |
+
"loss": 0.0176,
|
840 |
+
"step": 5850
|
841 |
+
},
|
842 |
+
{
|
843 |
+
"epoch": 1.959481899701096,
|
844 |
+
"grad_norm": 0.001875279936939478,
|
845 |
+
"learning_rate": 1.8391101932969938e-06,
|
846 |
+
"loss": 0.0002,
|
847 |
+
"step": 5900
|
848 |
+
},
|
849 |
+
{
|
850 |
+
"epoch": 1.9760876785121222,
|
851 |
+
"grad_norm": 0.0012590339174494147,
|
852 |
+
"learning_rate": 1.8340543592490032e-06,
|
853 |
+
"loss": 0.001,
|
854 |
+
"step": 5950
|
855 |
+
},
|
856 |
+
{
|
857 |
+
"epoch": 1.9926934573231485,
|
858 |
+
"grad_norm": 25.133811950683594,
|
859 |
+
"learning_rate": 1.828998525201013e-06,
|
860 |
+
"loss": 0.0229,
|
861 |
+
"step": 6000
|
862 |
+
},
|
863 |
+
{
|
864 |
+
"epoch": 2.0,
|
865 |
+
"eval_accuracy": 0.995706106870229,
|
866 |
+
"eval_f1": 0.9956269879098661,
|
867 |
+
"eval_loss": 0.0445549376308918,
|
868 |
+
"eval_precision": 0.9956596696711074,
|
869 |
+
"eval_recall": 0.995706106870229,
|
870 |
+
"eval_runtime": 38.3077,
|
871 |
+
"eval_samples_per_second": 218.859,
|
872 |
+
"eval_steps_per_second": 13.679,
|
873 |
+
"step": 6022
|
874 |
+
}
|
875 |
+
],
|
876 |
+
"logging_steps": 50,
|
877 |
+
"max_steps": 24088,
|
878 |
+
"num_input_tokens_seen": 0,
|
879 |
+
"num_train_epochs": 8,
|
880 |
+
"save_steps": 500,
|
881 |
+
"stateful_callbacks": {
|
882 |
+
"TrainerControl": {
|
883 |
+
"args": {
|
884 |
+
"should_epoch_stop": false,
|
885 |
+
"should_evaluate": false,
|
886 |
+
"should_log": false,
|
887 |
+
"should_save": true,
|
888 |
+
"should_training_stop": false
|
889 |
+
},
|
890 |
+
"attributes": {}
|
891 |
+
}
|
892 |
+
},
|
893 |
+
"total_flos": 3.282861088518144e+16,
|
894 |
+
"train_batch_size": 16,
|
895 |
+
"trial_name": null,
|
896 |
+
"trial_params": null
|
897 |
+
}
|
trial-1/checkpoint-6022/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:161830f01fe4451cf2afb08516c24e569c5b229b44b735c51814ae17b5494e10
|
3 |
+
size 5368
|
trial-2/checkpoint-6022/config.json
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "answerdotai/ModernBERT-base",
|
3 |
+
"architectures": [
|
4 |
+
"ModernBertForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_bias": false,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 50281,
|
9 |
+
"classifier_activation": "gelu",
|
10 |
+
"classifier_bias": false,
|
11 |
+
"classifier_dropout": 0.0,
|
12 |
+
"classifier_pooling": "mean",
|
13 |
+
"cls_token_id": 50281,
|
14 |
+
"decoder_bias": true,
|
15 |
+
"deterministic_flash_attn": false,
|
16 |
+
"embedding_dropout": 0.0,
|
17 |
+
"eos_token_id": 50282,
|
18 |
+
"global_attn_every_n_layers": 3,
|
19 |
+
"global_rope_theta": 160000.0,
|
20 |
+
"gradient_checkpointing": false,
|
21 |
+
"hidden_activation": "gelu",
|
22 |
+
"hidden_size": 768,
|
23 |
+
"initializer_cutoff_factor": 2.0,
|
24 |
+
"initializer_range": 0.02,
|
25 |
+
"intermediate_size": 1152,
|
26 |
+
"layer_norm_eps": 1e-05,
|
27 |
+
"local_attention": 128,
|
28 |
+
"local_rope_theta": 10000.0,
|
29 |
+
"max_position_embeddings": 8192,
|
30 |
+
"mlp_bias": false,
|
31 |
+
"mlp_dropout": 0.0,
|
32 |
+
"model_type": "modernbert",
|
33 |
+
"norm_bias": false,
|
34 |
+
"norm_eps": 1e-05,
|
35 |
+
"num_attention_heads": 12,
|
36 |
+
"num_hidden_layers": 22,
|
37 |
+
"pad_token_id": 50283,
|
38 |
+
"position_embedding_type": "absolute",
|
39 |
+
"problem_type": "single_label_classification",
|
40 |
+
"reference_compile": true,
|
41 |
+
"sep_token_id": 50282,
|
42 |
+
"sparse_pred_ignore_index": -100,
|
43 |
+
"sparse_prediction": false,
|
44 |
+
"torch_dtype": "float32",
|
45 |
+
"transformers_version": "4.48.0.dev0",
|
46 |
+
"vocab_size": 50368
|
47 |
+
}
|
trial-2/checkpoint-6022/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:33d8242e8a21a76a0ad8b21949fe7bd68e94de5ce2da543a151336909fcb8e83
|
3 |
+
size 598439784
|
trial-2/checkpoint-6022/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c89405c1def95fb7d1e0ff7deac188ca136134ebd620d1451c9f0d4ed557d77a
|
3 |
+
size 1196967418
|
trial-2/checkpoint-6022/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
|
3 |
+
size 14244
|
trial-2/checkpoint-6022/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:daebe5b6f96508652ee77aa623e80e4943a4ab7b8acffe2720aa77d58c2624f9
|
3 |
+
size 1064
|
trial-2/checkpoint-6022/trainer_state.json
ADDED
@@ -0,0 +1,897 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.031979888677597046,
|
3 |
+
"best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-2/checkpoint-6022",
|
4 |
+
"epoch": 2.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 6022,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.016605778811026237,
|
13 |
+
"grad_norm": 21.788597106933594,
|
14 |
+
"learning_rate": 5.429575351871404e-06,
|
15 |
+
"loss": 0.5789,
|
16 |
+
"step": 50
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.033211557622052475,
|
20 |
+
"grad_norm": 20.038349151611328,
|
21 |
+
"learning_rate": 5.416664391316233e-06,
|
22 |
+
"loss": 0.37,
|
23 |
+
"step": 100
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.04981733643307871,
|
27 |
+
"grad_norm": 23.927526473999023,
|
28 |
+
"learning_rate": 5.403753430761063e-06,
|
29 |
+
"loss": 0.25,
|
30 |
+
"step": 150
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.06642311524410495,
|
34 |
+
"grad_norm": 4.1712799072265625,
|
35 |
+
"learning_rate": 5.390842470205893e-06,
|
36 |
+
"loss": 0.1921,
|
37 |
+
"step": 200
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.08302889405513118,
|
41 |
+
"grad_norm": 6.138601303100586,
|
42 |
+
"learning_rate": 5.3779315096507225e-06,
|
43 |
+
"loss": 0.1365,
|
44 |
+
"step": 250
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.09963467286615742,
|
48 |
+
"grad_norm": 0.9431160092353821,
|
49 |
+
"learning_rate": 5.3650205490955514e-06,
|
50 |
+
"loss": 0.1473,
|
51 |
+
"step": 300
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.11624045167718366,
|
55 |
+
"grad_norm": 25.303245544433594,
|
56 |
+
"learning_rate": 5.352109588540381e-06,
|
57 |
+
"loss": 0.0875,
|
58 |
+
"step": 350
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"epoch": 0.1328462304882099,
|
62 |
+
"grad_norm": 14.83379077911377,
|
63 |
+
"learning_rate": 5.33919862798521e-06,
|
64 |
+
"loss": 0.111,
|
65 |
+
"step": 400
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"epoch": 0.14945200929923613,
|
69 |
+
"grad_norm": 0.2346535325050354,
|
70 |
+
"learning_rate": 5.32628766743004e-06,
|
71 |
+
"loss": 0.0722,
|
72 |
+
"step": 450
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"epoch": 0.16605778811026237,
|
76 |
+
"grad_norm": 19.045169830322266,
|
77 |
+
"learning_rate": 5.31337670687487e-06,
|
78 |
+
"loss": 0.1236,
|
79 |
+
"step": 500
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"epoch": 0.1826635669212886,
|
83 |
+
"grad_norm": 10.871609687805176,
|
84 |
+
"learning_rate": 5.300465746319699e-06,
|
85 |
+
"loss": 0.1018,
|
86 |
+
"step": 550
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"epoch": 0.19926934573231483,
|
90 |
+
"grad_norm": 8.278830528259277,
|
91 |
+
"learning_rate": 5.287554785764528e-06,
|
92 |
+
"loss": 0.0608,
|
93 |
+
"step": 600
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.2158751245433411,
|
97 |
+
"grad_norm": 3.4486818313598633,
|
98 |
+
"learning_rate": 5.274643825209358e-06,
|
99 |
+
"loss": 0.0684,
|
100 |
+
"step": 650
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"epoch": 0.23248090335436733,
|
104 |
+
"grad_norm": 9.789453506469727,
|
105 |
+
"learning_rate": 5.261732864654187e-06,
|
106 |
+
"loss": 0.0826,
|
107 |
+
"step": 700
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"epoch": 0.24908668216539356,
|
111 |
+
"grad_norm": 0.013454285450279713,
|
112 |
+
"learning_rate": 5.248821904099017e-06,
|
113 |
+
"loss": 0.0672,
|
114 |
+
"step": 750
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"epoch": 0.2656924609764198,
|
118 |
+
"grad_norm": 0.8878294825553894,
|
119 |
+
"learning_rate": 5.2359109435438465e-06,
|
120 |
+
"loss": 0.0472,
|
121 |
+
"step": 800
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"epoch": 0.282298239787446,
|
125 |
+
"grad_norm": 15.41006088256836,
|
126 |
+
"learning_rate": 5.222999982988676e-06,
|
127 |
+
"loss": 0.0616,
|
128 |
+
"step": 850
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"epoch": 0.29890401859847227,
|
132 |
+
"grad_norm": 0.04324938729405403,
|
133 |
+
"learning_rate": 5.210089022433506e-06,
|
134 |
+
"loss": 0.0215,
|
135 |
+
"step": 900
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 0.3155097974094985,
|
139 |
+
"grad_norm": 0.011849366128444672,
|
140 |
+
"learning_rate": 5.197178061878335e-06,
|
141 |
+
"loss": 0.0398,
|
142 |
+
"step": 950
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"epoch": 0.33211557622052473,
|
146 |
+
"grad_norm": 0.0020897299982607365,
|
147 |
+
"learning_rate": 5.184267101323165e-06,
|
148 |
+
"loss": 0.0294,
|
149 |
+
"step": 1000
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"epoch": 0.348721355031551,
|
153 |
+
"grad_norm": 0.00038467388367280364,
|
154 |
+
"learning_rate": 5.171356140767994e-06,
|
155 |
+
"loss": 0.0328,
|
156 |
+
"step": 1050
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"epoch": 0.3653271338425772,
|
160 |
+
"grad_norm": 0.0022064056247472763,
|
161 |
+
"learning_rate": 5.158445180212823e-06,
|
162 |
+
"loss": 0.0216,
|
163 |
+
"step": 1100
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"epoch": 0.38193291265360346,
|
167 |
+
"grad_norm": 0.012603014707565308,
|
168 |
+
"learning_rate": 5.145534219657653e-06,
|
169 |
+
"loss": 0.0293,
|
170 |
+
"step": 1150
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"epoch": 0.39853869146462967,
|
174 |
+
"grad_norm": 0.002970542525872588,
|
175 |
+
"learning_rate": 5.132623259102483e-06,
|
176 |
+
"loss": 0.0133,
|
177 |
+
"step": 1200
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"epoch": 0.41514447027565593,
|
181 |
+
"grad_norm": 0.09289965778589249,
|
182 |
+
"learning_rate": 5.119712298547312e-06,
|
183 |
+
"loss": 0.0189,
|
184 |
+
"step": 1250
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"epoch": 0.4317502490866822,
|
188 |
+
"grad_norm": 0.030116688460111618,
|
189 |
+
"learning_rate": 5.106801337992142e-06,
|
190 |
+
"loss": 0.0266,
|
191 |
+
"step": 1300
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"epoch": 0.4483560278977084,
|
195 |
+
"grad_norm": 23.291847229003906,
|
196 |
+
"learning_rate": 5.0938903774369705e-06,
|
197 |
+
"loss": 0.0378,
|
198 |
+
"step": 1350
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"epoch": 0.46496180670873466,
|
202 |
+
"grad_norm": 0.00580954784527421,
|
203 |
+
"learning_rate": 5.0809794168818e-06,
|
204 |
+
"loss": 0.0002,
|
205 |
+
"step": 1400
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"epoch": 0.48156758551976087,
|
209 |
+
"grad_norm": 0.0036250711418688297,
|
210 |
+
"learning_rate": 5.06806845632663e-06,
|
211 |
+
"loss": 0.0297,
|
212 |
+
"step": 1450
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"epoch": 0.4981733643307871,
|
216 |
+
"grad_norm": 0.0013630707981064916,
|
217 |
+
"learning_rate": 5.05515749577146e-06,
|
218 |
+
"loss": 0.0114,
|
219 |
+
"step": 1500
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"epoch": 0.5147791431418134,
|
223 |
+
"grad_norm": 0.025447094812989235,
|
224 |
+
"learning_rate": 5.042246535216289e-06,
|
225 |
+
"loss": 0.0019,
|
226 |
+
"step": 1550
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"epoch": 0.5313849219528396,
|
230 |
+
"grad_norm": 18.81841468811035,
|
231 |
+
"learning_rate": 5.0293355746611185e-06,
|
232 |
+
"loss": 0.0286,
|
233 |
+
"step": 1600
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"epoch": 0.5479907007638658,
|
237 |
+
"grad_norm": 0.0033424277789890766,
|
238 |
+
"learning_rate": 5.016424614105948e-06,
|
239 |
+
"loss": 0.0393,
|
240 |
+
"step": 1650
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"epoch": 0.564596479574892,
|
244 |
+
"grad_norm": 0.039123374968767166,
|
245 |
+
"learning_rate": 5.003513653550777e-06,
|
246 |
+
"loss": 0.0186,
|
247 |
+
"step": 1700
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"epoch": 0.5812022583859183,
|
251 |
+
"grad_norm": 0.0005275913863442838,
|
252 |
+
"learning_rate": 4.990602692995607e-06,
|
253 |
+
"loss": 0.0003,
|
254 |
+
"step": 1750
|
255 |
+
},
|
256 |
+
{
|
257 |
+
"epoch": 0.5978080371969445,
|
258 |
+
"grad_norm": 0.005070064682513475,
|
259 |
+
"learning_rate": 4.977691732440437e-06,
|
260 |
+
"loss": 0.01,
|
261 |
+
"step": 1800
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"epoch": 0.6144138160079707,
|
265 |
+
"grad_norm": 0.003932475112378597,
|
266 |
+
"learning_rate": 4.9647807718852664e-06,
|
267 |
+
"loss": 0.0222,
|
268 |
+
"step": 1850
|
269 |
+
},
|
270 |
+
{
|
271 |
+
"epoch": 0.631019594818997,
|
272 |
+
"grad_norm": 0.6544032692909241,
|
273 |
+
"learning_rate": 4.951869811330095e-06,
|
274 |
+
"loss": 0.0138,
|
275 |
+
"step": 1900
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"epoch": 0.6476253736300233,
|
279 |
+
"grad_norm": 0.008768323808908463,
|
280 |
+
"learning_rate": 4.938958850774925e-06,
|
281 |
+
"loss": 0.0056,
|
282 |
+
"step": 1950
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"epoch": 0.6642311524410495,
|
286 |
+
"grad_norm": 0.0021180976182222366,
|
287 |
+
"learning_rate": 4.926047890219754e-06,
|
288 |
+
"loss": 0.0049,
|
289 |
+
"step": 2000
|
290 |
+
},
|
291 |
+
{
|
292 |
+
"epoch": 0.6808369312520757,
|
293 |
+
"grad_norm": 0.002039346843957901,
|
294 |
+
"learning_rate": 4.913136929664584e-06,
|
295 |
+
"loss": 0.0142,
|
296 |
+
"step": 2050
|
297 |
+
},
|
298 |
+
{
|
299 |
+
"epoch": 0.697442710063102,
|
300 |
+
"grad_norm": 0.012900142930448055,
|
301 |
+
"learning_rate": 4.9002259691094136e-06,
|
302 |
+
"loss": 0.0105,
|
303 |
+
"step": 2100
|
304 |
+
},
|
305 |
+
{
|
306 |
+
"epoch": 0.7140484888741282,
|
307 |
+
"grad_norm": 0.0022153747268021107,
|
308 |
+
"learning_rate": 4.887315008554243e-06,
|
309 |
+
"loss": 0.0142,
|
310 |
+
"step": 2150
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"epoch": 0.7306542676851544,
|
314 |
+
"grad_norm": 0.001426122267730534,
|
315 |
+
"learning_rate": 4.874404047999072e-06,
|
316 |
+
"loss": 0.0068,
|
317 |
+
"step": 2200
|
318 |
+
},
|
319 |
+
{
|
320 |
+
"epoch": 0.7472600464961807,
|
321 |
+
"grad_norm": 0.0008603449095971882,
|
322 |
+
"learning_rate": 4.861493087443902e-06,
|
323 |
+
"loss": 0.0119,
|
324 |
+
"step": 2250
|
325 |
+
},
|
326 |
+
{
|
327 |
+
"epoch": 0.7638658253072069,
|
328 |
+
"grad_norm": 0.0006780526018701494,
|
329 |
+
"learning_rate": 4.848582126888731e-06,
|
330 |
+
"loss": 0.0108,
|
331 |
+
"step": 2300
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"epoch": 0.7804716041182331,
|
335 |
+
"grad_norm": 0.014527379535138607,
|
336 |
+
"learning_rate": 4.835671166333561e-06,
|
337 |
+
"loss": 0.0002,
|
338 |
+
"step": 2350
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"epoch": 0.7970773829292593,
|
342 |
+
"grad_norm": 0.00022624376288149506,
|
343 |
+
"learning_rate": 4.8227602057783904e-06,
|
344 |
+
"loss": 0.0092,
|
345 |
+
"step": 2400
|
346 |
+
},
|
347 |
+
{
|
348 |
+
"epoch": 0.8136831617402857,
|
349 |
+
"grad_norm": 0.0044932495802640915,
|
350 |
+
"learning_rate": 4.80984924522322e-06,
|
351 |
+
"loss": 0.0001,
|
352 |
+
"step": 2450
|
353 |
+
},
|
354 |
+
{
|
355 |
+
"epoch": 0.8302889405513119,
|
356 |
+
"grad_norm": 0.0009355309884995222,
|
357 |
+
"learning_rate": 4.79693828466805e-06,
|
358 |
+
"loss": 0.0002,
|
359 |
+
"step": 2500
|
360 |
+
},
|
361 |
+
{
|
362 |
+
"epoch": 0.8468947193623381,
|
363 |
+
"grad_norm": 0.12550997734069824,
|
364 |
+
"learning_rate": 4.784027324112879e-06,
|
365 |
+
"loss": 0.0024,
|
366 |
+
"step": 2550
|
367 |
+
},
|
368 |
+
{
|
369 |
+
"epoch": 0.8635004981733644,
|
370 |
+
"grad_norm": 0.02399071305990219,
|
371 |
+
"learning_rate": 4.771116363557709e-06,
|
372 |
+
"loss": 0.0099,
|
373 |
+
"step": 2600
|
374 |
+
},
|
375 |
+
{
|
376 |
+
"epoch": 0.8801062769843906,
|
377 |
+
"grad_norm": 0.008470265194773674,
|
378 |
+
"learning_rate": 4.7582054030025375e-06,
|
379 |
+
"loss": 0.0157,
|
380 |
+
"step": 2650
|
381 |
+
},
|
382 |
+
{
|
383 |
+
"epoch": 0.8967120557954168,
|
384 |
+
"grad_norm": 3.967735028709285e-05,
|
385 |
+
"learning_rate": 4.745294442447367e-06,
|
386 |
+
"loss": 0.0013,
|
387 |
+
"step": 2700
|
388 |
+
},
|
389 |
+
{
|
390 |
+
"epoch": 0.913317834606443,
|
391 |
+
"grad_norm": 0.0005532742943614721,
|
392 |
+
"learning_rate": 4.732383481892197e-06,
|
393 |
+
"loss": 0.0025,
|
394 |
+
"step": 2750
|
395 |
+
},
|
396 |
+
{
|
397 |
+
"epoch": 0.9299236134174693,
|
398 |
+
"grad_norm": 9.227233022102155e-06,
|
399 |
+
"learning_rate": 4.719472521337027e-06,
|
400 |
+
"loss": 0.0028,
|
401 |
+
"step": 2800
|
402 |
+
},
|
403 |
+
{
|
404 |
+
"epoch": 0.9465293922284955,
|
405 |
+
"grad_norm": 0.280258446931839,
|
406 |
+
"learning_rate": 4.706561560781856e-06,
|
407 |
+
"loss": 0.0004,
|
408 |
+
"step": 2850
|
409 |
+
},
|
410 |
+
{
|
411 |
+
"epoch": 0.9631351710395217,
|
412 |
+
"grad_norm": 27.427757263183594,
|
413 |
+
"learning_rate": 4.6936506002266855e-06,
|
414 |
+
"loss": 0.0127,
|
415 |
+
"step": 2900
|
416 |
+
},
|
417 |
+
{
|
418 |
+
"epoch": 0.9797409498505479,
|
419 |
+
"grad_norm": 176.85423278808594,
|
420 |
+
"learning_rate": 4.680739639671514e-06,
|
421 |
+
"loss": 0.0298,
|
422 |
+
"step": 2950
|
423 |
+
},
|
424 |
+
{
|
425 |
+
"epoch": 0.9963467286615743,
|
426 |
+
"grad_norm": 0.00011263355554547161,
|
427 |
+
"learning_rate": 4.667828679116344e-06,
|
428 |
+
"loss": 0.001,
|
429 |
+
"step": 3000
|
430 |
+
},
|
431 |
+
{
|
432 |
+
"epoch": 1.0,
|
433 |
+
"eval_accuracy": 0.9963024809160306,
|
434 |
+
"eval_f1": 0.9962431632227496,
|
435 |
+
"eval_loss": 0.04071500524878502,
|
436 |
+
"eval_precision": 0.9962693439313673,
|
437 |
+
"eval_recall": 0.9963024809160306,
|
438 |
+
"eval_runtime": 38.0003,
|
439 |
+
"eval_samples_per_second": 220.63,
|
440 |
+
"eval_steps_per_second": 13.789,
|
441 |
+
"step": 3011
|
442 |
+
},
|
443 |
+
{
|
444 |
+
"epoch": 1.0129525074726005,
|
445 |
+
"grad_norm": 0.05092976614832878,
|
446 |
+
"learning_rate": 4.654917718561174e-06,
|
447 |
+
"loss": 0.018,
|
448 |
+
"step": 3050
|
449 |
+
},
|
450 |
+
{
|
451 |
+
"epoch": 1.0295582862836268,
|
452 |
+
"grad_norm": 3.4633874747669324e-05,
|
453 |
+
"learning_rate": 4.642006758006004e-06,
|
454 |
+
"loss": 0.0,
|
455 |
+
"step": 3100
|
456 |
+
},
|
457 |
+
{
|
458 |
+
"epoch": 1.0461640650946529,
|
459 |
+
"grad_norm": 8.058391540544108e-05,
|
460 |
+
"learning_rate": 4.629095797450833e-06,
|
461 |
+
"loss": 0.0,
|
462 |
+
"step": 3150
|
463 |
+
},
|
464 |
+
{
|
465 |
+
"epoch": 1.0627698439056792,
|
466 |
+
"grad_norm": 0.00043129033292643726,
|
467 |
+
"learning_rate": 4.616184836895662e-06,
|
468 |
+
"loss": 0.0,
|
469 |
+
"step": 3200
|
470 |
+
},
|
471 |
+
{
|
472 |
+
"epoch": 1.0793756227167055,
|
473 |
+
"grad_norm": 0.012417804449796677,
|
474 |
+
"learning_rate": 4.603273876340492e-06,
|
475 |
+
"loss": 0.0204,
|
476 |
+
"step": 3250
|
477 |
+
},
|
478 |
+
{
|
479 |
+
"epoch": 1.0959814015277316,
|
480 |
+
"grad_norm": 0.07707448303699493,
|
481 |
+
"learning_rate": 4.590362915785321e-06,
|
482 |
+
"loss": 0.0089,
|
483 |
+
"step": 3300
|
484 |
+
},
|
485 |
+
{
|
486 |
+
"epoch": 1.112587180338758,
|
487 |
+
"grad_norm": 0.0019856118597090244,
|
488 |
+
"learning_rate": 4.577451955230151e-06,
|
489 |
+
"loss": 0.0003,
|
490 |
+
"step": 3350
|
491 |
+
},
|
492 |
+
{
|
493 |
+
"epoch": 1.1291929591497842,
|
494 |
+
"grad_norm": 0.0003844090970233083,
|
495 |
+
"learning_rate": 4.564540994674981e-06,
|
496 |
+
"loss": 0.0,
|
497 |
+
"step": 3400
|
498 |
+
},
|
499 |
+
{
|
500 |
+
"epoch": 1.1457987379608103,
|
501 |
+
"grad_norm": 0.004796341527253389,
|
502 |
+
"learning_rate": 4.55163003411981e-06,
|
503 |
+
"loss": 0.0054,
|
504 |
+
"step": 3450
|
505 |
+
},
|
506 |
+
{
|
507 |
+
"epoch": 1.1624045167718366,
|
508 |
+
"grad_norm": 0.0021394495852291584,
|
509 |
+
"learning_rate": 4.538719073564639e-06,
|
510 |
+
"loss": 0.0001,
|
511 |
+
"step": 3500
|
512 |
+
},
|
513 |
+
{
|
514 |
+
"epoch": 1.1790102955828627,
|
515 |
+
"grad_norm": 0.00016287445032503456,
|
516 |
+
"learning_rate": 4.525808113009469e-06,
|
517 |
+
"loss": 0.0017,
|
518 |
+
"step": 3550
|
519 |
+
},
|
520 |
+
{
|
521 |
+
"epoch": 1.195616074393889,
|
522 |
+
"grad_norm": 0.005753168836236,
|
523 |
+
"learning_rate": 4.512897152454298e-06,
|
524 |
+
"loss": 0.0132,
|
525 |
+
"step": 3600
|
526 |
+
},
|
527 |
+
{
|
528 |
+
"epoch": 1.2122218532049154,
|
529 |
+
"grad_norm": 0.00012519631127361208,
|
530 |
+
"learning_rate": 4.499986191899128e-06,
|
531 |
+
"loss": 0.0,
|
532 |
+
"step": 3650
|
533 |
+
},
|
534 |
+
{
|
535 |
+
"epoch": 1.2288276320159415,
|
536 |
+
"grad_norm": 0.0009526669164188206,
|
537 |
+
"learning_rate": 4.487075231343957e-06,
|
538 |
+
"loss": 0.0083,
|
539 |
+
"step": 3700
|
540 |
+
},
|
541 |
+
{
|
542 |
+
"epoch": 1.2454334108269678,
|
543 |
+
"grad_norm": 6.90124070388265e-05,
|
544 |
+
"learning_rate": 4.474164270788787e-06,
|
545 |
+
"loss": 0.0114,
|
546 |
+
"step": 3750
|
547 |
+
},
|
548 |
+
{
|
549 |
+
"epoch": 1.2620391896379939,
|
550 |
+
"grad_norm": 0.0029422417283058167,
|
551 |
+
"learning_rate": 4.461253310233616e-06,
|
552 |
+
"loss": 0.0001,
|
553 |
+
"step": 3800
|
554 |
+
},
|
555 |
+
{
|
556 |
+
"epoch": 1.2786449684490202,
|
557 |
+
"grad_norm": 1.6564589738845825,
|
558 |
+
"learning_rate": 4.448342349678446e-06,
|
559 |
+
"loss": 0.0065,
|
560 |
+
"step": 3850
|
561 |
+
},
|
562 |
+
{
|
563 |
+
"epoch": 1.2952507472600465,
|
564 |
+
"grad_norm": 4.6906425268389285e-05,
|
565 |
+
"learning_rate": 4.435431389123275e-06,
|
566 |
+
"loss": 0.0,
|
567 |
+
"step": 3900
|
568 |
+
},
|
569 |
+
{
|
570 |
+
"epoch": 1.3118565260710726,
|
571 |
+
"grad_norm": 1.4456440112553537e-05,
|
572 |
+
"learning_rate": 4.4225204285681046e-06,
|
573 |
+
"loss": 0.0,
|
574 |
+
"step": 3950
|
575 |
+
},
|
576 |
+
{
|
577 |
+
"epoch": 1.328462304882099,
|
578 |
+
"grad_norm": 4.6707005822099745e-05,
|
579 |
+
"learning_rate": 4.409609468012934e-06,
|
580 |
+
"loss": 0.0227,
|
581 |
+
"step": 4000
|
582 |
+
},
|
583 |
+
{
|
584 |
+
"epoch": 1.3450680836931252,
|
585 |
+
"grad_norm": 4.7155015636235476e-05,
|
586 |
+
"learning_rate": 4.396698507457763e-06,
|
587 |
+
"loss": 0.0002,
|
588 |
+
"step": 4050
|
589 |
+
},
|
590 |
+
{
|
591 |
+
"epoch": 1.3616738625041513,
|
592 |
+
"grad_norm": 0.01696430891752243,
|
593 |
+
"learning_rate": 4.383787546902593e-06,
|
594 |
+
"loss": 0.0188,
|
595 |
+
"step": 4100
|
596 |
+
},
|
597 |
+
{
|
598 |
+
"epoch": 1.3782796413151777,
|
599 |
+
"grad_norm": 0.0008329456904903054,
|
600 |
+
"learning_rate": 4.370876586347423e-06,
|
601 |
+
"loss": 0.0178,
|
602 |
+
"step": 4150
|
603 |
+
},
|
604 |
+
{
|
605 |
+
"epoch": 1.394885420126204,
|
606 |
+
"grad_norm": 9.179511835100129e-05,
|
607 |
+
"learning_rate": 4.3579656257922525e-06,
|
608 |
+
"loss": 0.0,
|
609 |
+
"step": 4200
|
610 |
+
},
|
611 |
+
{
|
612 |
+
"epoch": 1.41149119893723,
|
613 |
+
"grad_norm": 2.924172622442711e-05,
|
614 |
+
"learning_rate": 4.3450546652370814e-06,
|
615 |
+
"loss": 0.0013,
|
616 |
+
"step": 4250
|
617 |
+
},
|
618 |
+
{
|
619 |
+
"epoch": 1.4280969777482564,
|
620 |
+
"grad_norm": 0.015076125971972942,
|
621 |
+
"learning_rate": 4.332143704681911e-06,
|
622 |
+
"loss": 0.0104,
|
623 |
+
"step": 4300
|
624 |
+
},
|
625 |
+
{
|
626 |
+
"epoch": 1.4447027565592827,
|
627 |
+
"grad_norm": 5.385762415244244e-05,
|
628 |
+
"learning_rate": 4.31923274412674e-06,
|
629 |
+
"loss": 0.014,
|
630 |
+
"step": 4350
|
631 |
+
},
|
632 |
+
{
|
633 |
+
"epoch": 1.4613085353703088,
|
634 |
+
"grad_norm": 0.0007110639126040041,
|
635 |
+
"learning_rate": 4.30632178357157e-06,
|
636 |
+
"loss": 0.0126,
|
637 |
+
"step": 4400
|
638 |
+
},
|
639 |
+
{
|
640 |
+
"epoch": 1.4779143141813351,
|
641 |
+
"grad_norm": 0.00014339391782414168,
|
642 |
+
"learning_rate": 4.2934108230164e-06,
|
643 |
+
"loss": 0.0003,
|
644 |
+
"step": 4450
|
645 |
+
},
|
646 |
+
{
|
647 |
+
"epoch": 1.4945200929923614,
|
648 |
+
"grad_norm": 0.0006024091853760183,
|
649 |
+
"learning_rate": 4.280499862461229e-06,
|
650 |
+
"loss": 0.0118,
|
651 |
+
"step": 4500
|
652 |
+
},
|
653 |
+
{
|
654 |
+
"epoch": 1.5111258718033875,
|
655 |
+
"grad_norm": 0.0002353072923142463,
|
656 |
+
"learning_rate": 4.267588901906058e-06,
|
657 |
+
"loss": 0.0086,
|
658 |
+
"step": 4550
|
659 |
+
},
|
660 |
+
{
|
661 |
+
"epoch": 1.5277316506144138,
|
662 |
+
"grad_norm": 0.0008946498855948448,
|
663 |
+
"learning_rate": 4.254677941350888e-06,
|
664 |
+
"loss": 0.0,
|
665 |
+
"step": 4600
|
666 |
+
},
|
667 |
+
{
|
668 |
+
"epoch": 1.5443374294254402,
|
669 |
+
"grad_norm": 7.315174298128113e-05,
|
670 |
+
"learning_rate": 4.241766980795717e-06,
|
671 |
+
"loss": 0.0003,
|
672 |
+
"step": 4650
|
673 |
+
},
|
674 |
+
{
|
675 |
+
"epoch": 1.5609432082364663,
|
676 |
+
"grad_norm": 9.232313459506258e-05,
|
677 |
+
"learning_rate": 4.228856020240547e-06,
|
678 |
+
"loss": 0.0001,
|
679 |
+
"step": 4700
|
680 |
+
},
|
681 |
+
{
|
682 |
+
"epoch": 1.5775489870474926,
|
683 |
+
"grad_norm": 1.4020029084349517e-05,
|
684 |
+
"learning_rate": 4.2159450596853765e-06,
|
685 |
+
"loss": 0.0,
|
686 |
+
"step": 4750
|
687 |
+
},
|
688 |
+
{
|
689 |
+
"epoch": 1.594154765858519,
|
690 |
+
"grad_norm": 4.0607475966680795e-05,
|
691 |
+
"learning_rate": 4.203034099130206e-06,
|
692 |
+
"loss": 0.0,
|
693 |
+
"step": 4800
|
694 |
+
},
|
695 |
+
{
|
696 |
+
"epoch": 1.610760544669545,
|
697 |
+
"grad_norm": 4.69290571345482e-05,
|
698 |
+
"learning_rate": 4.190123138575036e-06,
|
699 |
+
"loss": 0.0177,
|
700 |
+
"step": 4850
|
701 |
+
},
|
702 |
+
{
|
703 |
+
"epoch": 1.627366323480571,
|
704 |
+
"grad_norm": 0.14096687734127045,
|
705 |
+
"learning_rate": 4.177212178019865e-06,
|
706 |
+
"loss": 0.0115,
|
707 |
+
"step": 4900
|
708 |
+
},
|
709 |
+
{
|
710 |
+
"epoch": 1.6439721022915976,
|
711 |
+
"grad_norm": 0.00020342542848084122,
|
712 |
+
"learning_rate": 4.164301217464695e-06,
|
713 |
+
"loss": 0.0001,
|
714 |
+
"step": 4950
|
715 |
+
},
|
716 |
+
{
|
717 |
+
"epoch": 1.6605778811026237,
|
718 |
+
"grad_norm": 0.0002786288969218731,
|
719 |
+
"learning_rate": 4.151390256909524e-06,
|
720 |
+
"loss": 0.0,
|
721 |
+
"step": 5000
|
722 |
+
},
|
723 |
+
{
|
724 |
+
"epoch": 1.6771836599136498,
|
725 |
+
"grad_norm": 2.8438846129574813e-05,
|
726 |
+
"learning_rate": 4.138479296354353e-06,
|
727 |
+
"loss": 0.0032,
|
728 |
+
"step": 5050
|
729 |
+
},
|
730 |
+
{
|
731 |
+
"epoch": 1.6937894387246761,
|
732 |
+
"grad_norm": 5.944320037087891e-06,
|
733 |
+
"learning_rate": 4.125568335799183e-06,
|
734 |
+
"loss": 0.0001,
|
735 |
+
"step": 5100
|
736 |
+
},
|
737 |
+
{
|
738 |
+
"epoch": 1.7103952175357025,
|
739 |
+
"grad_norm": 0.005958211608231068,
|
740 |
+
"learning_rate": 4.112657375244013e-06,
|
741 |
+
"loss": 0.0,
|
742 |
+
"step": 5150
|
743 |
+
},
|
744 |
+
{
|
745 |
+
"epoch": 1.7270009963467285,
|
746 |
+
"grad_norm": 0.002004456939175725,
|
747 |
+
"learning_rate": 4.099746414688842e-06,
|
748 |
+
"loss": 0.0106,
|
749 |
+
"step": 5200
|
750 |
+
},
|
751 |
+
{
|
752 |
+
"epoch": 1.7436067751577549,
|
753 |
+
"grad_norm": 0.0008562383009120822,
|
754 |
+
"learning_rate": 4.086835454133672e-06,
|
755 |
+
"loss": 0.0081,
|
756 |
+
"step": 5250
|
757 |
+
},
|
758 |
+
{
|
759 |
+
"epoch": 1.7602125539687812,
|
760 |
+
"grad_norm": 0.03570560738444328,
|
761 |
+
"learning_rate": 4.0739244935785005e-06,
|
762 |
+
"loss": 0.025,
|
763 |
+
"step": 5300
|
764 |
+
},
|
765 |
+
{
|
766 |
+
"epoch": 1.7768183327798073,
|
767 |
+
"grad_norm": 0.001486024702899158,
|
768 |
+
"learning_rate": 4.06101353302333e-06,
|
769 |
+
"loss": 0.0145,
|
770 |
+
"step": 5350
|
771 |
+
},
|
772 |
+
{
|
773 |
+
"epoch": 1.7934241115908336,
|
774 |
+
"grad_norm": 0.0015331929316744208,
|
775 |
+
"learning_rate": 4.04810257246816e-06,
|
776 |
+
"loss": 0.0001,
|
777 |
+
"step": 5400
|
778 |
+
},
|
779 |
+
{
|
780 |
+
"epoch": 1.81002989040186,
|
781 |
+
"grad_norm": 0.004162834957242012,
|
782 |
+
"learning_rate": 4.03519161191299e-06,
|
783 |
+
"loss": 0.0005,
|
784 |
+
"step": 5450
|
785 |
+
},
|
786 |
+
{
|
787 |
+
"epoch": 1.826635669212886,
|
788 |
+
"grad_norm": 0.0003064811462536454,
|
789 |
+
"learning_rate": 4.022280651357819e-06,
|
790 |
+
"loss": 0.0,
|
791 |
+
"step": 5500
|
792 |
+
},
|
793 |
+
{
|
794 |
+
"epoch": 1.8432414480239123,
|
795 |
+
"grad_norm": 0.000830256671179086,
|
796 |
+
"learning_rate": 4.0093696908026485e-06,
|
797 |
+
"loss": 0.0034,
|
798 |
+
"step": 5550
|
799 |
+
},
|
800 |
+
{
|
801 |
+
"epoch": 1.8598472268349386,
|
802 |
+
"grad_norm": 0.001540405093692243,
|
803 |
+
"learning_rate": 3.996458730247478e-06,
|
804 |
+
"loss": 0.0,
|
805 |
+
"step": 5600
|
806 |
+
},
|
807 |
+
{
|
808 |
+
"epoch": 1.8764530056459647,
|
809 |
+
"grad_norm": 0.011221639811992645,
|
810 |
+
"learning_rate": 3.983547769692307e-06,
|
811 |
+
"loss": 0.0116,
|
812 |
+
"step": 5650
|
813 |
+
},
|
814 |
+
{
|
815 |
+
"epoch": 1.893058784456991,
|
816 |
+
"grad_norm": 0.0031693174969404936,
|
817 |
+
"learning_rate": 3.970636809137137e-06,
|
818 |
+
"loss": 0.0061,
|
819 |
+
"step": 5700
|
820 |
+
},
|
821 |
+
{
|
822 |
+
"epoch": 1.9096645632680174,
|
823 |
+
"grad_norm": 7.828649540897459e-05,
|
824 |
+
"learning_rate": 3.957725848581967e-06,
|
825 |
+
"loss": 0.0,
|
826 |
+
"step": 5750
|
827 |
+
},
|
828 |
+
{
|
829 |
+
"epoch": 1.9262703420790435,
|
830 |
+
"grad_norm": 0.00892726145684719,
|
831 |
+
"learning_rate": 3.9448148880267964e-06,
|
832 |
+
"loss": 0.0003,
|
833 |
+
"step": 5800
|
834 |
+
},
|
835 |
+
{
|
836 |
+
"epoch": 1.9428761208900698,
|
837 |
+
"grad_norm": 0.0033830904867500067,
|
838 |
+
"learning_rate": 3.931903927471625e-06,
|
839 |
+
"loss": 0.0007,
|
840 |
+
"step": 5850
|
841 |
+
},
|
842 |
+
{
|
843 |
+
"epoch": 1.959481899701096,
|
844 |
+
"grad_norm": 0.017441514879465103,
|
845 |
+
"learning_rate": 3.918992966916455e-06,
|
846 |
+
"loss": 0.0109,
|
847 |
+
"step": 5900
|
848 |
+
},
|
849 |
+
{
|
850 |
+
"epoch": 1.9760876785121222,
|
851 |
+
"grad_norm": 0.006790176033973694,
|
852 |
+
"learning_rate": 3.906082006361284e-06,
|
853 |
+
"loss": 0.0101,
|
854 |
+
"step": 5950
|
855 |
+
},
|
856 |
+
{
|
857 |
+
"epoch": 1.9926934573231485,
|
858 |
+
"grad_norm": 0.0004248483164701611,
|
859 |
+
"learning_rate": 3.893171045806114e-06,
|
860 |
+
"loss": 0.0103,
|
861 |
+
"step": 6000
|
862 |
+
},
|
863 |
+
{
|
864 |
+
"epoch": 2.0,
|
865 |
+
"eval_accuracy": 0.9959446564885496,
|
866 |
+
"eval_f1": 0.9958827988724177,
|
867 |
+
"eval_loss": 0.031979888677597046,
|
868 |
+
"eval_precision": 0.9958978797187497,
|
869 |
+
"eval_recall": 0.9959446564885496,
|
870 |
+
"eval_runtime": 37.4063,
|
871 |
+
"eval_samples_per_second": 224.134,
|
872 |
+
"eval_steps_per_second": 14.008,
|
873 |
+
"step": 6022
|
874 |
+
}
|
875 |
+
],
|
876 |
+
"logging_steps": 50,
|
877 |
+
"max_steps": 21077,
|
878 |
+
"num_input_tokens_seen": 0,
|
879 |
+
"num_train_epochs": 7,
|
880 |
+
"save_steps": 500,
|
881 |
+
"stateful_callbacks": {
|
882 |
+
"TrainerControl": {
|
883 |
+
"args": {
|
884 |
+
"should_epoch_stop": false,
|
885 |
+
"should_evaluate": false,
|
886 |
+
"should_log": false,
|
887 |
+
"should_save": true,
|
888 |
+
"should_training_stop": false
|
889 |
+
},
|
890 |
+
"attributes": {}
|
891 |
+
}
|
892 |
+
},
|
893 |
+
"total_flos": 3.282861088518144e+16,
|
894 |
+
"train_batch_size": 16,
|
895 |
+
"trial_name": null,
|
896 |
+
"trial_params": null
|
897 |
+
}
|
trial-2/checkpoint-6022/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9657a8731817c986f017540c64090098467c35e79328bfa7cab093c33da6a8e9
|
3 |
+
size 5368
|
trial-3/checkpoint-1506/config.json
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "answerdotai/ModernBERT-base",
|
3 |
+
"architectures": [
|
4 |
+
"ModernBertForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_bias": false,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 50281,
|
9 |
+
"classifier_activation": "gelu",
|
10 |
+
"classifier_bias": false,
|
11 |
+
"classifier_dropout": 0.0,
|
12 |
+
"classifier_pooling": "mean",
|
13 |
+
"cls_token_id": 50281,
|
14 |
+
"decoder_bias": true,
|
15 |
+
"deterministic_flash_attn": false,
|
16 |
+
"embedding_dropout": 0.0,
|
17 |
+
"eos_token_id": 50282,
|
18 |
+
"global_attn_every_n_layers": 3,
|
19 |
+
"global_rope_theta": 160000.0,
|
20 |
+
"gradient_checkpointing": false,
|
21 |
+
"hidden_activation": "gelu",
|
22 |
+
"hidden_size": 768,
|
23 |
+
"initializer_cutoff_factor": 2.0,
|
24 |
+
"initializer_range": 0.02,
|
25 |
+
"intermediate_size": 1152,
|
26 |
+
"layer_norm_eps": 1e-05,
|
27 |
+
"local_attention": 128,
|
28 |
+
"local_rope_theta": 10000.0,
|
29 |
+
"max_position_embeddings": 8192,
|
30 |
+
"mlp_bias": false,
|
31 |
+
"mlp_dropout": 0.0,
|
32 |
+
"model_type": "modernbert",
|
33 |
+
"norm_bias": false,
|
34 |
+
"norm_eps": 1e-05,
|
35 |
+
"num_attention_heads": 12,
|
36 |
+
"num_hidden_layers": 22,
|
37 |
+
"pad_token_id": 50283,
|
38 |
+
"position_embedding_type": "absolute",
|
39 |
+
"problem_type": "single_label_classification",
|
40 |
+
"reference_compile": true,
|
41 |
+
"sep_token_id": 50282,
|
42 |
+
"sparse_pred_ignore_index": -100,
|
43 |
+
"sparse_prediction": false,
|
44 |
+
"torch_dtype": "float32",
|
45 |
+
"transformers_version": "4.48.0.dev0",
|
46 |
+
"vocab_size": 50368
|
47 |
+
}
|
trial-3/checkpoint-1506/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:577af3b8b0a6d7db7f2ff1054a5c4c43704103dd0ed797800f9d9582a3237033
|
3 |
+
size 598439784
|
trial-3/checkpoint-1506/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:309810681fe0458054a9e76c6bfbb6fc2862ae83f89b084906874442e8913f57
|
3 |
+
size 1196967418
|
trial-3/checkpoint-1506/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:568428d80a25211a390c359ca51b0b20b38ca0607fbc196f106c9841c02d3e59
|
3 |
+
size 14244
|
trial-3/checkpoint-1506/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:77511df67542c270c7a8ed9a3ae9f0a88d6822756582e31cb89e7ee9b503abfb
|
3 |
+
size 1064
|
trial-3/checkpoint-1506/trainer_state.json
ADDED
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.03509189188480377,
|
3 |
+
"best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-3/checkpoint-1506",
|
4 |
+
"epoch": 1.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 1506,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.033200531208499334,
|
13 |
+
"grad_norm": 6.976862907409668,
|
14 |
+
"learning_rate": 2.8972663455552343e-06,
|
15 |
+
"loss": 0.5378,
|
16 |
+
"step": 50
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.06640106241699867,
|
20 |
+
"grad_norm": 3.674832344055176,
|
21 |
+
"learning_rate": 2.8648439379281615e-06,
|
22 |
+
"loss": 0.3375,
|
23 |
+
"step": 100
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.099601593625498,
|
27 |
+
"grad_norm": 2.678229570388794,
|
28 |
+
"learning_rate": 2.8324215303010886e-06,
|
29 |
+
"loss": 0.2213,
|
30 |
+
"step": 150
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.13280212483399734,
|
34 |
+
"grad_norm": 6.4370551109313965,
|
35 |
+
"learning_rate": 2.7999991226740153e-06,
|
36 |
+
"loss": 0.1558,
|
37 |
+
"step": 200
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.16600265604249667,
|
41 |
+
"grad_norm": 6.4544525146484375,
|
42 |
+
"learning_rate": 2.767576715046943e-06,
|
43 |
+
"loss": 0.1457,
|
44 |
+
"step": 250
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.199203187250996,
|
48 |
+
"grad_norm": 2.4753177165985107,
|
49 |
+
"learning_rate": 2.7351543074198696e-06,
|
50 |
+
"loss": 0.1349,
|
51 |
+
"step": 300
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.23240371845949534,
|
55 |
+
"grad_norm": 3.116945743560791,
|
56 |
+
"learning_rate": 2.7027318997927968e-06,
|
57 |
+
"loss": 0.1144,
|
58 |
+
"step": 350
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"epoch": 0.2656042496679947,
|
62 |
+
"grad_norm": 10.000889778137207,
|
63 |
+
"learning_rate": 2.670309492165724e-06,
|
64 |
+
"loss": 0.0942,
|
65 |
+
"step": 400
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"epoch": 0.29880478087649404,
|
69 |
+
"grad_norm": 0.3915446996688843,
|
70 |
+
"learning_rate": 2.637887084538651e-06,
|
71 |
+
"loss": 0.0841,
|
72 |
+
"step": 450
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"epoch": 0.33200531208499334,
|
76 |
+
"grad_norm": 0.7093335390090942,
|
77 |
+
"learning_rate": 2.605464676911578e-06,
|
78 |
+
"loss": 0.0815,
|
79 |
+
"step": 500
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"epoch": 0.3652058432934927,
|
83 |
+
"grad_norm": 5.660763263702393,
|
84 |
+
"learning_rate": 2.5730422692845053e-06,
|
85 |
+
"loss": 0.058,
|
86 |
+
"step": 550
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"epoch": 0.398406374501992,
|
90 |
+
"grad_norm": 9.372917175292969,
|
91 |
+
"learning_rate": 2.5406198616574325e-06,
|
92 |
+
"loss": 0.0521,
|
93 |
+
"step": 600
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.4316069057104914,
|
97 |
+
"grad_norm": 6.086747169494629,
|
98 |
+
"learning_rate": 2.5081974540303596e-06,
|
99 |
+
"loss": 0.0671,
|
100 |
+
"step": 650
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"epoch": 0.4648074369189907,
|
104 |
+
"grad_norm": 5.661391735076904,
|
105 |
+
"learning_rate": 2.4757750464032863e-06,
|
106 |
+
"loss": 0.0354,
|
107 |
+
"step": 700
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"epoch": 0.49800796812749004,
|
111 |
+
"grad_norm": 1.4707638025283813,
|
112 |
+
"learning_rate": 2.443352638776214e-06,
|
113 |
+
"loss": 0.0386,
|
114 |
+
"step": 750
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"epoch": 0.5312084993359893,
|
118 |
+
"grad_norm": 7.550576686859131,
|
119 |
+
"learning_rate": 2.4109302311491406e-06,
|
120 |
+
"loss": 0.0363,
|
121 |
+
"step": 800
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"epoch": 0.5644090305444888,
|
125 |
+
"grad_norm": 11.072442054748535,
|
126 |
+
"learning_rate": 2.3785078235220678e-06,
|
127 |
+
"loss": 0.0254,
|
128 |
+
"step": 850
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"epoch": 0.5976095617529881,
|
132 |
+
"grad_norm": 0.3040500581264496,
|
133 |
+
"learning_rate": 2.346085415894995e-06,
|
134 |
+
"loss": 0.018,
|
135 |
+
"step": 900
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 0.6308100929614874,
|
139 |
+
"grad_norm": 11.503410339355469,
|
140 |
+
"learning_rate": 2.313663008267922e-06,
|
141 |
+
"loss": 0.0302,
|
142 |
+
"step": 950
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"epoch": 0.6640106241699867,
|
146 |
+
"grad_norm": 0.7599239945411682,
|
147 |
+
"learning_rate": 2.281240600640849e-06,
|
148 |
+
"loss": 0.0267,
|
149 |
+
"step": 1000
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"epoch": 0.6972111553784861,
|
153 |
+
"grad_norm": 0.21025581657886505,
|
154 |
+
"learning_rate": 2.2488181930137764e-06,
|
155 |
+
"loss": 0.0211,
|
156 |
+
"step": 1050
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"epoch": 0.7304116865869854,
|
160 |
+
"grad_norm": 11.052717208862305,
|
161 |
+
"learning_rate": 2.2163957853867035e-06,
|
162 |
+
"loss": 0.0112,
|
163 |
+
"step": 1100
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"epoch": 0.7636122177954847,
|
167 |
+
"grad_norm": 0.0778539627790451,
|
168 |
+
"learning_rate": 2.1839733777596302e-06,
|
169 |
+
"loss": 0.0212,
|
170 |
+
"step": 1150
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"epoch": 0.796812749003984,
|
174 |
+
"grad_norm": 0.050592467188835144,
|
175 |
+
"learning_rate": 2.151550970132558e-06,
|
176 |
+
"loss": 0.0082,
|
177 |
+
"step": 1200
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"epoch": 0.8300132802124834,
|
181 |
+
"grad_norm": 0.04680703952908516,
|
182 |
+
"learning_rate": 2.1191285625054845e-06,
|
183 |
+
"loss": 0.008,
|
184 |
+
"step": 1250
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"epoch": 0.8632138114209827,
|
188 |
+
"grad_norm": 127.69743347167969,
|
189 |
+
"learning_rate": 2.0867061548784117e-06,
|
190 |
+
"loss": 0.0192,
|
191 |
+
"step": 1300
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"epoch": 0.896414342629482,
|
195 |
+
"grad_norm": 0.013791153207421303,
|
196 |
+
"learning_rate": 2.0542837472513392e-06,
|
197 |
+
"loss": 0.0063,
|
198 |
+
"step": 1350
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"epoch": 0.9296148738379814,
|
202 |
+
"grad_norm": 0.011688283644616604,
|
203 |
+
"learning_rate": 2.021861339624266e-06,
|
204 |
+
"loss": 0.0068,
|
205 |
+
"step": 1400
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"epoch": 0.9628154050464808,
|
209 |
+
"grad_norm": 14.885448455810547,
|
210 |
+
"learning_rate": 1.989438931997193e-06,
|
211 |
+
"loss": 0.004,
|
212 |
+
"step": 1450
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"epoch": 0.9960159362549801,
|
216 |
+
"grad_norm": 0.38216766715049744,
|
217 |
+
"learning_rate": 1.9570165243701202e-06,
|
218 |
+
"loss": 0.0069,
|
219 |
+
"step": 1500
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"epoch": 1.0,
|
223 |
+
"eval_accuracy": 0.992604961832061,
|
224 |
+
"eval_f1": 0.9926480803352735,
|
225 |
+
"eval_loss": 0.03509189188480377,
|
226 |
+
"eval_precision": 0.9927020529431649,
|
227 |
+
"eval_recall": 0.992604961832061,
|
228 |
+
"eval_runtime": 31.6693,
|
229 |
+
"eval_samples_per_second": 264.736,
|
230 |
+
"eval_steps_per_second": 8.273,
|
231 |
+
"step": 1506
|
232 |
+
}
|
233 |
+
],
|
234 |
+
"logging_steps": 50,
|
235 |
+
"max_steps": 4518,
|
236 |
+
"num_input_tokens_seen": 0,
|
237 |
+
"num_train_epochs": 3,
|
238 |
+
"save_steps": 500,
|
239 |
+
"stateful_callbacks": {
|
240 |
+
"TrainerControl": {
|
241 |
+
"args": {
|
242 |
+
"should_epoch_stop": false,
|
243 |
+
"should_evaluate": false,
|
244 |
+
"should_log": false,
|
245 |
+
"should_save": true,
|
246 |
+
"should_training_stop": false
|
247 |
+
},
|
248 |
+
"attributes": {}
|
249 |
+
}
|
250 |
+
},
|
251 |
+
"total_flos": 1.641430544259072e+16,
|
252 |
+
"train_batch_size": 32,
|
253 |
+
"trial_name": null,
|
254 |
+
"trial_params": null
|
255 |
+
}
|
trial-3/checkpoint-1506/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1ed06b7fefd178dad53ae3fef61fd304580c1d532a37d5010e58ca8f39e302fa
|
3 |
+
size 5368
|
trial-4/checkpoint-3011/config.json
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "answerdotai/ModernBERT-base",
|
3 |
+
"architectures": [
|
4 |
+
"ModernBertForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_bias": false,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 50281,
|
9 |
+
"classifier_activation": "gelu",
|
10 |
+
"classifier_bias": false,
|
11 |
+
"classifier_dropout": 0.0,
|
12 |
+
"classifier_pooling": "mean",
|
13 |
+
"cls_token_id": 50281,
|
14 |
+
"decoder_bias": true,
|
15 |
+
"deterministic_flash_attn": false,
|
16 |
+
"embedding_dropout": 0.0,
|
17 |
+
"eos_token_id": 50282,
|
18 |
+
"global_attn_every_n_layers": 3,
|
19 |
+
"global_rope_theta": 160000.0,
|
20 |
+
"gradient_checkpointing": false,
|
21 |
+
"hidden_activation": "gelu",
|
22 |
+
"hidden_size": 768,
|
23 |
+
"initializer_cutoff_factor": 2.0,
|
24 |
+
"initializer_range": 0.02,
|
25 |
+
"intermediate_size": 1152,
|
26 |
+
"layer_norm_eps": 1e-05,
|
27 |
+
"local_attention": 128,
|
28 |
+
"local_rope_theta": 10000.0,
|
29 |
+
"max_position_embeddings": 8192,
|
30 |
+
"mlp_bias": false,
|
31 |
+
"mlp_dropout": 0.0,
|
32 |
+
"model_type": "modernbert",
|
33 |
+
"norm_bias": false,
|
34 |
+
"norm_eps": 1e-05,
|
35 |
+
"num_attention_heads": 12,
|
36 |
+
"num_hidden_layers": 22,
|
37 |
+
"pad_token_id": 50283,
|
38 |
+
"position_embedding_type": "absolute",
|
39 |
+
"problem_type": "single_label_classification",
|
40 |
+
"reference_compile": true,
|
41 |
+
"sep_token_id": 50282,
|
42 |
+
"sparse_pred_ignore_index": -100,
|
43 |
+
"sparse_prediction": false,
|
44 |
+
"torch_dtype": "float32",
|
45 |
+
"transformers_version": "4.48.0.dev0",
|
46 |
+
"vocab_size": 50368
|
47 |
+
}
|
trial-4/checkpoint-3011/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6998cd19c83cb7aad4574fdf2f2d1d911f7f01e8d94fcb558dc40e5561e3d188
|
3 |
+
size 598439784
|
trial-4/checkpoint-3011/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a0517bca24af0d5ed5988e5100a9e9f6f59df1b0d3e7ca53764baa7878d5d5e3
|
3 |
+
size 1196967418
|
trial-4/checkpoint-3011/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:568428d80a25211a390c359ca51b0b20b38ca0607fbc196f106c9841c02d3e59
|
3 |
+
size 14244
|
trial-4/checkpoint-3011/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3c0dbc7f9aff9e32282e3dcfb80127104b5c3d0089b59d9cb1b981e6af6f8c41
|
3 |
+
size 1064
|
trial-4/checkpoint-3011/trainer_state.json
ADDED
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.02325253002345562,
|
3 |
+
"best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-4/checkpoint-3011",
|
4 |
+
"epoch": 1.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 3011,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.016605778811026237,
|
13 |
+
"grad_norm": 7.4845476150512695,
|
14 |
+
"learning_rate": 1.3209406688296726e-05,
|
15 |
+
"loss": 0.427,
|
16 |
+
"step": 50
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.033211557622052475,
|
20 |
+
"grad_norm": 8.739913940429688,
|
21 |
+
"learning_rate": 1.3184989137392264e-05,
|
22 |
+
"loss": 0.2079,
|
23 |
+
"step": 100
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.04981733643307871,
|
27 |
+
"grad_norm": 10.918631553649902,
|
28 |
+
"learning_rate": 1.31605715864878e-05,
|
29 |
+
"loss": 0.1374,
|
30 |
+
"step": 150
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.06642311524410495,
|
34 |
+
"grad_norm": 0.09207049757242203,
|
35 |
+
"learning_rate": 1.3136154035583336e-05,
|
36 |
+
"loss": 0.0971,
|
37 |
+
"step": 200
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.08302889405513118,
|
41 |
+
"grad_norm": 0.1270512193441391,
|
42 |
+
"learning_rate": 1.3111736484678873e-05,
|
43 |
+
"loss": 0.0431,
|
44 |
+
"step": 250
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.09963467286615742,
|
48 |
+
"grad_norm": 0.01078485231846571,
|
49 |
+
"learning_rate": 1.3087318933774408e-05,
|
50 |
+
"loss": 0.0679,
|
51 |
+
"step": 300
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.11624045167718366,
|
55 |
+
"grad_norm": 0.16803160309791565,
|
56 |
+
"learning_rate": 1.3062901382869945e-05,
|
57 |
+
"loss": 0.0364,
|
58 |
+
"step": 350
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"epoch": 0.1328462304882099,
|
62 |
+
"grad_norm": 0.2863476872444153,
|
63 |
+
"learning_rate": 1.303848383196548e-05,
|
64 |
+
"loss": 0.0802,
|
65 |
+
"step": 400
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"epoch": 0.14945200929923613,
|
69 |
+
"grad_norm": 0.018498318269848824,
|
70 |
+
"learning_rate": 1.3014066281061019e-05,
|
71 |
+
"loss": 0.0324,
|
72 |
+
"step": 450
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"epoch": 0.16605778811026237,
|
76 |
+
"grad_norm": 12.099262237548828,
|
77 |
+
"learning_rate": 1.2989648730156554e-05,
|
78 |
+
"loss": 0.0567,
|
79 |
+
"step": 500
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"epoch": 0.1826635669212886,
|
83 |
+
"grad_norm": 0.04201498255133629,
|
84 |
+
"learning_rate": 1.296523117925209e-05,
|
85 |
+
"loss": 0.0265,
|
86 |
+
"step": 550
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"epoch": 0.19926934573231483,
|
90 |
+
"grad_norm": 13.225788116455078,
|
91 |
+
"learning_rate": 1.2940813628347628e-05,
|
92 |
+
"loss": 0.027,
|
93 |
+
"step": 600
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.2158751245433411,
|
97 |
+
"grad_norm": 2.1863136291503906,
|
98 |
+
"learning_rate": 1.2916396077443163e-05,
|
99 |
+
"loss": 0.0325,
|
100 |
+
"step": 650
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"epoch": 0.23248090335436733,
|
104 |
+
"grad_norm": 0.0031948979012668133,
|
105 |
+
"learning_rate": 1.28919785265387e-05,
|
106 |
+
"loss": 0.0378,
|
107 |
+
"step": 700
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"epoch": 0.24908668216539356,
|
111 |
+
"grad_norm": 0.0001850352855399251,
|
112 |
+
"learning_rate": 1.2867560975634237e-05,
|
113 |
+
"loss": 0.0242,
|
114 |
+
"step": 750
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"epoch": 0.2656924609764198,
|
118 |
+
"grad_norm": 0.0007033672300167382,
|
119 |
+
"learning_rate": 1.2843143424729772e-05,
|
120 |
+
"loss": 0.0306,
|
121 |
+
"step": 800
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"epoch": 0.282298239787446,
|
125 |
+
"grad_norm": 13.938993453979492,
|
126 |
+
"learning_rate": 1.2818725873825309e-05,
|
127 |
+
"loss": 0.0458,
|
128 |
+
"step": 850
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"epoch": 0.29890401859847227,
|
132 |
+
"grad_norm": 0.02099405601620674,
|
133 |
+
"learning_rate": 1.2794308322920844e-05,
|
134 |
+
"loss": 0.0306,
|
135 |
+
"step": 900
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 0.3155097974094985,
|
139 |
+
"grad_norm": 0.024268606677651405,
|
140 |
+
"learning_rate": 1.2769890772016383e-05,
|
141 |
+
"loss": 0.0142,
|
142 |
+
"step": 950
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"epoch": 0.33211557622052473,
|
146 |
+
"grad_norm": 0.004759958013892174,
|
147 |
+
"learning_rate": 1.2745473221111918e-05,
|
148 |
+
"loss": 0.0141,
|
149 |
+
"step": 1000
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"epoch": 0.348721355031551,
|
153 |
+
"grad_norm": 0.0019629066810011864,
|
154 |
+
"learning_rate": 1.2721055670207453e-05,
|
155 |
+
"loss": 0.0345,
|
156 |
+
"step": 1050
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"epoch": 0.3653271338425772,
|
160 |
+
"grad_norm": 0.00019358922145329416,
|
161 |
+
"learning_rate": 1.2696638119302992e-05,
|
162 |
+
"loss": 0.0089,
|
163 |
+
"step": 1100
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"epoch": 0.38193291265360346,
|
167 |
+
"grad_norm": 0.0028237327933311462,
|
168 |
+
"learning_rate": 1.2672220568398527e-05,
|
169 |
+
"loss": 0.0239,
|
170 |
+
"step": 1150
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"epoch": 0.39853869146462967,
|
174 |
+
"grad_norm": 0.00010467255196999758,
|
175 |
+
"learning_rate": 1.2647803017494064e-05,
|
176 |
+
"loss": 0.0094,
|
177 |
+
"step": 1200
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"epoch": 0.41514447027565593,
|
181 |
+
"grad_norm": 0.05774892866611481,
|
182 |
+
"learning_rate": 1.26233854665896e-05,
|
183 |
+
"loss": 0.0246,
|
184 |
+
"step": 1250
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"epoch": 0.4317502490866822,
|
188 |
+
"grad_norm": 0.024394717067480087,
|
189 |
+
"learning_rate": 1.2598967915685136e-05,
|
190 |
+
"loss": 0.0328,
|
191 |
+
"step": 1300
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"epoch": 0.4483560278977084,
|
195 |
+
"grad_norm": 2.231964349746704,
|
196 |
+
"learning_rate": 1.2574550364780673e-05,
|
197 |
+
"loss": 0.0204,
|
198 |
+
"step": 1350
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"epoch": 0.46496180670873466,
|
202 |
+
"grad_norm": 0.0014322358183562756,
|
203 |
+
"learning_rate": 1.2550132813876208e-05,
|
204 |
+
"loss": 0.0001,
|
205 |
+
"step": 1400
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"epoch": 0.48156758551976087,
|
209 |
+
"grad_norm": 0.001744006876833737,
|
210 |
+
"learning_rate": 1.2525715262971747e-05,
|
211 |
+
"loss": 0.0392,
|
212 |
+
"step": 1450
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"epoch": 0.4981733643307871,
|
216 |
+
"grad_norm": 0.027050139382481575,
|
217 |
+
"learning_rate": 1.2501297712067282e-05,
|
218 |
+
"loss": 0.0151,
|
219 |
+
"step": 1500
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"epoch": 0.5147791431418134,
|
223 |
+
"grad_norm": 0.0001924823591252789,
|
224 |
+
"learning_rate": 1.2476880161162817e-05,
|
225 |
+
"loss": 0.0036,
|
226 |
+
"step": 1550
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"epoch": 0.5313849219528396,
|
230 |
+
"grad_norm": 4.767300128936768,
|
231 |
+
"learning_rate": 1.2452462610258356e-05,
|
232 |
+
"loss": 0.0148,
|
233 |
+
"step": 1600
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"epoch": 0.5479907007638658,
|
237 |
+
"grad_norm": 0.0022574588656425476,
|
238 |
+
"learning_rate": 1.242804505935389e-05,
|
239 |
+
"loss": 0.0384,
|
240 |
+
"step": 1650
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"epoch": 0.564596479574892,
|
244 |
+
"grad_norm": 0.12995891273021698,
|
245 |
+
"learning_rate": 1.2403627508449428e-05,
|
246 |
+
"loss": 0.018,
|
247 |
+
"step": 1700
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"epoch": 0.5812022583859183,
|
251 |
+
"grad_norm": 0.0005374422180466354,
|
252 |
+
"learning_rate": 1.2379209957544964e-05,
|
253 |
+
"loss": 0.0039,
|
254 |
+
"step": 1750
|
255 |
+
},
|
256 |
+
{
|
257 |
+
"epoch": 0.5978080371969445,
|
258 |
+
"grad_norm": 0.004592420998960733,
|
259 |
+
"learning_rate": 1.23547924066405e-05,
|
260 |
+
"loss": 0.0136,
|
261 |
+
"step": 1800
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"epoch": 0.6144138160079707,
|
265 |
+
"grad_norm": 0.0008812470478005707,
|
266 |
+
"learning_rate": 1.2330374855736037e-05,
|
267 |
+
"loss": 0.0167,
|
268 |
+
"step": 1850
|
269 |
+
},
|
270 |
+
{
|
271 |
+
"epoch": 0.631019594818997,
|
272 |
+
"grad_norm": 28.337797164916992,
|
273 |
+
"learning_rate": 1.2305957304831572e-05,
|
274 |
+
"loss": 0.0098,
|
275 |
+
"step": 1900
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"epoch": 0.6476253736300233,
|
279 |
+
"grad_norm": 0.0003208396374247968,
|
280 |
+
"learning_rate": 1.228153975392711e-05,
|
281 |
+
"loss": 0.0083,
|
282 |
+
"step": 1950
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"epoch": 0.6642311524410495,
|
286 |
+
"grad_norm": 0.004917904268950224,
|
287 |
+
"learning_rate": 1.2257122203022646e-05,
|
288 |
+
"loss": 0.012,
|
289 |
+
"step": 2000
|
290 |
+
},
|
291 |
+
{
|
292 |
+
"epoch": 0.6808369312520757,
|
293 |
+
"grad_norm": 0.0006444657919928432,
|
294 |
+
"learning_rate": 1.2232704652118182e-05,
|
295 |
+
"loss": 0.0006,
|
296 |
+
"step": 2050
|
297 |
+
},
|
298 |
+
{
|
299 |
+
"epoch": 0.697442710063102,
|
300 |
+
"grad_norm": 0.00020880017837043852,
|
301 |
+
"learning_rate": 1.220828710121372e-05,
|
302 |
+
"loss": 0.0169,
|
303 |
+
"step": 2100
|
304 |
+
},
|
305 |
+
{
|
306 |
+
"epoch": 0.7140484888741282,
|
307 |
+
"grad_norm": 0.009818737395107746,
|
308 |
+
"learning_rate": 1.2183869550309254e-05,
|
309 |
+
"loss": 0.0143,
|
310 |
+
"step": 2150
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"epoch": 0.7306542676851544,
|
314 |
+
"grad_norm": 0.0009041284793056548,
|
315 |
+
"learning_rate": 1.2159451999404791e-05,
|
316 |
+
"loss": 0.0026,
|
317 |
+
"step": 2200
|
318 |
+
},
|
319 |
+
{
|
320 |
+
"epoch": 0.7472600464961807,
|
321 |
+
"grad_norm": 2.3109569549560547,
|
322 |
+
"learning_rate": 1.2135034448500328e-05,
|
323 |
+
"loss": 0.0062,
|
324 |
+
"step": 2250
|
325 |
+
},
|
326 |
+
{
|
327 |
+
"epoch": 0.7638658253072069,
|
328 |
+
"grad_norm": 9.242107807949651e-06,
|
329 |
+
"learning_rate": 1.2110616897595863e-05,
|
330 |
+
"loss": 0.0029,
|
331 |
+
"step": 2300
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"epoch": 0.7804716041182331,
|
335 |
+
"grad_norm": 0.00020709235104732215,
|
336 |
+
"learning_rate": 1.20861993466914e-05,
|
337 |
+
"loss": 0.0,
|
338 |
+
"step": 2350
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"epoch": 0.7970773829292593,
|
342 |
+
"grad_norm": 0.0008476360817439854,
|
343 |
+
"learning_rate": 1.2061781795786937e-05,
|
344 |
+
"loss": 0.019,
|
345 |
+
"step": 2400
|
346 |
+
},
|
347 |
+
{
|
348 |
+
"epoch": 0.8136831617402857,
|
349 |
+
"grad_norm": 0.0002165739715564996,
|
350 |
+
"learning_rate": 1.2037364244882474e-05,
|
351 |
+
"loss": 0.0,
|
352 |
+
"step": 2450
|
353 |
+
},
|
354 |
+
{
|
355 |
+
"epoch": 0.8302889405513119,
|
356 |
+
"grad_norm": 0.029956847429275513,
|
357 |
+
"learning_rate": 1.201294669397801e-05,
|
358 |
+
"loss": 0.0012,
|
359 |
+
"step": 2500
|
360 |
+
},
|
361 |
+
{
|
362 |
+
"epoch": 0.8468947193623381,
|
363 |
+
"grad_norm": 0.0002400112134637311,
|
364 |
+
"learning_rate": 1.1988529143073546e-05,
|
365 |
+
"loss": 0.0191,
|
366 |
+
"step": 2550
|
367 |
+
},
|
368 |
+
{
|
369 |
+
"epoch": 0.8635004981733644,
|
370 |
+
"grad_norm": 0.0070993551053106785,
|
371 |
+
"learning_rate": 1.1964111592169083e-05,
|
372 |
+
"loss": 0.0155,
|
373 |
+
"step": 2600
|
374 |
+
},
|
375 |
+
{
|
376 |
+
"epoch": 0.8801062769843906,
|
377 |
+
"grad_norm": 5.127764234202914e-05,
|
378 |
+
"learning_rate": 1.1939694041264618e-05,
|
379 |
+
"loss": 0.0185,
|
380 |
+
"step": 2650
|
381 |
+
},
|
382 |
+
{
|
383 |
+
"epoch": 0.8967120557954168,
|
384 |
+
"grad_norm": 0.056577421724796295,
|
385 |
+
"learning_rate": 1.1915276490360155e-05,
|
386 |
+
"loss": 0.0063,
|
387 |
+
"step": 2700
|
388 |
+
},
|
389 |
+
{
|
390 |
+
"epoch": 0.913317834606443,
|
391 |
+
"grad_norm": 4.399678437039256e-05,
|
392 |
+
"learning_rate": 1.1890858939455692e-05,
|
393 |
+
"loss": 0.012,
|
394 |
+
"step": 2750
|
395 |
+
},
|
396 |
+
{
|
397 |
+
"epoch": 0.9299236134174693,
|
398 |
+
"grad_norm": 6.6589759626367595e-06,
|
399 |
+
"learning_rate": 1.1866441388551227e-05,
|
400 |
+
"loss": 0.0001,
|
401 |
+
"step": 2800
|
402 |
+
},
|
403 |
+
{
|
404 |
+
"epoch": 0.9465293922284955,
|
405 |
+
"grad_norm": 0.009270718321204185,
|
406 |
+
"learning_rate": 1.1842023837646764e-05,
|
407 |
+
"loss": 0.0001,
|
408 |
+
"step": 2850
|
409 |
+
},
|
410 |
+
{
|
411 |
+
"epoch": 0.9631351710395217,
|
412 |
+
"grad_norm": 6.743930339813232,
|
413 |
+
"learning_rate": 1.1817606286742301e-05,
|
414 |
+
"loss": 0.0019,
|
415 |
+
"step": 2900
|
416 |
+
},
|
417 |
+
{
|
418 |
+
"epoch": 0.9797409498505479,
|
419 |
+
"grad_norm": 10.679564476013184,
|
420 |
+
"learning_rate": 1.1793188735837838e-05,
|
421 |
+
"loss": 0.0258,
|
422 |
+
"step": 2950
|
423 |
+
},
|
424 |
+
{
|
425 |
+
"epoch": 0.9963467286615743,
|
426 |
+
"grad_norm": 0.0007653234642930329,
|
427 |
+
"learning_rate": 1.1768771184933373e-05,
|
428 |
+
"loss": 0.0018,
|
429 |
+
"step": 3000
|
430 |
+
},
|
431 |
+
{
|
432 |
+
"epoch": 1.0,
|
433 |
+
"eval_accuracy": 0.997256679389313,
|
434 |
+
"eval_f1": 0.9972464717374746,
|
435 |
+
"eval_loss": 0.02325253002345562,
|
436 |
+
"eval_precision": 0.997240941740882,
|
437 |
+
"eval_recall": 0.997256679389313,
|
438 |
+
"eval_runtime": 36.6991,
|
439 |
+
"eval_samples_per_second": 228.453,
|
440 |
+
"eval_steps_per_second": 14.278,
|
441 |
+
"step": 3011
|
442 |
+
}
|
443 |
+
],
|
444 |
+
"logging_steps": 50,
|
445 |
+
"max_steps": 27099,
|
446 |
+
"num_input_tokens_seen": 0,
|
447 |
+
"num_train_epochs": 9,
|
448 |
+
"save_steps": 500,
|
449 |
+
"stateful_callbacks": {
|
450 |
+
"TrainerControl": {
|
451 |
+
"args": {
|
452 |
+
"should_epoch_stop": false,
|
453 |
+
"should_evaluate": false,
|
454 |
+
"should_log": false,
|
455 |
+
"should_save": true,
|
456 |
+
"should_training_stop": false
|
457 |
+
},
|
458 |
+
"attributes": {}
|
459 |
+
}
|
460 |
+
},
|
461 |
+
"total_flos": 1.641430544259072e+16,
|
462 |
+
"train_batch_size": 16,
|
463 |
+
"trial_name": null,
|
464 |
+
"trial_params": null
|
465 |
+
}
|
trial-4/checkpoint-3011/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:89fb66224a4a1dbc68c030610c33a1d3f64ca676b2064b388b8e2a7385785f5d
|
3 |
+
size 5368
|
trial-5/checkpoint-3012/config.json
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "answerdotai/ModernBERT-base",
|
3 |
+
"architectures": [
|
4 |
+
"ModernBertForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_bias": false,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 50281,
|
9 |
+
"classifier_activation": "gelu",
|
10 |
+
"classifier_bias": false,
|
11 |
+
"classifier_dropout": 0.0,
|
12 |
+
"classifier_pooling": "mean",
|
13 |
+
"cls_token_id": 50281,
|
14 |
+
"decoder_bias": true,
|
15 |
+
"deterministic_flash_attn": false,
|
16 |
+
"embedding_dropout": 0.0,
|
17 |
+
"eos_token_id": 50282,
|
18 |
+
"global_attn_every_n_layers": 3,
|
19 |
+
"global_rope_theta": 160000.0,
|
20 |
+
"gradient_checkpointing": false,
|
21 |
+
"hidden_activation": "gelu",
|
22 |
+
"hidden_size": 768,
|
23 |
+
"initializer_cutoff_factor": 2.0,
|
24 |
+
"initializer_range": 0.02,
|
25 |
+
"intermediate_size": 1152,
|
26 |
+
"layer_norm_eps": 1e-05,
|
27 |
+
"local_attention": 128,
|
28 |
+
"local_rope_theta": 10000.0,
|
29 |
+
"max_position_embeddings": 8192,
|
30 |
+
"mlp_bias": false,
|
31 |
+
"mlp_dropout": 0.0,
|
32 |
+
"model_type": "modernbert",
|
33 |
+
"norm_bias": false,
|
34 |
+
"norm_eps": 1e-05,
|
35 |
+
"num_attention_heads": 12,
|
36 |
+
"num_hidden_layers": 22,
|
37 |
+
"pad_token_id": 50283,
|
38 |
+
"position_embedding_type": "absolute",
|
39 |
+
"problem_type": "single_label_classification",
|
40 |
+
"reference_compile": true,
|
41 |
+
"sep_token_id": 50282,
|
42 |
+
"sparse_pred_ignore_index": -100,
|
43 |
+
"sparse_prediction": false,
|
44 |
+
"torch_dtype": "float32",
|
45 |
+
"transformers_version": "4.48.0.dev0",
|
46 |
+
"vocab_size": 50368
|
47 |
+
}
|
trial-5/checkpoint-3012/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:49ba330b843aca1a1d0454785b900ed96671619efb6df36ea614d0870f5ef2aa
|
3 |
+
size 598439784
|
trial-5/checkpoint-3012/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:60fb304abd0c5b9d4e6de61faca1856b99e71865a5c592f8acaa47567b9139d9
|
3 |
+
size 1196967418
|
trial-5/checkpoint-3012/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
|
3 |
+
size 14244
|
trial-5/checkpoint-3012/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4c69ec29ae0867d661613f53dea74fb003b51f72db6450102f05c6dfa235171f
|
3 |
+
size 1064
|
trial-5/checkpoint-3012/trainer_state.json
ADDED
@@ -0,0 +1,477 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.0418265163898468,
|
3 |
+
"best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-5/checkpoint-3012",
|
4 |
+
"epoch": 2.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 3012,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.033200531208499334,
|
13 |
+
"grad_norm": 6.311530113220215,
|
14 |
+
"learning_rate": 1.279094112727349e-06,
|
15 |
+
"loss": 0.7104,
|
16 |
+
"step": 50
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.06640106241699867,
|
20 |
+
"grad_norm": 17.497058868408203,
|
21 |
+
"learning_rate": 1.2748333062225943e-06,
|
22 |
+
"loss": 0.5729,
|
23 |
+
"step": 100
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.099601593625498,
|
27 |
+
"grad_norm": 7.590151309967041,
|
28 |
+
"learning_rate": 1.2705724997178397e-06,
|
29 |
+
"loss": 0.4714,
|
30 |
+
"step": 150
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.13280212483399734,
|
34 |
+
"grad_norm": 6.96728515625,
|
35 |
+
"learning_rate": 1.2663116932130851e-06,
|
36 |
+
"loss": 0.3881,
|
37 |
+
"step": 200
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.16600265604249667,
|
41 |
+
"grad_norm": 4.9838714599609375,
|
42 |
+
"learning_rate": 1.2620508867083303e-06,
|
43 |
+
"loss": 0.3194,
|
44 |
+
"step": 250
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.199203187250996,
|
48 |
+
"grad_norm": 6.317371368408203,
|
49 |
+
"learning_rate": 1.2577900802035758e-06,
|
50 |
+
"loss": 0.2976,
|
51 |
+
"step": 300
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.23240371845949534,
|
55 |
+
"grad_norm": 15.331583023071289,
|
56 |
+
"learning_rate": 1.2535292736988212e-06,
|
57 |
+
"loss": 0.2392,
|
58 |
+
"step": 350
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"epoch": 0.2656042496679947,
|
62 |
+
"grad_norm": 15.493165016174316,
|
63 |
+
"learning_rate": 1.2492684671940664e-06,
|
64 |
+
"loss": 0.2337,
|
65 |
+
"step": 400
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"epoch": 0.29880478087649404,
|
69 |
+
"grad_norm": 3.7081472873687744,
|
70 |
+
"learning_rate": 1.2450076606893118e-06,
|
71 |
+
"loss": 0.2037,
|
72 |
+
"step": 450
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"epoch": 0.33200531208499334,
|
76 |
+
"grad_norm": 4.029483318328857,
|
77 |
+
"learning_rate": 1.240746854184557e-06,
|
78 |
+
"loss": 0.2054,
|
79 |
+
"step": 500
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"epoch": 0.3652058432934927,
|
83 |
+
"grad_norm": 4.573270797729492,
|
84 |
+
"learning_rate": 1.2364860476798024e-06,
|
85 |
+
"loss": 0.1555,
|
86 |
+
"step": 550
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"epoch": 0.398406374501992,
|
90 |
+
"grad_norm": 15.748998641967773,
|
91 |
+
"learning_rate": 1.2322252411750478e-06,
|
92 |
+
"loss": 0.1486,
|
93 |
+
"step": 600
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.4316069057104914,
|
97 |
+
"grad_norm": 12.240307807922363,
|
98 |
+
"learning_rate": 1.227964434670293e-06,
|
99 |
+
"loss": 0.1552,
|
100 |
+
"step": 650
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"epoch": 0.4648074369189907,
|
104 |
+
"grad_norm": 17.192546844482422,
|
105 |
+
"learning_rate": 1.2237036281655385e-06,
|
106 |
+
"loss": 0.1234,
|
107 |
+
"step": 700
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"epoch": 0.49800796812749004,
|
111 |
+
"grad_norm": 11.04953670501709,
|
112 |
+
"learning_rate": 1.2194428216607839e-06,
|
113 |
+
"loss": 0.1212,
|
114 |
+
"step": 750
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"epoch": 0.5312084993359893,
|
118 |
+
"grad_norm": 4.883615016937256,
|
119 |
+
"learning_rate": 1.215182015156029e-06,
|
120 |
+
"loss": 0.1059,
|
121 |
+
"step": 800
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"epoch": 0.5644090305444888,
|
125 |
+
"grad_norm": 4.633565425872803,
|
126 |
+
"learning_rate": 1.2109212086512745e-06,
|
127 |
+
"loss": 0.0788,
|
128 |
+
"step": 850
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"epoch": 0.5976095617529881,
|
132 |
+
"grad_norm": 2.6228833198547363,
|
133 |
+
"learning_rate": 1.20666040214652e-06,
|
134 |
+
"loss": 0.087,
|
135 |
+
"step": 900
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 0.6308100929614874,
|
139 |
+
"grad_norm": 6.4782915115356445,
|
140 |
+
"learning_rate": 1.2023995956417651e-06,
|
141 |
+
"loss": 0.0802,
|
142 |
+
"step": 950
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"epoch": 0.6640106241699867,
|
146 |
+
"grad_norm": 5.229304313659668,
|
147 |
+
"learning_rate": 1.1981387891370103e-06,
|
148 |
+
"loss": 0.077,
|
149 |
+
"step": 1000
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"epoch": 0.6972111553784861,
|
153 |
+
"grad_norm": 6.034313201904297,
|
154 |
+
"learning_rate": 1.1938779826322558e-06,
|
155 |
+
"loss": 0.0703,
|
156 |
+
"step": 1050
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"epoch": 0.7304116865869854,
|
160 |
+
"grad_norm": 9.29736614227295,
|
161 |
+
"learning_rate": 1.1896171761275012e-06,
|
162 |
+
"loss": 0.066,
|
163 |
+
"step": 1100
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"epoch": 0.7636122177954847,
|
167 |
+
"grad_norm": 0.6172637343406677,
|
168 |
+
"learning_rate": 1.1853563696227464e-06,
|
169 |
+
"loss": 0.0692,
|
170 |
+
"step": 1150
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"epoch": 0.796812749003984,
|
174 |
+
"grad_norm": 1.642548680305481,
|
175 |
+
"learning_rate": 1.1810955631179918e-06,
|
176 |
+
"loss": 0.0437,
|
177 |
+
"step": 1200
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"epoch": 0.8300132802124834,
|
181 |
+
"grad_norm": 3.888737916946411,
|
182 |
+
"learning_rate": 1.176834756613237e-06,
|
183 |
+
"loss": 0.0474,
|
184 |
+
"step": 1250
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"epoch": 0.8632138114209827,
|
188 |
+
"grad_norm": 14.787779808044434,
|
189 |
+
"learning_rate": 1.1725739501084824e-06,
|
190 |
+
"loss": 0.0501,
|
191 |
+
"step": 1300
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"epoch": 0.896414342629482,
|
195 |
+
"grad_norm": 0.8571153283119202,
|
196 |
+
"learning_rate": 1.1683131436037278e-06,
|
197 |
+
"loss": 0.0439,
|
198 |
+
"step": 1350
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"epoch": 0.9296148738379814,
|
202 |
+
"grad_norm": 0.6915457248687744,
|
203 |
+
"learning_rate": 1.164052337098973e-06,
|
204 |
+
"loss": 0.0455,
|
205 |
+
"step": 1400
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"epoch": 0.9628154050464808,
|
209 |
+
"grad_norm": 8.8081636428833,
|
210 |
+
"learning_rate": 1.1597915305942185e-06,
|
211 |
+
"loss": 0.0347,
|
212 |
+
"step": 1450
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"epoch": 0.9960159362549801,
|
216 |
+
"grad_norm": 8.551522254943848,
|
217 |
+
"learning_rate": 1.1555307240894639e-06,
|
218 |
+
"loss": 0.0346,
|
219 |
+
"step": 1500
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"epoch": 1.0,
|
223 |
+
"eval_accuracy": 0.982824427480916,
|
224 |
+
"eval_f1": 0.9838970307302017,
|
225 |
+
"eval_loss": 0.05475565418601036,
|
226 |
+
"eval_precision": 0.986134299459291,
|
227 |
+
"eval_recall": 0.982824427480916,
|
228 |
+
"eval_runtime": 31.8933,
|
229 |
+
"eval_samples_per_second": 262.877,
|
230 |
+
"eval_steps_per_second": 8.215,
|
231 |
+
"step": 1506
|
232 |
+
},
|
233 |
+
{
|
234 |
+
"epoch": 1.0292164674634794,
|
235 |
+
"grad_norm": 13.078969955444336,
|
236 |
+
"learning_rate": 1.151269917584709e-06,
|
237 |
+
"loss": 0.0379,
|
238 |
+
"step": 1550
|
239 |
+
},
|
240 |
+
{
|
241 |
+
"epoch": 1.0624169986719787,
|
242 |
+
"grad_norm": 1.906078815460205,
|
243 |
+
"learning_rate": 1.1470091110799545e-06,
|
244 |
+
"loss": 0.0338,
|
245 |
+
"step": 1600
|
246 |
+
},
|
247 |
+
{
|
248 |
+
"epoch": 1.095617529880478,
|
249 |
+
"grad_norm": 0.4020080864429474,
|
250 |
+
"learning_rate": 1.1427483045752e-06,
|
251 |
+
"loss": 0.0298,
|
252 |
+
"step": 1650
|
253 |
+
},
|
254 |
+
{
|
255 |
+
"epoch": 1.1288180610889773,
|
256 |
+
"grad_norm": 2.647258758544922,
|
257 |
+
"learning_rate": 1.1384874980704451e-06,
|
258 |
+
"loss": 0.023,
|
259 |
+
"step": 1700
|
260 |
+
},
|
261 |
+
{
|
262 |
+
"epoch": 1.1620185922974768,
|
263 |
+
"grad_norm": 2.046747922897339,
|
264 |
+
"learning_rate": 1.1342266915656906e-06,
|
265 |
+
"loss": 0.0253,
|
266 |
+
"step": 1750
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"epoch": 1.1952191235059761,
|
270 |
+
"grad_norm": 13.14510726928711,
|
271 |
+
"learning_rate": 1.129965885060936e-06,
|
272 |
+
"loss": 0.0268,
|
273 |
+
"step": 1800
|
274 |
+
},
|
275 |
+
{
|
276 |
+
"epoch": 1.2284196547144755,
|
277 |
+
"grad_norm": 0.12764006853103638,
|
278 |
+
"learning_rate": 1.1257050785561812e-06,
|
279 |
+
"loss": 0.0099,
|
280 |
+
"step": 1850
|
281 |
+
},
|
282 |
+
{
|
283 |
+
"epoch": 1.2616201859229748,
|
284 |
+
"grad_norm": 1.6261545419692993,
|
285 |
+
"learning_rate": 1.1214442720514266e-06,
|
286 |
+
"loss": 0.0252,
|
287 |
+
"step": 1900
|
288 |
+
},
|
289 |
+
{
|
290 |
+
"epoch": 1.294820717131474,
|
291 |
+
"grad_norm": 5.552518844604492,
|
292 |
+
"learning_rate": 1.117183465546672e-06,
|
293 |
+
"loss": 0.036,
|
294 |
+
"step": 1950
|
295 |
+
},
|
296 |
+
{
|
297 |
+
"epoch": 1.3280212483399734,
|
298 |
+
"grad_norm": 24.064516067504883,
|
299 |
+
"learning_rate": 1.1129226590419172e-06,
|
300 |
+
"loss": 0.0169,
|
301 |
+
"step": 2000
|
302 |
+
},
|
303 |
+
{
|
304 |
+
"epoch": 1.361221779548473,
|
305 |
+
"grad_norm": 0.00925782322883606,
|
306 |
+
"learning_rate": 1.1086618525371626e-06,
|
307 |
+
"loss": 0.0184,
|
308 |
+
"step": 2050
|
309 |
+
},
|
310 |
+
{
|
311 |
+
"epoch": 1.3944223107569722,
|
312 |
+
"grad_norm": 16.54283905029297,
|
313 |
+
"learning_rate": 1.1044010460324078e-06,
|
314 |
+
"loss": 0.0139,
|
315 |
+
"step": 2100
|
316 |
+
},
|
317 |
+
{
|
318 |
+
"epoch": 1.4276228419654715,
|
319 |
+
"grad_norm": 0.24406713247299194,
|
320 |
+
"learning_rate": 1.1001402395276533e-06,
|
321 |
+
"loss": 0.0126,
|
322 |
+
"step": 2150
|
323 |
+
},
|
324 |
+
{
|
325 |
+
"epoch": 1.4608233731739708,
|
326 |
+
"grad_norm": 0.02731563337147236,
|
327 |
+
"learning_rate": 1.0958794330228987e-06,
|
328 |
+
"loss": 0.0198,
|
329 |
+
"step": 2200
|
330 |
+
},
|
331 |
+
{
|
332 |
+
"epoch": 1.4940239043824701,
|
333 |
+
"grad_norm": 17.53055191040039,
|
334 |
+
"learning_rate": 1.0916186265181439e-06,
|
335 |
+
"loss": 0.0303,
|
336 |
+
"step": 2250
|
337 |
+
},
|
338 |
+
{
|
339 |
+
"epoch": 1.5272244355909694,
|
340 |
+
"grad_norm": 0.07282107323408127,
|
341 |
+
"learning_rate": 1.0873578200133893e-06,
|
342 |
+
"loss": 0.0016,
|
343 |
+
"step": 2300
|
344 |
+
},
|
345 |
+
{
|
346 |
+
"epoch": 1.5604249667994687,
|
347 |
+
"grad_norm": 20.794416427612305,
|
348 |
+
"learning_rate": 1.0830970135086347e-06,
|
349 |
+
"loss": 0.0225,
|
350 |
+
"step": 2350
|
351 |
+
},
|
352 |
+
{
|
353 |
+
"epoch": 1.593625498007968,
|
354 |
+
"grad_norm": 0.052418053150177,
|
355 |
+
"learning_rate": 1.07883620700388e-06,
|
356 |
+
"loss": 0.0076,
|
357 |
+
"step": 2400
|
358 |
+
},
|
359 |
+
{
|
360 |
+
"epoch": 1.6268260292164674,
|
361 |
+
"grad_norm": 0.21063362061977386,
|
362 |
+
"learning_rate": 1.0745754004991254e-06,
|
363 |
+
"loss": 0.0159,
|
364 |
+
"step": 2450
|
365 |
+
},
|
366 |
+
{
|
367 |
+
"epoch": 1.6600265604249667,
|
368 |
+
"grad_norm": 10.455537796020508,
|
369 |
+
"learning_rate": 1.0703145939943708e-06,
|
370 |
+
"loss": 0.0105,
|
371 |
+
"step": 2500
|
372 |
+
},
|
373 |
+
{
|
374 |
+
"epoch": 1.6932270916334662,
|
375 |
+
"grad_norm": 6.205326557159424,
|
376 |
+
"learning_rate": 1.066053787489616e-06,
|
377 |
+
"loss": 0.0081,
|
378 |
+
"step": 2550
|
379 |
+
},
|
380 |
+
{
|
381 |
+
"epoch": 1.7264276228419655,
|
382 |
+
"grad_norm": 6.523694038391113,
|
383 |
+
"learning_rate": 1.0617929809848614e-06,
|
384 |
+
"loss": 0.0159,
|
385 |
+
"step": 2600
|
386 |
+
},
|
387 |
+
{
|
388 |
+
"epoch": 1.7596281540504648,
|
389 |
+
"grad_norm": 0.010043232701718807,
|
390 |
+
"learning_rate": 1.0575321744801068e-06,
|
391 |
+
"loss": 0.0113,
|
392 |
+
"step": 2650
|
393 |
+
},
|
394 |
+
{
|
395 |
+
"epoch": 1.792828685258964,
|
396 |
+
"grad_norm": 0.00458578672260046,
|
397 |
+
"learning_rate": 1.053271367975352e-06,
|
398 |
+
"loss": 0.0086,
|
399 |
+
"step": 2700
|
400 |
+
},
|
401 |
+
{
|
402 |
+
"epoch": 1.8260292164674636,
|
403 |
+
"grad_norm": 0.10986531525850296,
|
404 |
+
"learning_rate": 1.0490105614705974e-06,
|
405 |
+
"loss": 0.008,
|
406 |
+
"step": 2750
|
407 |
+
},
|
408 |
+
{
|
409 |
+
"epoch": 1.859229747675963,
|
410 |
+
"grad_norm": 0.12284637242555618,
|
411 |
+
"learning_rate": 1.0447497549658429e-06,
|
412 |
+
"loss": 0.0052,
|
413 |
+
"step": 2800
|
414 |
+
},
|
415 |
+
{
|
416 |
+
"epoch": 1.8924302788844622,
|
417 |
+
"grad_norm": 0.14606119692325592,
|
418 |
+
"learning_rate": 1.040488948461088e-06,
|
419 |
+
"loss": 0.0176,
|
420 |
+
"step": 2850
|
421 |
+
},
|
422 |
+
{
|
423 |
+
"epoch": 1.9256308100929616,
|
424 |
+
"grad_norm": 0.020491423085331917,
|
425 |
+
"learning_rate": 1.0362281419563333e-06,
|
426 |
+
"loss": 0.0102,
|
427 |
+
"step": 2900
|
428 |
+
},
|
429 |
+
{
|
430 |
+
"epoch": 1.9588313413014609,
|
431 |
+
"grad_norm": 0.05764462426304817,
|
432 |
+
"learning_rate": 1.0319673354515787e-06,
|
433 |
+
"loss": 0.0044,
|
434 |
+
"step": 2950
|
435 |
+
},
|
436 |
+
{
|
437 |
+
"epoch": 1.9920318725099602,
|
438 |
+
"grad_norm": 0.7329011559486389,
|
439 |
+
"learning_rate": 1.027706528946824e-06,
|
440 |
+
"loss": 0.0139,
|
441 |
+
"step": 3000
|
442 |
+
},
|
443 |
+
{
|
444 |
+
"epoch": 2.0,
|
445 |
+
"eval_accuracy": 0.9924856870229007,
|
446 |
+
"eval_f1": 0.9924235722235019,
|
447 |
+
"eval_loss": 0.0418265163898468,
|
448 |
+
"eval_precision": 0.9923830636545329,
|
449 |
+
"eval_recall": 0.9924856870229007,
|
450 |
+
"eval_runtime": 31.6222,
|
451 |
+
"eval_samples_per_second": 265.131,
|
452 |
+
"eval_steps_per_second": 8.285,
|
453 |
+
"step": 3012
|
454 |
+
}
|
455 |
+
],
|
456 |
+
"logging_steps": 50,
|
457 |
+
"max_steps": 15060,
|
458 |
+
"num_input_tokens_seen": 0,
|
459 |
+
"num_train_epochs": 10,
|
460 |
+
"save_steps": 500,
|
461 |
+
"stateful_callbacks": {
|
462 |
+
"TrainerControl": {
|
463 |
+
"args": {
|
464 |
+
"should_epoch_stop": false,
|
465 |
+
"should_evaluate": false,
|
466 |
+
"should_log": false,
|
467 |
+
"should_save": true,
|
468 |
+
"should_training_stop": false
|
469 |
+
},
|
470 |
+
"attributes": {}
|
471 |
+
}
|
472 |
+
},
|
473 |
+
"total_flos": 3.282861088518144e+16,
|
474 |
+
"train_batch_size": 32,
|
475 |
+
"trial_name": null,
|
476 |
+
"trial_params": null
|
477 |
+
}
|
trial-5/checkpoint-3012/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5b5a07ff58876babfad1d92462cc9e7062c8f5b0af8d8ba9142ab6f5e8880cf2
|
3 |
+
size 5368
|
trial-6/checkpoint-6022/config.json
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "answerdotai/ModernBERT-base",
|
3 |
+
"architectures": [
|
4 |
+
"ModernBertForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_bias": false,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 50281,
|
9 |
+
"classifier_activation": "gelu",
|
10 |
+
"classifier_bias": false,
|
11 |
+
"classifier_dropout": 0.0,
|
12 |
+
"classifier_pooling": "mean",
|
13 |
+
"cls_token_id": 50281,
|
14 |
+
"decoder_bias": true,
|
15 |
+
"deterministic_flash_attn": false,
|
16 |
+
"embedding_dropout": 0.0,
|
17 |
+
"eos_token_id": 50282,
|
18 |
+
"global_attn_every_n_layers": 3,
|
19 |
+
"global_rope_theta": 160000.0,
|
20 |
+
"gradient_checkpointing": false,
|
21 |
+
"hidden_activation": "gelu",
|
22 |
+
"hidden_size": 768,
|
23 |
+
"initializer_cutoff_factor": 2.0,
|
24 |
+
"initializer_range": 0.02,
|
25 |
+
"intermediate_size": 1152,
|
26 |
+
"layer_norm_eps": 1e-05,
|
27 |
+
"local_attention": 128,
|
28 |
+
"local_rope_theta": 10000.0,
|
29 |
+
"max_position_embeddings": 8192,
|
30 |
+
"mlp_bias": false,
|
31 |
+
"mlp_dropout": 0.0,
|
32 |
+
"model_type": "modernbert",
|
33 |
+
"norm_bias": false,
|
34 |
+
"norm_eps": 1e-05,
|
35 |
+
"num_attention_heads": 12,
|
36 |
+
"num_hidden_layers": 22,
|
37 |
+
"pad_token_id": 50283,
|
38 |
+
"position_embedding_type": "absolute",
|
39 |
+
"problem_type": "single_label_classification",
|
40 |
+
"reference_compile": true,
|
41 |
+
"sep_token_id": 50282,
|
42 |
+
"sparse_pred_ignore_index": -100,
|
43 |
+
"sparse_prediction": false,
|
44 |
+
"torch_dtype": "float32",
|
45 |
+
"transformers_version": "4.48.0.dev0",
|
46 |
+
"vocab_size": 50368
|
47 |
+
}
|
trial-6/checkpoint-6022/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a60e2fc558ad0e5a9a4825234c28006f4c14c02aab969b5ebf7cb43d8f890d9e
|
3 |
+
size 598439784
|
trial-6/checkpoint-6022/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5ccfa5cc878422afe6f38c7ea21cef7e9f532ec15d2d9169693197daa8b04fb0
|
3 |
+
size 1196967418
|
trial-6/checkpoint-6022/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:914f37830aa379563c31bd15a8b8f53b8ccc8e2de0f0aa6da9695369e4ad84ef
|
3 |
+
size 14244
|