samokosik committed on
Commit
8abfa49
·
verified ·
1 Parent(s): ed0d979

🍻 cheers

Browse files
README.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: google/vit-base-patch16-224-in21k
4
+ tags:
5
+ - image-classification
6
+ - generated_from_trainer
7
+ metrics:
8
+ - accuracy
9
+ model-index:
10
+ - name: finetuned-clothes
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # finetuned-clothes
18
+
19
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the clothes_simplifiedv2 dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.2225
22
+ - Accuracy: 0.9417
23
+
24
+ ## Model description
25
+
26
+ More information needed
27
+
28
+ ## Intended uses & limitations
29
+
30
+ More information needed
31
+
32
+ ## Training and evaluation data
33
+
34
+ More information needed
35
+
36
+ ## Training procedure
37
+
38
+ ### Training hyperparameters
39
+
40
+ The following hyperparameters were used during training:
41
+ - learning_rate: 5e-05
42
+ - train_batch_size: 8
43
+ - eval_batch_size: 8
44
+ - seed: 42
45
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
+ - lr_scheduler_type: linear
47
+ - num_epochs: 4
48
+ - mixed_precision_training: Native AMP
49
+
50
+ ### Training results
51
+
52
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
53
+ |:-------------:|:------:|:----:|:---------------:|:--------:|
54
+ | 0.7725 | 0.2058 | 100 | 0.7008 | 0.8178 |
55
+ | 0.5535 | 0.4115 | 200 | 0.4494 | 0.8994 |
56
+ | 0.4334 | 0.6173 | 300 | 0.3649 | 0.9169 |
57
+ | 0.3921 | 0.8230 | 400 | 0.3085 | 0.9184 |
58
+ | 0.3695 | 1.0288 | 500 | 0.3091 | 0.9184 |
59
+ | 0.2634 | 1.2346 | 600 | 0.3339 | 0.9082 |
60
+ | 0.4788 | 1.4403 | 700 | 0.2827 | 0.9257 |
61
+ | 0.3337 | 1.6461 | 800 | 0.2499 | 0.9344 |
62
+ | 0.34 | 1.8519 | 900 | 0.2586 | 0.9315 |
63
+ | 0.2424 | 2.0576 | 1000 | 0.2248 | 0.9402 |
64
+ | 0.1559 | 2.2634 | 1100 | 0.2333 | 0.9344 |
65
+ | 0.351 | 2.4691 | 1200 | 0.2495 | 0.9359 |
66
+ | 0.2206 | 2.6749 | 1300 | 0.2622 | 0.9242 |
67
+ | 0.3814 | 2.8807 | 1400 | 0.3138 | 0.9155 |
68
+ | 0.2141 | 3.0864 | 1500 | 0.2613 | 0.9315 |
69
+ | 0.112 | 3.2922 | 1600 | 0.2266 | 0.9402 |
70
+ | 0.0631 | 3.4979 | 1700 | 0.2255 | 0.9402 |
71
+ | 0.1986 | 3.7037 | 1800 | 0.2225 | 0.9417 |
72
+ | 0.2345 | 3.9095 | 1900 | 0.2235 | 0.9373 |
73
+
74
+
75
+ ### Framework versions
76
+
77
+ - Transformers 4.40.1
78
+ - PyTorch 2.2.1+cu121
79
+ - Datasets 2.19.0
80
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 4.0,
3
+ "eval_accuracy": 0.9416909620991254,
4
+ "eval_loss": 0.22252099215984344,
5
+ "eval_runtime": 7.4442,
6
+ "eval_samples_per_second": 92.153,
7
+ "eval_steps_per_second": 11.553,
8
+ "total_flos": 1.2048994477712425e+18,
9
+ "train_loss": 0.3544519797212793,
10
+ "train_runtime": 505.606,
11
+ "train_samples_per_second": 30.751,
12
+ "train_steps_per_second": 3.845
13
+ }
config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/vit-base-patch16-224-in21k",
3
+ "architectures": [
4
+ "ViTForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "encoder_stride": 16,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.0,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "Hat",
13
+ "1": "Longsleeve",
14
+ "2": "Outwear",
15
+ "3": "Pants",
16
+ "4": "Shoes",
17
+ "5": "Shorts",
18
+ "6": "Shortsleeve"
19
+ },
20
+ "image_size": 224,
21
+ "initializer_range": 0.02,
22
+ "intermediate_size": 3072,
23
+ "label2id": {
24
+ "Hat": "0",
25
+ "Longsleeve": "1",
26
+ "Outwear": "2",
27
+ "Pants": "3",
28
+ "Shoes": "4",
29
+ "Shorts": "5",
30
+ "Shortsleeve": "6"
31
+ },
32
+ "layer_norm_eps": 1e-12,
33
+ "model_type": "vit",
34
+ "num_attention_heads": 12,
35
+ "num_channels": 3,
36
+ "num_hidden_layers": 12,
37
+ "patch_size": 16,
38
+ "problem_type": "single_label_classification",
39
+ "qkv_bias": true,
40
+ "torch_dtype": "float32",
41
+ "transformers_version": "4.40.1"
42
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 4.0,
3
+ "eval_accuracy": 0.9416909620991254,
4
+ "eval_loss": 0.22252099215984344,
5
+ "eval_runtime": 7.4442,
6
+ "eval_samples_per_second": 92.153,
7
+ "eval_steps_per_second": 11.553
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a63787376b3045c7867b265ce5ccec7f5bb76974b4a3ba8c554830f137febe9
3
+ size 343239356
preprocessor_config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_valid_processor_keys": [
3
+ "images",
4
+ "do_resize",
5
+ "size",
6
+ "resample",
7
+ "do_rescale",
8
+ "rescale_factor",
9
+ "do_normalize",
10
+ "image_mean",
11
+ "image_std",
12
+ "return_tensors",
13
+ "data_format",
14
+ "input_data_format"
15
+ ],
16
+ "do_normalize": true,
17
+ "do_rescale": true,
18
+ "do_resize": true,
19
+ "image_mean": [
20
+ 0.5,
21
+ 0.5,
22
+ 0.5
23
+ ],
24
+ "image_processor_type": "ViTImageProcessor",
25
+ "image_std": [
26
+ 0.5,
27
+ 0.5,
28
+ 0.5
29
+ ],
30
+ "resample": 2,
31
+ "rescale_factor": 0.00392156862745098,
32
+ "size": {
33
+ "height": 224,
34
+ "width": 224
35
+ }
36
+ }
runs/Apr28_09-24-43_8de383cac982/events.out.tfevents.1714296294.8de383cac982.2041.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23cdb2859db0046112b6540028c4a9b2474bff43db913245c05f2a707637ffe0
3
+ size 52226
runs/Apr28_09-24-43_8de383cac982/events.out.tfevents.1714296828.8de383cac982.2041.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9c357624e0844ac8993ddbfe88e442164e40e923ef5061ae3084c0f658d2422
3
+ size 411
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 4.0,
3
+ "total_flos": 1.2048994477712425e+18,
4
+ "train_loss": 0.3544519797212793,
5
+ "train_runtime": 505.606,
6
+ "train_samples_per_second": 30.751,
7
+ "train_steps_per_second": 3.845
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1559 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.22252099215984344,
3
+ "best_model_checkpoint": "finetuned-clothes/checkpoint-1800",
4
+ "epoch": 4.0,
5
+ "eval_steps": 100,
6
+ "global_step": 1944,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0205761316872428,
13
+ "grad_norm": 2.173704147338867,
14
+ "learning_rate": 4.974279835390947e-05,
15
+ "loss": 1.8548,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.0411522633744856,
20
+ "grad_norm": 4.239943981170654,
21
+ "learning_rate": 4.948559670781893e-05,
22
+ "loss": 1.6474,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.06172839506172839,
27
+ "grad_norm": 3.0864417552948,
28
+ "learning_rate": 4.92283950617284e-05,
29
+ "loss": 1.4579,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.0823045267489712,
34
+ "grad_norm": 2.301297187805176,
35
+ "learning_rate": 4.8971193415637865e-05,
36
+ "loss": 1.3241,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.102880658436214,
41
+ "grad_norm": 2.933986186981201,
42
+ "learning_rate": 4.871399176954733e-05,
43
+ "loss": 1.1818,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.12345679012345678,
48
+ "grad_norm": 2.552161931991577,
49
+ "learning_rate": 4.845679012345679e-05,
50
+ "loss": 1.0563,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.1440329218106996,
55
+ "grad_norm": 2.6376731395721436,
56
+ "learning_rate": 4.819958847736626e-05,
57
+ "loss": 1.0264,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.1646090534979424,
62
+ "grad_norm": 3.713806390762329,
63
+ "learning_rate": 4.794238683127572e-05,
64
+ "loss": 0.9449,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.18518518518518517,
69
+ "grad_norm": 4.88785982131958,
70
+ "learning_rate": 4.768518518518519e-05,
71
+ "loss": 0.7746,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.205761316872428,
76
+ "grad_norm": 2.0521767139434814,
77
+ "learning_rate": 4.742798353909465e-05,
78
+ "loss": 0.7725,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.205761316872428,
83
+ "eval_accuracy": 0.8177842565597667,
84
+ "eval_loss": 0.7007715106010437,
85
+ "eval_runtime": 5.2424,
86
+ "eval_samples_per_second": 130.856,
87
+ "eval_steps_per_second": 16.405,
88
+ "step": 100
89
+ },
90
+ {
91
+ "epoch": 0.22633744855967078,
92
+ "grad_norm": 1.4839606285095215,
93
+ "learning_rate": 4.7170781893004116e-05,
94
+ "loss": 0.6586,
95
+ "step": 110
96
+ },
97
+ {
98
+ "epoch": 0.24691358024691357,
99
+ "grad_norm": 5.147204875946045,
100
+ "learning_rate": 4.691358024691358e-05,
101
+ "loss": 0.7189,
102
+ "step": 120
103
+ },
104
+ {
105
+ "epoch": 0.2674897119341564,
106
+ "grad_norm": 7.209930896759033,
107
+ "learning_rate": 4.665637860082305e-05,
108
+ "loss": 0.7384,
109
+ "step": 130
110
+ },
111
+ {
112
+ "epoch": 0.2880658436213992,
113
+ "grad_norm": 5.127917766571045,
114
+ "learning_rate": 4.639917695473252e-05,
115
+ "loss": 0.7484,
116
+ "step": 140
117
+ },
118
+ {
119
+ "epoch": 0.30864197530864196,
120
+ "grad_norm": 3.2841947078704834,
121
+ "learning_rate": 4.614197530864198e-05,
122
+ "loss": 0.6732,
123
+ "step": 150
124
+ },
125
+ {
126
+ "epoch": 0.3292181069958848,
127
+ "grad_norm": 5.1268110275268555,
128
+ "learning_rate": 4.5884773662551446e-05,
129
+ "loss": 0.6015,
130
+ "step": 160
131
+ },
132
+ {
133
+ "epoch": 0.3497942386831276,
134
+ "grad_norm": 5.863048553466797,
135
+ "learning_rate": 4.5627572016460906e-05,
136
+ "loss": 0.594,
137
+ "step": 170
138
+ },
139
+ {
140
+ "epoch": 0.37037037037037035,
141
+ "grad_norm": 2.000070095062256,
142
+ "learning_rate": 4.5370370370370374e-05,
143
+ "loss": 0.5165,
144
+ "step": 180
145
+ },
146
+ {
147
+ "epoch": 0.39094650205761317,
148
+ "grad_norm": 4.510650157928467,
149
+ "learning_rate": 4.5113168724279834e-05,
150
+ "loss": 0.621,
151
+ "step": 190
152
+ },
153
+ {
154
+ "epoch": 0.411522633744856,
155
+ "grad_norm": 2.4744088649749756,
156
+ "learning_rate": 4.48559670781893e-05,
157
+ "loss": 0.5535,
158
+ "step": 200
159
+ },
160
+ {
161
+ "epoch": 0.411522633744856,
162
+ "eval_accuracy": 0.8994169096209913,
163
+ "eval_loss": 0.44938138127326965,
164
+ "eval_runtime": 5.2054,
165
+ "eval_samples_per_second": 131.787,
166
+ "eval_steps_per_second": 16.521,
167
+ "step": 200
168
+ },
169
+ {
170
+ "epoch": 0.43209876543209874,
171
+ "grad_norm": 7.232150077819824,
172
+ "learning_rate": 4.459876543209877e-05,
173
+ "loss": 0.7554,
174
+ "step": 210
175
+ },
176
+ {
177
+ "epoch": 0.45267489711934156,
178
+ "grad_norm": 2.480642318725586,
179
+ "learning_rate": 4.4341563786008236e-05,
180
+ "loss": 0.6475,
181
+ "step": 220
182
+ },
183
+ {
184
+ "epoch": 0.4732510288065844,
185
+ "grad_norm": 6.75883150100708,
186
+ "learning_rate": 4.40843621399177e-05,
187
+ "loss": 0.4951,
188
+ "step": 230
189
+ },
190
+ {
191
+ "epoch": 0.49382716049382713,
192
+ "grad_norm": 2.3725767135620117,
193
+ "learning_rate": 4.3827160493827164e-05,
194
+ "loss": 0.5711,
195
+ "step": 240
196
+ },
197
+ {
198
+ "epoch": 0.51440329218107,
199
+ "grad_norm": 4.138972282409668,
200
+ "learning_rate": 4.3569958847736625e-05,
201
+ "loss": 0.5318,
202
+ "step": 250
203
+ },
204
+ {
205
+ "epoch": 0.5349794238683128,
206
+ "grad_norm": 11.300224304199219,
207
+ "learning_rate": 4.331275720164609e-05,
208
+ "loss": 0.5041,
209
+ "step": 260
210
+ },
211
+ {
212
+ "epoch": 0.5555555555555556,
213
+ "grad_norm": 6.57551383972168,
214
+ "learning_rate": 4.305555555555556e-05,
215
+ "loss": 0.5828,
216
+ "step": 270
217
+ },
218
+ {
219
+ "epoch": 0.5761316872427984,
220
+ "grad_norm": 4.986762523651123,
221
+ "learning_rate": 4.279835390946502e-05,
222
+ "loss": 0.491,
223
+ "step": 280
224
+ },
225
+ {
226
+ "epoch": 0.5967078189300411,
227
+ "grad_norm": 3.5556581020355225,
228
+ "learning_rate": 4.254115226337449e-05,
229
+ "loss": 0.4911,
230
+ "step": 290
231
+ },
232
+ {
233
+ "epoch": 0.6172839506172839,
234
+ "grad_norm": 0.6545975804328918,
235
+ "learning_rate": 4.230967078189301e-05,
236
+ "loss": 0.4334,
237
+ "step": 300
238
+ },
239
+ {
240
+ "epoch": 0.6172839506172839,
241
+ "eval_accuracy": 0.9169096209912536,
242
+ "eval_loss": 0.3648872971534729,
243
+ "eval_runtime": 5.389,
244
+ "eval_samples_per_second": 127.297,
245
+ "eval_steps_per_second": 15.959,
246
+ "step": 300
247
+ },
248
+ {
249
+ "epoch": 0.6378600823045267,
250
+ "grad_norm": 6.065480709075928,
251
+ "learning_rate": 4.205246913580247e-05,
252
+ "loss": 0.4079,
253
+ "step": 310
254
+ },
255
+ {
256
+ "epoch": 0.6584362139917695,
257
+ "grad_norm": 7.807397842407227,
258
+ "learning_rate": 4.1795267489711935e-05,
259
+ "loss": 0.5353,
260
+ "step": 320
261
+ },
262
+ {
263
+ "epoch": 0.6790123456790124,
264
+ "grad_norm": 4.977638244628906,
265
+ "learning_rate": 4.1563786008230455e-05,
266
+ "loss": 0.3977,
267
+ "step": 330
268
+ },
269
+ {
270
+ "epoch": 0.6995884773662552,
271
+ "grad_norm": 0.7537054419517517,
272
+ "learning_rate": 4.130658436213992e-05,
273
+ "loss": 0.4792,
274
+ "step": 340
275
+ },
276
+ {
277
+ "epoch": 0.720164609053498,
278
+ "grad_norm": 1.4330915212631226,
279
+ "learning_rate": 4.104938271604938e-05,
280
+ "loss": 0.3986,
281
+ "step": 350
282
+ },
283
+ {
284
+ "epoch": 0.7407407407407407,
285
+ "grad_norm": 3.951570510864258,
286
+ "learning_rate": 4.079218106995885e-05,
287
+ "loss": 0.4249,
288
+ "step": 360
289
+ },
290
+ {
291
+ "epoch": 0.7613168724279835,
292
+ "grad_norm": 6.858752727508545,
293
+ "learning_rate": 4.053497942386831e-05,
294
+ "loss": 0.4735,
295
+ "step": 370
296
+ },
297
+ {
298
+ "epoch": 0.7818930041152263,
299
+ "grad_norm": 7.806582927703857,
300
+ "learning_rate": 4.027777777777778e-05,
301
+ "loss": 0.4037,
302
+ "step": 380
303
+ },
304
+ {
305
+ "epoch": 0.8024691358024691,
306
+ "grad_norm": 5.925368309020996,
307
+ "learning_rate": 4.0020576131687245e-05,
308
+ "loss": 0.3997,
309
+ "step": 390
310
+ },
311
+ {
312
+ "epoch": 0.823045267489712,
313
+ "grad_norm": 15.64376163482666,
314
+ "learning_rate": 3.976337448559671e-05,
315
+ "loss": 0.3921,
316
+ "step": 400
317
+ },
318
+ {
319
+ "epoch": 0.823045267489712,
320
+ "eval_accuracy": 0.9183673469387755,
321
+ "eval_loss": 0.3085317313671112,
322
+ "eval_runtime": 5.5462,
323
+ "eval_samples_per_second": 123.689,
324
+ "eval_steps_per_second": 15.506,
325
+ "step": 400
326
+ },
327
+ {
328
+ "epoch": 0.8436213991769548,
329
+ "grad_norm": 0.49218544363975525,
330
+ "learning_rate": 3.950617283950617e-05,
331
+ "loss": 0.394,
332
+ "step": 410
333
+ },
334
+ {
335
+ "epoch": 0.8641975308641975,
336
+ "grad_norm": 0.5128059983253479,
337
+ "learning_rate": 3.924897119341564e-05,
338
+ "loss": 0.2721,
339
+ "step": 420
340
+ },
341
+ {
342
+ "epoch": 0.8847736625514403,
343
+ "grad_norm": 6.586925029754639,
344
+ "learning_rate": 3.89917695473251e-05,
345
+ "loss": 0.4608,
346
+ "step": 430
347
+ },
348
+ {
349
+ "epoch": 0.9053497942386831,
350
+ "grad_norm": 4.243907451629639,
351
+ "learning_rate": 3.873456790123457e-05,
352
+ "loss": 0.4534,
353
+ "step": 440
354
+ },
355
+ {
356
+ "epoch": 0.9259259259259259,
357
+ "grad_norm": 0.5013654828071594,
358
+ "learning_rate": 3.8477366255144036e-05,
359
+ "loss": 0.3535,
360
+ "step": 450
361
+ },
362
+ {
363
+ "epoch": 0.9465020576131687,
364
+ "grad_norm": 12.352477073669434,
365
+ "learning_rate": 3.8220164609053496e-05,
366
+ "loss": 0.3733,
367
+ "step": 460
368
+ },
369
+ {
370
+ "epoch": 0.9670781893004116,
371
+ "grad_norm": 7.122161865234375,
372
+ "learning_rate": 3.7962962962962964e-05,
373
+ "loss": 0.4486,
374
+ "step": 470
375
+ },
376
+ {
377
+ "epoch": 0.9876543209876543,
378
+ "grad_norm": 4.856133937835693,
379
+ "learning_rate": 3.770576131687243e-05,
380
+ "loss": 0.4629,
381
+ "step": 480
382
+ },
383
+ {
384
+ "epoch": 1.008230452674897,
385
+ "grad_norm": 0.6018617749214172,
386
+ "learning_rate": 3.74485596707819e-05,
387
+ "loss": 0.2749,
388
+ "step": 490
389
+ },
390
+ {
391
+ "epoch": 1.02880658436214,
392
+ "grad_norm": 8.492673873901367,
393
+ "learning_rate": 3.719135802469136e-05,
394
+ "loss": 0.3695,
395
+ "step": 500
396
+ },
397
+ {
398
+ "epoch": 1.02880658436214,
399
+ "eval_accuracy": 0.9183673469387755,
400
+ "eval_loss": 0.30907005071640015,
401
+ "eval_runtime": 5.8028,
402
+ "eval_samples_per_second": 118.218,
403
+ "eval_steps_per_second": 14.82,
404
+ "step": 500
405
+ },
406
+ {
407
+ "epoch": 1.0493827160493827,
408
+ "grad_norm": 2.352186679840088,
409
+ "learning_rate": 3.6934156378600826e-05,
410
+ "loss": 0.3995,
411
+ "step": 510
412
+ },
413
+ {
414
+ "epoch": 1.0699588477366255,
415
+ "grad_norm": 2.0695996284484863,
416
+ "learning_rate": 3.667695473251029e-05,
417
+ "loss": 0.2422,
418
+ "step": 520
419
+ },
420
+ {
421
+ "epoch": 1.0905349794238683,
422
+ "grad_norm": 0.4764484167098999,
423
+ "learning_rate": 3.6419753086419754e-05,
424
+ "loss": 0.3073,
425
+ "step": 530
426
+ },
427
+ {
428
+ "epoch": 1.1111111111111112,
429
+ "grad_norm": 2.4346959590911865,
430
+ "learning_rate": 3.6162551440329215e-05,
431
+ "loss": 0.279,
432
+ "step": 540
433
+ },
434
+ {
435
+ "epoch": 1.131687242798354,
436
+ "grad_norm": 1.6526964902877808,
437
+ "learning_rate": 3.590534979423868e-05,
438
+ "loss": 0.217,
439
+ "step": 550
440
+ },
441
+ {
442
+ "epoch": 1.1522633744855968,
443
+ "grad_norm": 1.5531551837921143,
444
+ "learning_rate": 3.564814814814815e-05,
445
+ "loss": 0.3906,
446
+ "step": 560
447
+ },
448
+ {
449
+ "epoch": 1.1728395061728394,
450
+ "grad_norm": 0.41519421339035034,
451
+ "learning_rate": 3.539094650205762e-05,
452
+ "loss": 0.1809,
453
+ "step": 570
454
+ },
455
+ {
456
+ "epoch": 1.1934156378600824,
457
+ "grad_norm": 8.064279556274414,
458
+ "learning_rate": 3.5133744855967084e-05,
459
+ "loss": 0.2767,
460
+ "step": 580
461
+ },
462
+ {
463
+ "epoch": 1.213991769547325,
464
+ "grad_norm": 0.4536079466342926,
465
+ "learning_rate": 3.4876543209876545e-05,
466
+ "loss": 0.4032,
467
+ "step": 590
468
+ },
469
+ {
470
+ "epoch": 1.2345679012345678,
471
+ "grad_norm": 12.668285369873047,
472
+ "learning_rate": 3.461934156378601e-05,
473
+ "loss": 0.2634,
474
+ "step": 600
475
+ },
476
+ {
477
+ "epoch": 1.2345679012345678,
478
+ "eval_accuracy": 0.9081632653061225,
479
+ "eval_loss": 0.3338664472103119,
480
+ "eval_runtime": 5.3039,
481
+ "eval_samples_per_second": 129.338,
482
+ "eval_steps_per_second": 16.214,
483
+ "step": 600
484
+ },
485
+ {
486
+ "epoch": 1.2551440329218106,
487
+ "grad_norm": 3.9776062965393066,
488
+ "learning_rate": 3.436213991769547e-05,
489
+ "loss": 0.3071,
490
+ "step": 610
491
+ },
492
+ {
493
+ "epoch": 1.2757201646090535,
494
+ "grad_norm": 3.995609760284424,
495
+ "learning_rate": 3.410493827160494e-05,
496
+ "loss": 0.3862,
497
+ "step": 620
498
+ },
499
+ {
500
+ "epoch": 1.2962962962962963,
501
+ "grad_norm": 1.187232255935669,
502
+ "learning_rate": 3.38477366255144e-05,
503
+ "loss": 0.4498,
504
+ "step": 630
505
+ },
506
+ {
507
+ "epoch": 1.316872427983539,
508
+ "grad_norm": 17.8300724029541,
509
+ "learning_rate": 3.3590534979423874e-05,
510
+ "loss": 0.3186,
511
+ "step": 640
512
+ },
513
+ {
514
+ "epoch": 1.337448559670782,
515
+ "grad_norm": 1.6956249475479126,
516
+ "learning_rate": 3.3333333333333335e-05,
517
+ "loss": 0.2368,
518
+ "step": 650
519
+ },
520
+ {
521
+ "epoch": 1.3580246913580247,
522
+ "grad_norm": 0.9367788434028625,
523
+ "learning_rate": 3.30761316872428e-05,
524
+ "loss": 0.4461,
525
+ "step": 660
526
+ },
527
+ {
528
+ "epoch": 1.3786008230452675,
529
+ "grad_norm": 3.084554433822632,
530
+ "learning_rate": 3.281893004115226e-05,
531
+ "loss": 0.3022,
532
+ "step": 670
533
+ },
534
+ {
535
+ "epoch": 1.3991769547325104,
536
+ "grad_norm": 1.4184356927871704,
537
+ "learning_rate": 3.256172839506173e-05,
538
+ "loss": 0.3542,
539
+ "step": 680
540
+ },
541
+ {
542
+ "epoch": 1.4197530864197532,
543
+ "grad_norm": 9.956780433654785,
544
+ "learning_rate": 3.230452674897119e-05,
545
+ "loss": 0.2772,
546
+ "step": 690
547
+ },
548
+ {
549
+ "epoch": 1.4403292181069958,
550
+ "grad_norm": 7.774564266204834,
551
+ "learning_rate": 3.204732510288066e-05,
552
+ "loss": 0.4788,
553
+ "step": 700
554
+ },
555
+ {
556
+ "epoch": 1.4403292181069958,
557
+ "eval_accuracy": 0.9256559766763849,
558
+ "eval_loss": 0.28270086646080017,
559
+ "eval_runtime": 5.1862,
560
+ "eval_samples_per_second": 132.274,
561
+ "eval_steps_per_second": 16.582,
562
+ "step": 700
563
+ },
564
+ {
565
+ "epoch": 1.4609053497942388,
566
+ "grad_norm": 1.6019172668457031,
567
+ "learning_rate": 3.1790123456790125e-05,
568
+ "loss": 0.3656,
569
+ "step": 710
570
+ },
571
+ {
572
+ "epoch": 1.4814814814814814,
573
+ "grad_norm": 0.32796138525009155,
574
+ "learning_rate": 3.153292181069959e-05,
575
+ "loss": 0.291,
576
+ "step": 720
577
+ },
578
+ {
579
+ "epoch": 1.5020576131687244,
580
+ "grad_norm": 6.803758144378662,
581
+ "learning_rate": 3.127572016460906e-05,
582
+ "loss": 0.2756,
583
+ "step": 730
584
+ },
585
+ {
586
+ "epoch": 1.522633744855967,
587
+ "grad_norm": 9.335061073303223,
588
+ "learning_rate": 3.101851851851852e-05,
589
+ "loss": 0.353,
590
+ "step": 740
591
+ },
592
+ {
593
+ "epoch": 1.5432098765432098,
594
+ "grad_norm": 11.15013599395752,
595
+ "learning_rate": 3.076131687242799e-05,
596
+ "loss": 0.3542,
597
+ "step": 750
598
+ },
599
+ {
600
+ "epoch": 1.5637860082304527,
601
+ "grad_norm": 7.025611400604248,
602
+ "learning_rate": 3.050411522633745e-05,
603
+ "loss": 0.3576,
604
+ "step": 760
605
+ },
606
+ {
607
+ "epoch": 1.5843621399176955,
608
+ "grad_norm": 4.891916275024414,
609
+ "learning_rate": 3.0246913580246916e-05,
610
+ "loss": 0.1898,
611
+ "step": 770
612
+ },
613
+ {
614
+ "epoch": 1.6049382716049383,
615
+ "grad_norm": 4.149532318115234,
616
+ "learning_rate": 2.998971193415638e-05,
617
+ "loss": 0.3467,
618
+ "step": 780
619
+ },
620
+ {
621
+ "epoch": 1.625514403292181,
622
+ "grad_norm": 4.81355094909668,
623
+ "learning_rate": 2.9732510288065847e-05,
624
+ "loss": 0.2927,
625
+ "step": 790
626
+ },
627
+ {
628
+ "epoch": 1.646090534979424,
629
+ "grad_norm": 5.665321350097656,
630
+ "learning_rate": 2.9475308641975308e-05,
631
+ "loss": 0.3337,
632
+ "step": 800
633
+ },
634
+ {
635
+ "epoch": 1.646090534979424,
636
+ "eval_accuracy": 0.934402332361516,
637
+ "eval_loss": 0.24985870718955994,
638
+ "eval_runtime": 11.5489,
639
+ "eval_samples_per_second": 59.4,
640
+ "eval_steps_per_second": 7.447,
641
+ "step": 800
642
+ },
643
+ {
644
+ "epoch": 1.6666666666666665,
645
+ "grad_norm": 5.743719100952148,
646
+ "learning_rate": 2.9218106995884775e-05,
647
+ "loss": 0.2494,
648
+ "step": 810
649
+ },
650
+ {
651
+ "epoch": 1.6872427983539096,
652
+ "grad_norm": 0.2070736289024353,
653
+ "learning_rate": 2.896090534979424e-05,
654
+ "loss": 0.3724,
655
+ "step": 820
656
+ },
657
+ {
658
+ "epoch": 1.7078189300411522,
659
+ "grad_norm": 0.6346050500869751,
660
+ "learning_rate": 2.8703703703703706e-05,
661
+ "loss": 0.4756,
662
+ "step": 830
663
+ },
664
+ {
665
+ "epoch": 1.7283950617283952,
666
+ "grad_norm": 10.402286529541016,
667
+ "learning_rate": 2.8446502057613174e-05,
668
+ "loss": 0.265,
669
+ "step": 840
670
+ },
671
+ {
672
+ "epoch": 1.7489711934156378,
673
+ "grad_norm": 10.685330390930176,
674
+ "learning_rate": 2.8189300411522634e-05,
675
+ "loss": 0.301,
676
+ "step": 850
677
+ },
678
+ {
679
+ "epoch": 1.7695473251028808,
680
+ "grad_norm": 8.265226364135742,
681
+ "learning_rate": 2.79320987654321e-05,
682
+ "loss": 0.3098,
683
+ "step": 860
684
+ },
685
+ {
686
+ "epoch": 1.7901234567901234,
687
+ "grad_norm": 8.33575439453125,
688
+ "learning_rate": 2.7674897119341565e-05,
689
+ "loss": 0.2974,
690
+ "step": 870
691
+ },
692
+ {
693
+ "epoch": 1.8106995884773662,
694
+ "grad_norm": 8.888866424560547,
695
+ "learning_rate": 2.7417695473251033e-05,
696
+ "loss": 0.1666,
697
+ "step": 880
698
+ },
699
+ {
700
+ "epoch": 1.831275720164609,
701
+ "grad_norm": 8.413542747497559,
702
+ "learning_rate": 2.7160493827160493e-05,
703
+ "loss": 0.2286,
704
+ "step": 890
705
+ },
706
+ {
707
+ "epoch": 1.8518518518518519,
708
+ "grad_norm": 1.5029722452163696,
709
+ "learning_rate": 2.690329218106996e-05,
710
+ "loss": 0.34,
711
+ "step": 900
712
+ },
713
+ {
714
+ "epoch": 1.8518518518518519,
715
+ "eval_accuracy": 0.9314868804664723,
716
+ "eval_loss": 0.2585590183734894,
717
+ "eval_runtime": 6.5895,
718
+ "eval_samples_per_second": 104.105,
719
+ "eval_steps_per_second": 13.051,
720
+ "step": 900
721
+ },
722
+ {
723
+ "epoch": 1.8724279835390947,
724
+ "grad_norm": 0.20841029286384583,
725
+ "learning_rate": 2.6646090534979425e-05,
726
+ "loss": 0.275,
727
+ "step": 910
728
+ },
729
+ {
730
+ "epoch": 1.8930041152263375,
731
+ "grad_norm": 9.179852485656738,
732
+ "learning_rate": 2.6388888888888892e-05,
733
+ "loss": 0.2304,
734
+ "step": 920
735
+ },
736
+ {
737
+ "epoch": 1.9135802469135803,
738
+ "grad_norm": 0.9464486241340637,
739
+ "learning_rate": 2.6131687242798352e-05,
740
+ "loss": 0.2368,
741
+ "step": 930
742
+ },
743
+ {
744
+ "epoch": 1.934156378600823,
745
+ "grad_norm": 10.512286186218262,
746
+ "learning_rate": 2.587448559670782e-05,
747
+ "loss": 0.2969,
748
+ "step": 940
749
+ },
750
+ {
751
+ "epoch": 1.954732510288066,
752
+ "grad_norm": 3.54328989982605,
753
+ "learning_rate": 2.5617283950617287e-05,
754
+ "loss": 0.2327,
755
+ "step": 950
756
+ },
757
+ {
758
+ "epoch": 1.9753086419753085,
759
+ "grad_norm": 0.5239406824111938,
760
+ "learning_rate": 2.536008230452675e-05,
761
+ "loss": 0.3048,
762
+ "step": 960
763
+ },
764
+ {
765
+ "epoch": 1.9958847736625516,
766
+ "grad_norm": 0.25483438372612,
767
+ "learning_rate": 2.510288065843622e-05,
768
+ "loss": 0.3584,
769
+ "step": 970
770
+ },
771
+ {
772
+ "epoch": 2.016460905349794,
773
+ "grad_norm": 0.4868137538433075,
774
+ "learning_rate": 2.484567901234568e-05,
775
+ "loss": 0.3359,
776
+ "step": 980
777
+ },
778
+ {
779
+ "epoch": 2.037037037037037,
780
+ "grad_norm": 6.471540927886963,
781
+ "learning_rate": 2.4588477366255143e-05,
782
+ "loss": 0.4194,
783
+ "step": 990
784
+ },
785
+ {
786
+ "epoch": 2.05761316872428,
787
+ "grad_norm": 0.21457789838314056,
788
+ "learning_rate": 2.433127572016461e-05,
789
+ "loss": 0.2424,
790
+ "step": 1000
791
+ },
792
+ {
793
+ "epoch": 2.05761316872428,
794
+ "eval_accuracy": 0.9402332361516035,
795
+ "eval_loss": 0.2248041331768036,
796
+ "eval_runtime": 5.6496,
797
+ "eval_samples_per_second": 121.424,
798
+ "eval_steps_per_second": 15.222,
799
+ "step": 1000
800
+ },
801
+ {
802
+ "epoch": 2.078189300411523,
803
+ "grad_norm": 0.2115119844675064,
804
+ "learning_rate": 2.4074074074074074e-05,
805
+ "loss": 0.1537,
806
+ "step": 1010
807
+ },
808
+ {
809
+ "epoch": 2.0987654320987654,
810
+ "grad_norm": 0.7327996492385864,
811
+ "learning_rate": 2.381687242798354e-05,
812
+ "loss": 0.1881,
813
+ "step": 1020
814
+ },
815
+ {
816
+ "epoch": 2.119341563786008,
817
+ "grad_norm": 11.874442100524902,
818
+ "learning_rate": 2.3559670781893005e-05,
819
+ "loss": 0.2633,
820
+ "step": 1030
821
+ },
822
+ {
823
+ "epoch": 2.139917695473251,
824
+ "grad_norm": 0.2526969015598297,
825
+ "learning_rate": 2.3302469135802473e-05,
826
+ "loss": 0.1868,
827
+ "step": 1040
828
+ },
829
+ {
830
+ "epoch": 2.1604938271604937,
831
+ "grad_norm": 0.23793959617614746,
832
+ "learning_rate": 2.3045267489711937e-05,
833
+ "loss": 0.2138,
834
+ "step": 1050
835
+ },
836
+ {
837
+ "epoch": 2.1810699588477367,
838
+ "grad_norm": 0.319592148065567,
839
+ "learning_rate": 2.27880658436214e-05,
840
+ "loss": 0.1997,
841
+ "step": 1060
842
+ },
843
+ {
844
+ "epoch": 2.2016460905349793,
845
+ "grad_norm": 0.14912565052509308,
846
+ "learning_rate": 2.2530864197530865e-05,
847
+ "loss": 0.284,
848
+ "step": 1070
849
+ },
850
+ {
851
+ "epoch": 2.2222222222222223,
852
+ "grad_norm": 5.384796142578125,
853
+ "learning_rate": 2.2273662551440332e-05,
854
+ "loss": 0.299,
855
+ "step": 1080
856
+ },
857
+ {
858
+ "epoch": 2.242798353909465,
859
+ "grad_norm": 5.962587356567383,
860
+ "learning_rate": 2.2016460905349796e-05,
861
+ "loss": 0.18,
862
+ "step": 1090
863
+ },
864
+ {
865
+ "epoch": 2.263374485596708,
866
+ "grad_norm": 0.7156618237495422,
867
+ "learning_rate": 2.175925925925926e-05,
868
+ "loss": 0.1559,
869
+ "step": 1100
870
+ },
871
+ {
872
+ "epoch": 2.263374485596708,
873
+ "eval_accuracy": 0.934402332361516,
874
+ "eval_loss": 0.2332872599363327,
875
+ "eval_runtime": 5.7482,
876
+ "eval_samples_per_second": 119.342,
877
+ "eval_steps_per_second": 14.961,
878
+ "step": 1100
879
+ },
880
+ {
881
+ "epoch": 2.2839506172839505,
882
+ "grad_norm": 6.270691394805908,
883
+ "learning_rate": 2.152777777777778e-05,
884
+ "loss": 0.2359,
885
+ "step": 1110
886
+ },
887
+ {
888
+ "epoch": 2.3045267489711936,
889
+ "grad_norm": 2.6105377674102783,
890
+ "learning_rate": 2.1270576131687244e-05,
891
+ "loss": 0.3313,
892
+ "step": 1120
893
+ },
894
+ {
895
+ "epoch": 2.325102880658436,
896
+ "grad_norm": 0.20199181139469147,
897
+ "learning_rate": 2.101337448559671e-05,
898
+ "loss": 0.1454,
899
+ "step": 1130
900
+ },
901
+ {
902
+ "epoch": 2.3456790123456788,
903
+ "grad_norm": 11.37586498260498,
904
+ "learning_rate": 2.0756172839506175e-05,
905
+ "loss": 0.4299,
906
+ "step": 1140
907
+ },
908
+ {
909
+ "epoch": 2.366255144032922,
910
+ "grad_norm": 0.4003171920776367,
911
+ "learning_rate": 2.049897119341564e-05,
912
+ "loss": 0.1871,
913
+ "step": 1150
914
+ },
915
+ {
916
+ "epoch": 2.386831275720165,
917
+ "grad_norm": 0.5666722059249878,
918
+ "learning_rate": 2.0241769547325103e-05,
919
+ "loss": 0.3776,
920
+ "step": 1160
921
+ },
922
+ {
923
+ "epoch": 2.4074074074074074,
924
+ "grad_norm": 13.603349685668945,
925
+ "learning_rate": 1.998456790123457e-05,
926
+ "loss": 0.3819,
927
+ "step": 1170
928
+ },
929
+ {
930
+ "epoch": 2.42798353909465,
931
+ "grad_norm": 0.16143706440925598,
932
+ "learning_rate": 1.9727366255144034e-05,
933
+ "loss": 0.4033,
934
+ "step": 1180
935
+ },
936
+ {
937
+ "epoch": 2.448559670781893,
938
+ "grad_norm": 5.120613098144531,
939
+ "learning_rate": 1.9470164609053498e-05,
940
+ "loss": 0.2198,
941
+ "step": 1190
942
+ },
943
+ {
944
+ "epoch": 2.4691358024691357,
945
+ "grad_norm": 2.5423364639282227,
946
+ "learning_rate": 1.9212962962962962e-05,
947
+ "loss": 0.351,
948
+ "step": 1200
949
+ },
950
+ {
951
+ "epoch": 2.4691358024691357,
952
+ "eval_accuracy": 0.9358600583090378,
953
+ "eval_loss": 0.24949392676353455,
954
+ "eval_runtime": 5.7687,
955
+ "eval_samples_per_second": 118.918,
956
+ "eval_steps_per_second": 14.908,
957
+ "step": 1200
958
+ },
959
+ {
960
+ "epoch": 2.4897119341563787,
961
+ "grad_norm": 0.5970525741577148,
962
+ "learning_rate": 1.895576131687243e-05,
963
+ "loss": 0.0758,
964
+ "step": 1210
965
+ },
966
+ {
967
+ "epoch": 2.5102880658436213,
968
+ "grad_norm": 0.23254498839378357,
969
+ "learning_rate": 1.8698559670781893e-05,
970
+ "loss": 0.4038,
971
+ "step": 1220
972
+ },
973
+ {
974
+ "epoch": 2.5308641975308643,
975
+ "grad_norm": 0.7684284448623657,
976
+ "learning_rate": 1.8441358024691357e-05,
977
+ "loss": 0.3073,
978
+ "step": 1230
979
+ },
980
+ {
981
+ "epoch": 2.551440329218107,
982
+ "grad_norm": 2.35357403755188,
983
+ "learning_rate": 1.8184156378600824e-05,
984
+ "loss": 0.2705,
985
+ "step": 1240
986
+ },
987
+ {
988
+ "epoch": 2.57201646090535,
989
+ "grad_norm": 15.403181076049805,
990
+ "learning_rate": 1.792695473251029e-05,
991
+ "loss": 0.3099,
992
+ "step": 1250
993
+ },
994
+ {
995
+ "epoch": 2.5925925925925926,
996
+ "grad_norm": 9.536227226257324,
997
+ "learning_rate": 1.7669753086419756e-05,
998
+ "loss": 0.1969,
999
+ "step": 1260
1000
+ },
1001
+ {
1002
+ "epoch": 2.613168724279835,
1003
+ "grad_norm": 0.228355273604393,
1004
+ "learning_rate": 1.741255144032922e-05,
1005
+ "loss": 0.2381,
1006
+ "step": 1270
1007
+ },
1008
+ {
1009
+ "epoch": 2.633744855967078,
1010
+ "grad_norm": 7.860360145568848,
1011
+ "learning_rate": 1.7155349794238684e-05,
1012
+ "loss": 0.3721,
1013
+ "step": 1280
1014
+ },
1015
+ {
1016
+ "epoch": 2.6543209876543212,
1017
+ "grad_norm": 13.305447578430176,
1018
+ "learning_rate": 1.6898148148148148e-05,
1019
+ "loss": 0.2834,
1020
+ "step": 1290
1021
+ },
1022
+ {
1023
+ "epoch": 2.674897119341564,
1024
+ "grad_norm": 0.13963976502418518,
1025
+ "learning_rate": 1.6640946502057615e-05,
1026
+ "loss": 0.2206,
1027
+ "step": 1300
1028
+ },
1029
+ {
1030
+ "epoch": 2.674897119341564,
1031
+ "eval_accuracy": 0.924198250728863,
1032
+ "eval_loss": 0.26219838857650757,
1033
+ "eval_runtime": 5.6243,
1034
+ "eval_samples_per_second": 121.97,
1035
+ "eval_steps_per_second": 15.291,
1036
+ "step": 1300
1037
+ },
1038
+ {
1039
+ "epoch": 2.6954732510288064,
1040
+ "grad_norm": 7.862311840057373,
1041
+ "learning_rate": 1.638374485596708e-05,
1042
+ "loss": 0.2344,
1043
+ "step": 1310
1044
+ },
1045
+ {
1046
+ "epoch": 2.7160493827160495,
1047
+ "grad_norm": 2.6695802211761475,
1048
+ "learning_rate": 1.6126543209876543e-05,
1049
+ "loss": 0.3519,
1050
+ "step": 1320
1051
+ },
1052
+ {
1053
+ "epoch": 2.736625514403292,
1054
+ "grad_norm": 1.0053679943084717,
1055
+ "learning_rate": 1.5869341563786007e-05,
1056
+ "loss": 0.1638,
1057
+ "step": 1330
1058
+ },
1059
+ {
1060
+ "epoch": 2.757201646090535,
1061
+ "grad_norm": 9.059231758117676,
1062
+ "learning_rate": 1.5612139917695474e-05,
1063
+ "loss": 0.1577,
1064
+ "step": 1340
1065
+ },
1066
+ {
1067
+ "epoch": 2.7777777777777777,
1068
+ "grad_norm": 6.931950569152832,
1069
+ "learning_rate": 1.5354938271604938e-05,
1070
+ "loss": 0.1858,
1071
+ "step": 1350
1072
+ },
1073
+ {
1074
+ "epoch": 2.7983539094650207,
1075
+ "grad_norm": 2.1699206829071045,
1076
+ "learning_rate": 1.5097736625514405e-05,
1077
+ "loss": 0.1309,
1078
+ "step": 1360
1079
+ },
1080
+ {
1081
+ "epoch": 2.8189300411522633,
1082
+ "grad_norm": 2.3047924041748047,
1083
+ "learning_rate": 1.4840534979423871e-05,
1084
+ "loss": 0.1979,
1085
+ "step": 1370
1086
+ },
1087
+ {
1088
+ "epoch": 2.8395061728395063,
1089
+ "grad_norm": 6.254756927490234,
1090
+ "learning_rate": 1.4583333333333335e-05,
1091
+ "loss": 0.1391,
1092
+ "step": 1380
1093
+ },
1094
+ {
1095
+ "epoch": 2.860082304526749,
1096
+ "grad_norm": 20.224390029907227,
1097
+ "learning_rate": 1.43261316872428e-05,
1098
+ "loss": 0.1719,
1099
+ "step": 1390
1100
+ },
1101
+ {
1102
+ "epoch": 2.8806584362139915,
1103
+ "grad_norm": 17.755252838134766,
1104
+ "learning_rate": 1.4068930041152264e-05,
1105
+ "loss": 0.3814,
1106
+ "step": 1400
1107
+ },
1108
+ {
1109
+ "epoch": 2.8806584362139915,
1110
+ "eval_accuracy": 0.9154518950437318,
1111
+ "eval_loss": 0.31383997201919556,
1112
+ "eval_runtime": 5.7706,
1113
+ "eval_samples_per_second": 118.878,
1114
+ "eval_steps_per_second": 14.903,
1115
+ "step": 1400
1116
+ },
1117
+ {
1118
+ "epoch": 2.9012345679012346,
1119
+ "grad_norm": 9.217977523803711,
1120
+ "learning_rate": 1.381172839506173e-05,
1121
+ "loss": 0.4575,
1122
+ "step": 1410
1123
+ },
1124
+ {
1125
+ "epoch": 2.9218106995884776,
1126
+ "grad_norm": 13.919651985168457,
1127
+ "learning_rate": 1.3554526748971194e-05,
1128
+ "loss": 0.2728,
1129
+ "step": 1420
1130
+ },
1131
+ {
1132
+ "epoch": 2.94238683127572,
1133
+ "grad_norm": 1.9513192176818848,
1134
+ "learning_rate": 1.329732510288066e-05,
1135
+ "loss": 0.1897,
1136
+ "step": 1430
1137
+ },
1138
+ {
1139
+ "epoch": 2.962962962962963,
1140
+ "grad_norm": 0.12064926326274872,
1141
+ "learning_rate": 1.3040123456790124e-05,
1142
+ "loss": 0.1415,
1143
+ "step": 1440
1144
+ },
1145
+ {
1146
+ "epoch": 2.983539094650206,
1147
+ "grad_norm": 0.1655590832233429,
1148
+ "learning_rate": 1.278292181069959e-05,
1149
+ "loss": 0.1414,
1150
+ "step": 1450
1151
+ },
1152
+ {
1153
+ "epoch": 3.0041152263374484,
1154
+ "grad_norm": 10.342666625976562,
1155
+ "learning_rate": 1.2525720164609053e-05,
1156
+ "loss": 0.2966,
1157
+ "step": 1460
1158
+ },
1159
+ {
1160
+ "epoch": 3.0246913580246915,
1161
+ "grad_norm": 0.5797888040542603,
1162
+ "learning_rate": 1.2268518518518519e-05,
1163
+ "loss": 0.1845,
1164
+ "step": 1470
1165
+ },
1166
+ {
1167
+ "epoch": 3.045267489711934,
1168
+ "grad_norm": 7.389481544494629,
1169
+ "learning_rate": 1.2011316872427984e-05,
1170
+ "loss": 0.2512,
1171
+ "step": 1480
1172
+ },
1173
+ {
1174
+ "epoch": 3.065843621399177,
1175
+ "grad_norm": 0.2333269566297531,
1176
+ "learning_rate": 1.1754115226337448e-05,
1177
+ "loss": 0.4019,
1178
+ "step": 1490
1179
+ },
1180
+ {
1181
+ "epoch": 3.0864197530864197,
1182
+ "grad_norm": 4.2674241065979,
1183
+ "learning_rate": 1.1496913580246914e-05,
1184
+ "loss": 0.2141,
1185
+ "step": 1500
1186
+ },
1187
+ {
1188
+ "epoch": 3.0864197530864197,
1189
+ "eval_accuracy": 0.9314868804664723,
1190
+ "eval_loss": 0.261305034160614,
1191
+ "eval_runtime": 5.7707,
1192
+ "eval_samples_per_second": 118.877,
1193
+ "eval_steps_per_second": 14.903,
1194
+ "step": 1500
1195
+ },
1196
+ {
1197
+ "epoch": 3.1069958847736627,
1198
+ "grad_norm": 0.2830299437046051,
1199
+ "learning_rate": 1.1239711934156378e-05,
1200
+ "loss": 0.2904,
1201
+ "step": 1510
1202
+ },
1203
+ {
1204
+ "epoch": 3.1275720164609053,
1205
+ "grad_norm": 0.17907337844371796,
1206
+ "learning_rate": 1.0982510288065845e-05,
1207
+ "loss": 0.1494,
1208
+ "step": 1520
1209
+ },
1210
+ {
1211
+ "epoch": 3.148148148148148,
1212
+ "grad_norm": 9.283103942871094,
1213
+ "learning_rate": 1.072530864197531e-05,
1214
+ "loss": 0.0753,
1215
+ "step": 1530
1216
+ },
1217
+ {
1218
+ "epoch": 3.168724279835391,
1219
+ "grad_norm": 0.2662203907966614,
1220
+ "learning_rate": 1.0468106995884775e-05,
1221
+ "loss": 0.2067,
1222
+ "step": 1540
1223
+ },
1224
+ {
1225
+ "epoch": 3.1893004115226335,
1226
+ "grad_norm": 6.983056545257568,
1227
+ "learning_rate": 1.0210905349794239e-05,
1228
+ "loss": 0.173,
1229
+ "step": 1550
1230
+ },
1231
+ {
1232
+ "epoch": 3.2098765432098766,
1233
+ "grad_norm": 4.426745414733887,
1234
+ "learning_rate": 9.953703703703704e-06,
1235
+ "loss": 0.2352,
1236
+ "step": 1560
1237
+ },
1238
+ {
1239
+ "epoch": 3.230452674897119,
1240
+ "grad_norm": 2.4332549571990967,
1241
+ "learning_rate": 9.696502057613168e-06,
1242
+ "loss": 0.1356,
1243
+ "step": 1570
1244
+ },
1245
+ {
1246
+ "epoch": 3.251028806584362,
1247
+ "grad_norm": 0.45729583501815796,
1248
+ "learning_rate": 9.439300411522634e-06,
1249
+ "loss": 0.3083,
1250
+ "step": 1580
1251
+ },
1252
+ {
1253
+ "epoch": 3.271604938271605,
1254
+ "grad_norm": 0.12547095119953156,
1255
+ "learning_rate": 9.1820987654321e-06,
1256
+ "loss": 0.1432,
1257
+ "step": 1590
1258
+ },
1259
+ {
1260
+ "epoch": 3.292181069958848,
1261
+ "grad_norm": 0.8154440522193909,
1262
+ "learning_rate": 8.924897119341565e-06,
1263
+ "loss": 0.112,
1264
+ "step": 1600
1265
+ },
1266
+ {
1267
+ "epoch": 3.292181069958848,
1268
+ "eval_accuracy": 0.9402332361516035,
1269
+ "eval_loss": 0.22655406594276428,
1270
+ "eval_runtime": 5.8164,
1271
+ "eval_samples_per_second": 117.942,
1272
+ "eval_steps_per_second": 14.786,
1273
+ "step": 1600
1274
+ },
1275
+ {
1276
+ "epoch": 3.3127572016460904,
1277
+ "grad_norm": 0.14023926854133606,
1278
+ "learning_rate": 8.66769547325103e-06,
1279
+ "loss": 0.1534,
1280
+ "step": 1610
1281
+ },
1282
+ {
1283
+ "epoch": 3.3333333333333335,
1284
+ "grad_norm": 6.381242275238037,
1285
+ "learning_rate": 8.410493827160495e-06,
1286
+ "loss": 0.1949,
1287
+ "step": 1620
1288
+ },
1289
+ {
1290
+ "epoch": 3.353909465020576,
1291
+ "grad_norm": 0.39883196353912354,
1292
+ "learning_rate": 8.153292181069959e-06,
1293
+ "loss": 0.1436,
1294
+ "step": 1630
1295
+ },
1296
+ {
1297
+ "epoch": 3.374485596707819,
1298
+ "grad_norm": 3.331747055053711,
1299
+ "learning_rate": 7.896090534979424e-06,
1300
+ "loss": 0.2292,
1301
+ "step": 1640
1302
+ },
1303
+ {
1304
+ "epoch": 3.3950617283950617,
1305
+ "grad_norm": 0.16310296952724457,
1306
+ "learning_rate": 7.63888888888889e-06,
1307
+ "loss": 0.1053,
1308
+ "step": 1650
1309
+ },
1310
+ {
1311
+ "epoch": 3.4156378600823043,
1312
+ "grad_norm": 13.472603797912598,
1313
+ "learning_rate": 7.381687242798355e-06,
1314
+ "loss": 0.1804,
1315
+ "step": 1660
1316
+ },
1317
+ {
1318
+ "epoch": 3.4362139917695473,
1319
+ "grad_norm": 5.14309549331665,
1320
+ "learning_rate": 7.12448559670782e-06,
1321
+ "loss": 0.1998,
1322
+ "step": 1670
1323
+ },
1324
+ {
1325
+ "epoch": 3.45679012345679,
1326
+ "grad_norm": 6.7929511070251465,
1327
+ "learning_rate": 6.8672839506172845e-06,
1328
+ "loss": 0.2037,
1329
+ "step": 1680
1330
+ },
1331
+ {
1332
+ "epoch": 3.477366255144033,
1333
+ "grad_norm": 1.212302327156067,
1334
+ "learning_rate": 6.635802469135803e-06,
1335
+ "loss": 0.0803,
1336
+ "step": 1690
1337
+ },
1338
+ {
1339
+ "epoch": 3.4979423868312756,
1340
+ "grad_norm": 0.1481838822364807,
1341
+ "learning_rate": 6.3786008230452675e-06,
1342
+ "loss": 0.0631,
1343
+ "step": 1700
1344
+ },
1345
+ {
1346
+ "epoch": 3.4979423868312756,
1347
+ "eval_accuracy": 0.9402332361516035,
1348
+ "eval_loss": 0.22551974654197693,
1349
+ "eval_runtime": 5.7428,
1350
+ "eval_samples_per_second": 119.454,
1351
+ "eval_steps_per_second": 14.975,
1352
+ "step": 1700
1353
+ },
1354
+ {
1355
+ "epoch": 3.5185185185185186,
1356
+ "grad_norm": 0.26120343804359436,
1357
+ "learning_rate": 6.121399176954733e-06,
1358
+ "loss": 0.3273,
1359
+ "step": 1710
1360
+ },
1361
+ {
1362
+ "epoch": 3.539094650205761,
1363
+ "grad_norm": 11.670561790466309,
1364
+ "learning_rate": 5.864197530864198e-06,
1365
+ "loss": 0.2464,
1366
+ "step": 1720
1367
+ },
1368
+ {
1369
+ "epoch": 3.5596707818930042,
1370
+ "grad_norm": 3.529303550720215,
1371
+ "learning_rate": 5.606995884773663e-06,
1372
+ "loss": 0.2335,
1373
+ "step": 1730
1374
+ },
1375
+ {
1376
+ "epoch": 3.580246913580247,
1377
+ "grad_norm": 5.341615200042725,
1378
+ "learning_rate": 5.3497942386831275e-06,
1379
+ "loss": 0.1774,
1380
+ "step": 1740
1381
+ },
1382
+ {
1383
+ "epoch": 3.60082304526749,
1384
+ "grad_norm": 11.935771942138672,
1385
+ "learning_rate": 5.092592592592592e-06,
1386
+ "loss": 0.1993,
1387
+ "step": 1750
1388
+ },
1389
+ {
1390
+ "epoch": 3.6213991769547325,
1391
+ "grad_norm": 7.190194606781006,
1392
+ "learning_rate": 4.835390946502058e-06,
1393
+ "loss": 0.2104,
1394
+ "step": 1760
1395
+ },
1396
+ {
1397
+ "epoch": 3.6419753086419755,
1398
+ "grad_norm": 0.41313230991363525,
1399
+ "learning_rate": 4.578189300411523e-06,
1400
+ "loss": 0.193,
1401
+ "step": 1770
1402
+ },
1403
+ {
1404
+ "epoch": 3.662551440329218,
1405
+ "grad_norm": 0.33850809931755066,
1406
+ "learning_rate": 4.3209876543209875e-06,
1407
+ "loss": 0.2527,
1408
+ "step": 1780
1409
+ },
1410
+ {
1411
+ "epoch": 3.6831275720164607,
1412
+ "grad_norm": 7.03388786315918,
1413
+ "learning_rate": 4.063786008230453e-06,
1414
+ "loss": 0.2271,
1415
+ "step": 1790
1416
+ },
1417
+ {
1418
+ "epoch": 3.7037037037037037,
1419
+ "grad_norm": 1.179168939590454,
1420
+ "learning_rate": 3.806584362139918e-06,
1421
+ "loss": 0.1986,
1422
+ "step": 1800
1423
+ },
1424
+ {
1425
+ "epoch": 3.7037037037037037,
1426
+ "eval_accuracy": 0.9416909620991254,
1427
+ "eval_loss": 0.22252099215984344,
1428
+ "eval_runtime": 5.7883,
1429
+ "eval_samples_per_second": 118.516,
1430
+ "eval_steps_per_second": 14.858,
1431
+ "step": 1800
1432
+ },
1433
+ {
1434
+ "epoch": 3.7242798353909468,
1435
+ "grad_norm": 5.670774459838867,
1436
+ "learning_rate": 3.5493827160493827e-06,
1437
+ "loss": 0.1168,
1438
+ "step": 1810
1439
+ },
1440
+ {
1441
+ "epoch": 3.7448559670781894,
1442
+ "grad_norm": 2.660778045654297,
1443
+ "learning_rate": 3.2921810699588483e-06,
1444
+ "loss": 0.1296,
1445
+ "step": 1820
1446
+ },
1447
+ {
1448
+ "epoch": 3.765432098765432,
1449
+ "grad_norm": 5.741485118865967,
1450
+ "learning_rate": 3.034979423868313e-06,
1451
+ "loss": 0.222,
1452
+ "step": 1830
1453
+ },
1454
+ {
1455
+ "epoch": 3.786008230452675,
1456
+ "grad_norm": 18.328266143798828,
1457
+ "learning_rate": 2.777777777777778e-06,
1458
+ "loss": 0.1688,
1459
+ "step": 1840
1460
+ },
1461
+ {
1462
+ "epoch": 3.8065843621399176,
1463
+ "grad_norm": 13.7136869430542,
1464
+ "learning_rate": 2.5205761316872427e-06,
1465
+ "loss": 0.2308,
1466
+ "step": 1850
1467
+ },
1468
+ {
1469
+ "epoch": 3.8271604938271606,
1470
+ "grad_norm": 7.43013858795166,
1471
+ "learning_rate": 2.263374485596708e-06,
1472
+ "loss": 0.1076,
1473
+ "step": 1860
1474
+ },
1475
+ {
1476
+ "epoch": 3.847736625514403,
1477
+ "grad_norm": 0.5525858402252197,
1478
+ "learning_rate": 2.0061728395061727e-06,
1479
+ "loss": 0.2191,
1480
+ "step": 1870
1481
+ },
1482
+ {
1483
+ "epoch": 3.8683127572016462,
1484
+ "grad_norm": 17.919668197631836,
1485
+ "learning_rate": 1.748971193415638e-06,
1486
+ "loss": 0.3505,
1487
+ "step": 1880
1488
+ },
1489
+ {
1490
+ "epoch": 3.888888888888889,
1491
+ "grad_norm": 0.1572398841381073,
1492
+ "learning_rate": 1.491769547325103e-06,
1493
+ "loss": 0.1558,
1494
+ "step": 1890
1495
+ },
1496
+ {
1497
+ "epoch": 3.909465020576132,
1498
+ "grad_norm": 16.84402084350586,
1499
+ "learning_rate": 1.234567901234568e-06,
1500
+ "loss": 0.2345,
1501
+ "step": 1900
1502
+ },
1503
+ {
1504
+ "epoch": 3.909465020576132,
1505
+ "eval_accuracy": 0.9373177842565598,
1506
+ "eval_loss": 0.2235153466463089,
1507
+ "eval_runtime": 5.8914,
1508
+ "eval_samples_per_second": 116.44,
1509
+ "eval_steps_per_second": 14.597,
1510
+ "step": 1900
1511
+ },
1512
+ {
1513
+ "epoch": 3.9300411522633745,
1514
+ "grad_norm": 8.612730026245117,
1515
+ "learning_rate": 9.773662551440331e-07,
1516
+ "loss": 0.2431,
1517
+ "step": 1910
1518
+ },
1519
+ {
1520
+ "epoch": 3.950617283950617,
1521
+ "grad_norm": 14.113691329956055,
1522
+ "learning_rate": 7.20164609053498e-07,
1523
+ "loss": 0.1582,
1524
+ "step": 1920
1525
+ },
1526
+ {
1527
+ "epoch": 3.97119341563786,
1528
+ "grad_norm": 18.373720169067383,
1529
+ "learning_rate": 4.6296296296296297e-07,
1530
+ "loss": 0.3902,
1531
+ "step": 1930
1532
+ },
1533
+ {
1534
+ "epoch": 3.991769547325103,
1535
+ "grad_norm": 0.15571478009223938,
1536
+ "learning_rate": 2.0576131687242802e-07,
1537
+ "loss": 0.1868,
1538
+ "step": 1940
1539
+ },
1540
+ {
1541
+ "epoch": 4.0,
1542
+ "step": 1944,
1543
+ "total_flos": 1.2048994477712425e+18,
1544
+ "train_loss": 0.3544519797212793,
1545
+ "train_runtime": 505.606,
1546
+ "train_samples_per_second": 30.751,
1547
+ "train_steps_per_second": 3.845
1548
+ }
1549
+ ],
1550
+ "logging_steps": 10,
1551
+ "max_steps": 1944,
1552
+ "num_input_tokens_seen": 0,
1553
+ "num_train_epochs": 4,
1554
+ "save_steps": 100,
1555
+ "total_flos": 1.2048994477712425e+18,
1556
+ "train_batch_size": 8,
1557
+ "trial_name": null,
1558
+ "trial_params": null
1559
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d007e07ddc3a1e8e5651fde477bf71022d14fbea752379650dc256c5d0cf134a
3
+ size 4984