Anders L|hr commited on
Commit
d4b56e8
·
1 Parent(s): a196bef

Best 5 class model

Browse files
HuggingfaceBest5ClassModel ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit a196bef4dd25c395f33565fcc21a540eae728a16
checkpoint-5432/HuggingfaceBest5ClassModel ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit a196bef4dd25c395f33565fcc21a540eae728a16
checkpoint-5432/config.json ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "anderloh/Hugginhface-master-wav2vec-pretreined-5-class-train-test",
3
+ "activation_dropout": 0.0,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForSequenceClassification"
11
+ ],
12
+ "attention_dropout": 0.0,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 128,
15
+ "codevector_dim": 128,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": true,
18
+ "conv_dim": [
19
+ 256,
20
+ 256,
21
+ 256,
22
+ 256,
23
+ 256,
24
+ 256,
25
+ 256
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "sum",
46
+ "ctc_zero_infinity": false,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": true,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_dropout": 0.0,
52
+ "feat_extract_norm": "layer",
53
+ "feat_proj_dropout": 0.0,
54
+ "feat_quantizer_dropout": 0.0,
55
+ "final_dropout": 0.0,
56
+ "finetuning_task": "audio-classification",
57
+ "hidden_act": "gelu",
58
+ "hidden_dropout": 0.0,
59
+ "hidden_dropout_prob": 0.0,
60
+ "hidden_size": 384,
61
+ "id2label": {
62
+ "0": "Helicopter",
63
+ "1": "Jet",
64
+ "2": "Racecar",
65
+ "3": "Rail",
66
+ "4": "Truck"
67
+ },
68
+ "initializer_range": 0.02,
69
+ "intermediate_size": 1536,
70
+ "label2id": {
71
+ "Helicopter": "0",
72
+ "Jet": "1",
73
+ "Racecar": "2",
74
+ "Rail": "3",
75
+ "Truck": "4"
76
+ },
77
+ "layer_norm_eps": 1e-05,
78
+ "layerdrop": 0.0,
79
+ "mask_feature_length": 10,
80
+ "mask_feature_min_masks": 0,
81
+ "mask_feature_prob": 0.0,
82
+ "mask_time_length": 10,
83
+ "mask_time_min_masks": 2,
84
+ "mask_time_prob": 0.65,
85
+ "model_type": "wav2vec2",
86
+ "num_adapter_layers": 3,
87
+ "num_attention_heads": 6,
88
+ "num_codevector_groups": 2,
89
+ "num_codevectors_per_group": 320,
90
+ "num_conv_pos_embedding_groups": 16,
91
+ "num_conv_pos_embeddings": 128,
92
+ "num_feat_extract_layers": 7,
93
+ "num_hidden_layers": 6,
94
+ "num_negatives": 100,
95
+ "output_hidden_size": 384,
96
+ "pad_token_id": 0,
97
+ "proj_codevector_dim": 128,
98
+ "tdnn_dilation": [
99
+ 1,
100
+ 2,
101
+ 3,
102
+ 1,
103
+ 1
104
+ ],
105
+ "tdnn_dim": [
106
+ 512,
107
+ 512,
108
+ 512,
109
+ 512,
110
+ 1500
111
+ ],
112
+ "tdnn_kernel": [
113
+ 5,
114
+ 3,
115
+ 3,
116
+ 1,
117
+ 1
118
+ ],
119
+ "torch_dtype": "float32",
120
+ "transformers_version": "4.39.0.dev0",
121
+ "use_weighted_layer_sum": false,
122
+ "vocab_size": 32,
123
+ "xvector_output_dim": 512
124
+ }
checkpoint-5432/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12b72afca351d5838b740f0ec6003c2e1e2a8c0f5156e6629ad3e2ef735bb540
3
+ size 52151348
checkpoint-5432/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd83ae850a6f38e955a9a0ecf3728e21f005733ae6ae7f948544118441a4714b
3
+ size 95909946
checkpoint-5432/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
checkpoint-5432/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0524d4c5bb50cb3a888e246202453f6e7f310c8e7d978c2791811f64716d2d2c
3
+ size 14244
checkpoint-5432/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66b1ff4c92709ee514d60150ad1c67a13e001dac71642548c74f443c1156d358
3
+ size 1064
checkpoint-5432/trainer_state.json ADDED
@@ -0,0 +1,2793 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7937062937062938,
3
+ "best_model_checkpoint": "wav2vec2-5Class-train-test-finetune/checkpoint-4122",
4
+ "epoch": 224.0,
5
+ "eval_steps": 500,
6
+ "global_step": 5432,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.99,
13
+ "eval_accuracy": 0.34265734265734266,
14
+ "eval_loss": 1.5984586477279663,
15
+ "eval_runtime": 5.3437,
16
+ "eval_samples_per_second": 53.521,
17
+ "eval_steps_per_second": 3.368,
18
+ "step": 24
19
+ },
20
+ {
21
+ "epoch": 1.98,
22
+ "eval_accuracy": 0.33916083916083917,
23
+ "eval_loss": 1.5969289541244507,
24
+ "eval_runtime": 3.8653,
25
+ "eval_samples_per_second": 73.992,
26
+ "eval_steps_per_second": 4.657,
27
+ "step": 48
28
+ },
29
+ {
30
+ "epoch": 2.06,
31
+ "grad_norm": 1.0544973611831665,
32
+ "learning_rate": 2.4999999999999998e-06,
33
+ "loss": 1.5969,
34
+ "step": 50
35
+ },
36
+ {
37
+ "epoch": 2.97,
38
+ "eval_accuracy": 0.32867132867132864,
39
+ "eval_loss": 1.5943816900253296,
40
+ "eval_runtime": 6.1748,
41
+ "eval_samples_per_second": 46.317,
42
+ "eval_steps_per_second": 2.915,
43
+ "step": 72
44
+ },
45
+ {
46
+ "epoch": 4.0,
47
+ "eval_accuracy": 0.3146853146853147,
48
+ "eval_loss": 1.5906767845153809,
49
+ "eval_runtime": 5.1678,
50
+ "eval_samples_per_second": 55.343,
51
+ "eval_steps_per_second": 3.483,
52
+ "step": 97
53
+ },
54
+ {
55
+ "epoch": 4.12,
56
+ "grad_norm": 0.8443157076835632,
57
+ "learning_rate": 4.9999999999999996e-06,
58
+ "loss": 1.5896,
59
+ "step": 100
60
+ },
61
+ {
62
+ "epoch": 4.99,
63
+ "eval_accuracy": 0.2972027972027972,
64
+ "eval_loss": 1.5860023498535156,
65
+ "eval_runtime": 4.9416,
66
+ "eval_samples_per_second": 57.876,
67
+ "eval_steps_per_second": 3.643,
68
+ "step": 121
69
+ },
70
+ {
71
+ "epoch": 5.98,
72
+ "eval_accuracy": 0.2692307692307692,
73
+ "eval_loss": 1.5806005001068115,
74
+ "eval_runtime": 4.1837,
75
+ "eval_samples_per_second": 68.36,
76
+ "eval_steps_per_second": 4.302,
77
+ "step": 145
78
+ },
79
+ {
80
+ "epoch": 6.19,
81
+ "grad_norm": 1.0938074588775635,
82
+ "learning_rate": 7.5e-06,
83
+ "loss": 1.5743,
84
+ "step": 150
85
+ },
86
+ {
87
+ "epoch": 6.97,
88
+ "eval_accuracy": 0.25874125874125875,
89
+ "eval_loss": 1.5742768049240112,
90
+ "eval_runtime": 7.1914,
91
+ "eval_samples_per_second": 39.77,
92
+ "eval_steps_per_second": 2.503,
93
+ "step": 169
94
+ },
95
+ {
96
+ "epoch": 8.0,
97
+ "eval_accuracy": 0.23426573426573427,
98
+ "eval_loss": 1.5664165019989014,
99
+ "eval_runtime": 5.6489,
100
+ "eval_samples_per_second": 50.629,
101
+ "eval_steps_per_second": 3.186,
102
+ "step": 194
103
+ },
104
+ {
105
+ "epoch": 8.25,
106
+ "grad_norm": 0.9692079424858093,
107
+ "learning_rate": 9.999999999999999e-06,
108
+ "loss": 1.5508,
109
+ "step": 200
110
+ },
111
+ {
112
+ "epoch": 8.99,
113
+ "eval_accuracy": 0.22727272727272727,
114
+ "eval_loss": 1.557572841644287,
115
+ "eval_runtime": 5.5182,
116
+ "eval_samples_per_second": 51.828,
117
+ "eval_steps_per_second": 3.262,
118
+ "step": 218
119
+ },
120
+ {
121
+ "epoch": 9.98,
122
+ "eval_accuracy": 0.22727272727272727,
123
+ "eval_loss": 1.5482373237609863,
124
+ "eval_runtime": 5.3205,
125
+ "eval_samples_per_second": 53.754,
126
+ "eval_steps_per_second": 3.383,
127
+ "step": 242
128
+ },
129
+ {
130
+ "epoch": 10.31,
131
+ "grad_norm": 1.02046799659729,
132
+ "learning_rate": 1.25e-05,
133
+ "loss": 1.5157,
134
+ "step": 250
135
+ },
136
+ {
137
+ "epoch": 10.97,
138
+ "eval_accuracy": 0.22727272727272727,
139
+ "eval_loss": 1.539355993270874,
140
+ "eval_runtime": 6.3116,
141
+ "eval_samples_per_second": 45.313,
142
+ "eval_steps_per_second": 2.852,
143
+ "step": 266
144
+ },
145
+ {
146
+ "epoch": 12.0,
147
+ "eval_accuracy": 0.22727272727272727,
148
+ "eval_loss": 1.5350520610809326,
149
+ "eval_runtime": 4.3422,
150
+ "eval_samples_per_second": 65.865,
151
+ "eval_steps_per_second": 4.145,
152
+ "step": 291
153
+ },
154
+ {
155
+ "epoch": 12.37,
156
+ "grad_norm": 1.6058833599090576,
157
+ "learning_rate": 1.5e-05,
158
+ "loss": 1.4534,
159
+ "step": 300
160
+ },
161
+ {
162
+ "epoch": 12.99,
163
+ "eval_accuracy": 0.22727272727272727,
164
+ "eval_loss": 1.5525730848312378,
165
+ "eval_runtime": 5.245,
166
+ "eval_samples_per_second": 54.528,
167
+ "eval_steps_per_second": 3.432,
168
+ "step": 315
169
+ },
170
+ {
171
+ "epoch": 13.98,
172
+ "eval_accuracy": 0.22727272727272727,
173
+ "eval_loss": 1.599926471710205,
174
+ "eval_runtime": 6.0088,
175
+ "eval_samples_per_second": 47.597,
176
+ "eval_steps_per_second": 2.996,
177
+ "step": 339
178
+ },
179
+ {
180
+ "epoch": 14.43,
181
+ "grad_norm": 0.8243080377578735,
182
+ "learning_rate": 1.7500000000000002e-05,
183
+ "loss": 1.3638,
184
+ "step": 350
185
+ },
186
+ {
187
+ "epoch": 14.97,
188
+ "eval_accuracy": 0.22727272727272727,
189
+ "eval_loss": 1.5896875858306885,
190
+ "eval_runtime": 4.8752,
191
+ "eval_samples_per_second": 58.664,
192
+ "eval_steps_per_second": 3.692,
193
+ "step": 363
194
+ },
195
+ {
196
+ "epoch": 16.0,
197
+ "eval_accuracy": 0.26573426573426573,
198
+ "eval_loss": 1.560091495513916,
199
+ "eval_runtime": 5.5082,
200
+ "eval_samples_per_second": 51.922,
201
+ "eval_steps_per_second": 3.268,
202
+ "step": 388
203
+ },
204
+ {
205
+ "epoch": 16.49,
206
+ "grad_norm": 0.7977257370948792,
207
+ "learning_rate": 1.9999999999999998e-05,
208
+ "loss": 1.2951,
209
+ "step": 400
210
+ },
211
+ {
212
+ "epoch": 16.99,
213
+ "eval_accuracy": 0.2937062937062937,
214
+ "eval_loss": 1.5349317789077759,
215
+ "eval_runtime": 4.7526,
216
+ "eval_samples_per_second": 60.178,
217
+ "eval_steps_per_second": 3.787,
218
+ "step": 412
219
+ },
220
+ {
221
+ "epoch": 17.98,
222
+ "eval_accuracy": 0.34265734265734266,
223
+ "eval_loss": 1.5053907632827759,
224
+ "eval_runtime": 4.8638,
225
+ "eval_samples_per_second": 58.801,
226
+ "eval_steps_per_second": 3.701,
227
+ "step": 436
228
+ },
229
+ {
230
+ "epoch": 18.56,
231
+ "grad_norm": 0.7064552903175354,
232
+ "learning_rate": 2.25e-05,
233
+ "loss": 1.2369,
234
+ "step": 450
235
+ },
236
+ {
237
+ "epoch": 18.97,
238
+ "eval_accuracy": 0.3741258741258741,
239
+ "eval_loss": 1.4689087867736816,
240
+ "eval_runtime": 4.3712,
241
+ "eval_samples_per_second": 65.428,
242
+ "eval_steps_per_second": 4.118,
243
+ "step": 460
244
+ },
245
+ {
246
+ "epoch": 20.0,
247
+ "eval_accuracy": 0.4370629370629371,
248
+ "eval_loss": 1.404613971710205,
249
+ "eval_runtime": 4.7203,
250
+ "eval_samples_per_second": 60.59,
251
+ "eval_steps_per_second": 3.813,
252
+ "step": 485
253
+ },
254
+ {
255
+ "epoch": 20.62,
256
+ "grad_norm": 0.598238468170166,
257
+ "learning_rate": 2.5e-05,
258
+ "loss": 1.1566,
259
+ "step": 500
260
+ },
261
+ {
262
+ "epoch": 20.99,
263
+ "eval_accuracy": 0.4405594405594406,
264
+ "eval_loss": 1.3691043853759766,
265
+ "eval_runtime": 6.6443,
266
+ "eval_samples_per_second": 43.044,
267
+ "eval_steps_per_second": 2.709,
268
+ "step": 509
269
+ },
270
+ {
271
+ "epoch": 21.98,
272
+ "eval_accuracy": 0.4825174825174825,
273
+ "eval_loss": 1.3120107650756836,
274
+ "eval_runtime": 4.9585,
275
+ "eval_samples_per_second": 57.679,
276
+ "eval_steps_per_second": 3.63,
277
+ "step": 533
278
+ },
279
+ {
280
+ "epoch": 22.68,
281
+ "grad_norm": 0.682925820350647,
282
+ "learning_rate": 2.75e-05,
283
+ "loss": 1.0676,
284
+ "step": 550
285
+ },
286
+ {
287
+ "epoch": 22.97,
288
+ "eval_accuracy": 0.486013986013986,
289
+ "eval_loss": 1.2839338779449463,
290
+ "eval_runtime": 4.0382,
291
+ "eval_samples_per_second": 70.824,
292
+ "eval_steps_per_second": 4.457,
293
+ "step": 557
294
+ },
295
+ {
296
+ "epoch": 24.0,
297
+ "eval_accuracy": 0.5104895104895105,
298
+ "eval_loss": 1.2549891471862793,
299
+ "eval_runtime": 5.1896,
300
+ "eval_samples_per_second": 55.11,
301
+ "eval_steps_per_second": 3.468,
302
+ "step": 582
303
+ },
304
+ {
305
+ "epoch": 24.74,
306
+ "grad_norm": 1.1368101835250854,
307
+ "learning_rate": 3e-05,
308
+ "loss": 0.992,
309
+ "step": 600
310
+ },
311
+ {
312
+ "epoch": 24.99,
313
+ "eval_accuracy": 0.5209790209790209,
314
+ "eval_loss": 1.2106566429138184,
315
+ "eval_runtime": 6.8941,
316
+ "eval_samples_per_second": 41.485,
317
+ "eval_steps_per_second": 2.611,
318
+ "step": 606
319
+ },
320
+ {
321
+ "epoch": 25.98,
322
+ "eval_accuracy": 0.5384615384615384,
323
+ "eval_loss": 1.1711338758468628,
324
+ "eval_runtime": 4.9707,
325
+ "eval_samples_per_second": 57.537,
326
+ "eval_steps_per_second": 3.621,
327
+ "step": 630
328
+ },
329
+ {
330
+ "epoch": 26.8,
331
+ "grad_norm": 0.9649831056594849,
332
+ "learning_rate": 2.9722222222222223e-05,
333
+ "loss": 0.9272,
334
+ "step": 650
335
+ },
336
+ {
337
+ "epoch": 26.97,
338
+ "eval_accuracy": 0.5594405594405595,
339
+ "eval_loss": 1.1318116188049316,
340
+ "eval_runtime": 5.5564,
341
+ "eval_samples_per_second": 51.472,
342
+ "eval_steps_per_second": 3.24,
343
+ "step": 654
344
+ },
345
+ {
346
+ "epoch": 28.0,
347
+ "eval_accuracy": 0.6153846153846154,
348
+ "eval_loss": 1.0594333410263062,
349
+ "eval_runtime": 4.6773,
350
+ "eval_samples_per_second": 61.147,
351
+ "eval_steps_per_second": 3.848,
352
+ "step": 679
353
+ },
354
+ {
355
+ "epoch": 28.87,
356
+ "grad_norm": 0.883937418460846,
357
+ "learning_rate": 2.9444444444444445e-05,
358
+ "loss": 0.8478,
359
+ "step": 700
360
+ },
361
+ {
362
+ "epoch": 28.99,
363
+ "eval_accuracy": 0.6013986013986014,
364
+ "eval_loss": 1.054669737815857,
365
+ "eval_runtime": 4.9219,
366
+ "eval_samples_per_second": 58.108,
367
+ "eval_steps_per_second": 3.657,
368
+ "step": 703
369
+ },
370
+ {
371
+ "epoch": 29.98,
372
+ "eval_accuracy": 0.6363636363636364,
373
+ "eval_loss": 0.9822685122489929,
374
+ "eval_runtime": 6.3133,
375
+ "eval_samples_per_second": 45.302,
376
+ "eval_steps_per_second": 2.851,
377
+ "step": 727
378
+ },
379
+ {
380
+ "epoch": 30.93,
381
+ "grad_norm": 1.3742878437042236,
382
+ "learning_rate": 2.9166666666666666e-05,
383
+ "loss": 0.7627,
384
+ "step": 750
385
+ },
386
+ {
387
+ "epoch": 30.97,
388
+ "eval_accuracy": 0.6398601398601399,
389
+ "eval_loss": 1.00295090675354,
390
+ "eval_runtime": 6.154,
391
+ "eval_samples_per_second": 46.473,
392
+ "eval_steps_per_second": 2.925,
393
+ "step": 751
394
+ },
395
+ {
396
+ "epoch": 32.0,
397
+ "eval_accuracy": 0.6608391608391608,
398
+ "eval_loss": 0.930969774723053,
399
+ "eval_runtime": 5.6747,
400
+ "eval_samples_per_second": 50.399,
401
+ "eval_steps_per_second": 3.172,
402
+ "step": 776
403
+ },
404
+ {
405
+ "epoch": 32.99,
406
+ "grad_norm": 1.329268217086792,
407
+ "learning_rate": 2.8888888888888888e-05,
408
+ "loss": 0.7266,
409
+ "step": 800
410
+ },
411
+ {
412
+ "epoch": 32.99,
413
+ "eval_accuracy": 0.6678321678321678,
414
+ "eval_loss": 0.9228739738464355,
415
+ "eval_runtime": 5.382,
416
+ "eval_samples_per_second": 53.14,
417
+ "eval_steps_per_second": 3.344,
418
+ "step": 800
419
+ },
420
+ {
421
+ "epoch": 33.98,
422
+ "eval_accuracy": 0.6958041958041958,
423
+ "eval_loss": 0.8684509992599487,
424
+ "eval_runtime": 4.8497,
425
+ "eval_samples_per_second": 58.973,
426
+ "eval_steps_per_second": 3.712,
427
+ "step": 824
428
+ },
429
+ {
430
+ "epoch": 34.97,
431
+ "eval_accuracy": 0.6643356643356644,
432
+ "eval_loss": 0.8954732418060303,
433
+ "eval_runtime": 5.2083,
434
+ "eval_samples_per_second": 54.912,
435
+ "eval_steps_per_second": 3.456,
436
+ "step": 848
437
+ },
438
+ {
439
+ "epoch": 35.05,
440
+ "grad_norm": 1.3892701864242554,
441
+ "learning_rate": 2.8611111111111113e-05,
442
+ "loss": 0.6906,
443
+ "step": 850
444
+ },
445
+ {
446
+ "epoch": 36.0,
447
+ "eval_accuracy": 0.6713286713286714,
448
+ "eval_loss": 0.9125654101371765,
449
+ "eval_runtime": 5.3068,
450
+ "eval_samples_per_second": 53.894,
451
+ "eval_steps_per_second": 3.392,
452
+ "step": 873
453
+ },
454
+ {
455
+ "epoch": 36.99,
456
+ "eval_accuracy": 0.6923076923076923,
457
+ "eval_loss": 0.8543534874916077,
458
+ "eval_runtime": 4.3351,
459
+ "eval_samples_per_second": 65.974,
460
+ "eval_steps_per_second": 4.152,
461
+ "step": 897
462
+ },
463
+ {
464
+ "epoch": 37.11,
465
+ "grad_norm": 0.836291491985321,
466
+ "learning_rate": 2.8333333333333332e-05,
467
+ "loss": 0.6721,
468
+ "step": 900
469
+ },
470
+ {
471
+ "epoch": 37.98,
472
+ "eval_accuracy": 0.6923076923076923,
473
+ "eval_loss": 0.8480322957038879,
474
+ "eval_runtime": 5.1861,
475
+ "eval_samples_per_second": 55.147,
476
+ "eval_steps_per_second": 3.471,
477
+ "step": 921
478
+ },
479
+ {
480
+ "epoch": 38.97,
481
+ "eval_accuracy": 0.7097902097902098,
482
+ "eval_loss": 0.8354606628417969,
483
+ "eval_runtime": 6.3247,
484
+ "eval_samples_per_second": 45.22,
485
+ "eval_steps_per_second": 2.846,
486
+ "step": 945
487
+ },
488
+ {
489
+ "epoch": 39.18,
490
+ "grad_norm": 1.6499431133270264,
491
+ "learning_rate": 2.8055555555555557e-05,
492
+ "loss": 0.6442,
493
+ "step": 950
494
+ },
495
+ {
496
+ "epoch": 40.0,
497
+ "eval_accuracy": 0.6958041958041958,
498
+ "eval_loss": 0.8412452340126038,
499
+ "eval_runtime": 5.2281,
500
+ "eval_samples_per_second": 54.704,
501
+ "eval_steps_per_second": 3.443,
502
+ "step": 970
503
+ },
504
+ {
505
+ "epoch": 40.99,
506
+ "eval_accuracy": 0.6888111888111889,
507
+ "eval_loss": 0.8356389999389648,
508
+ "eval_runtime": 4.8326,
509
+ "eval_samples_per_second": 59.181,
510
+ "eval_steps_per_second": 3.725,
511
+ "step": 994
512
+ },
513
+ {
514
+ "epoch": 41.24,
515
+ "grad_norm": 1.1766818761825562,
516
+ "learning_rate": 2.777777777777778e-05,
517
+ "loss": 0.6465,
518
+ "step": 1000
519
+ },
520
+ {
521
+ "epoch": 41.98,
522
+ "eval_accuracy": 0.7062937062937062,
523
+ "eval_loss": 0.8180016875267029,
524
+ "eval_runtime": 5.7926,
525
+ "eval_samples_per_second": 49.374,
526
+ "eval_steps_per_second": 3.107,
527
+ "step": 1018
528
+ },
529
+ {
530
+ "epoch": 42.97,
531
+ "eval_accuracy": 0.7027972027972028,
532
+ "eval_loss": 0.8103991150856018,
533
+ "eval_runtime": 5.5185,
534
+ "eval_samples_per_second": 51.825,
535
+ "eval_steps_per_second": 3.262,
536
+ "step": 1042
537
+ },
538
+ {
539
+ "epoch": 43.3,
540
+ "grad_norm": 0.9722403287887573,
541
+ "learning_rate": 2.75e-05,
542
+ "loss": 0.6086,
543
+ "step": 1050
544
+ },
545
+ {
546
+ "epoch": 44.0,
547
+ "eval_accuracy": 0.6958041958041958,
548
+ "eval_loss": 0.8162235617637634,
549
+ "eval_runtime": 4.9174,
550
+ "eval_samples_per_second": 58.161,
551
+ "eval_steps_per_second": 3.66,
552
+ "step": 1067
553
+ },
554
+ {
555
+ "epoch": 44.99,
556
+ "eval_accuracy": 0.7027972027972028,
557
+ "eval_loss": 0.7957289218902588,
558
+ "eval_runtime": 4.6891,
559
+ "eval_samples_per_second": 60.992,
560
+ "eval_steps_per_second": 3.839,
561
+ "step": 1091
562
+ },
563
+ {
564
+ "epoch": 45.36,
565
+ "grad_norm": 1.269113302230835,
566
+ "learning_rate": 2.7222222222222223e-05,
567
+ "loss": 0.5863,
568
+ "step": 1100
569
+ },
570
+ {
571
+ "epoch": 45.98,
572
+ "eval_accuracy": 0.6958041958041958,
573
+ "eval_loss": 0.8143528699874878,
574
+ "eval_runtime": 6.6805,
575
+ "eval_samples_per_second": 42.811,
576
+ "eval_steps_per_second": 2.694,
577
+ "step": 1115
578
+ },
579
+ {
580
+ "epoch": 46.97,
581
+ "eval_accuracy": 0.7027972027972028,
582
+ "eval_loss": 0.78568434715271,
583
+ "eval_runtime": 4.7422,
584
+ "eval_samples_per_second": 60.31,
585
+ "eval_steps_per_second": 3.796,
586
+ "step": 1139
587
+ },
588
+ {
589
+ "epoch": 47.42,
590
+ "grad_norm": 0.9775255918502808,
591
+ "learning_rate": 2.6944444444444445e-05,
592
+ "loss": 0.5877,
593
+ "step": 1150
594
+ },
595
+ {
596
+ "epoch": 48.0,
597
+ "eval_accuracy": 0.7132867132867133,
598
+ "eval_loss": 0.7764595150947571,
599
+ "eval_runtime": 5.76,
600
+ "eval_samples_per_second": 49.653,
601
+ "eval_steps_per_second": 3.125,
602
+ "step": 1164
603
+ },
604
+ {
605
+ "epoch": 48.99,
606
+ "eval_accuracy": 0.6993006993006993,
607
+ "eval_loss": 0.7881478071212769,
608
+ "eval_runtime": 5.4965,
609
+ "eval_samples_per_second": 52.033,
610
+ "eval_steps_per_second": 3.275,
611
+ "step": 1188
612
+ },
613
+ {
614
+ "epoch": 49.48,
615
+ "grad_norm": 1.540124773979187,
616
+ "learning_rate": 2.6666666666666667e-05,
617
+ "loss": 0.5629,
618
+ "step": 1200
619
+ },
620
+ {
621
+ "epoch": 49.98,
622
+ "eval_accuracy": 0.7097902097902098,
623
+ "eval_loss": 0.7658265829086304,
624
+ "eval_runtime": 4.731,
625
+ "eval_samples_per_second": 60.452,
626
+ "eval_steps_per_second": 3.805,
627
+ "step": 1212
628
+ },
629
+ {
630
+ "epoch": 50.97,
631
+ "eval_accuracy": 0.7132867132867133,
632
+ "eval_loss": 0.7723098397254944,
633
+ "eval_runtime": 5.8352,
634
+ "eval_samples_per_second": 49.013,
635
+ "eval_steps_per_second": 3.085,
636
+ "step": 1236
637
+ },
638
+ {
639
+ "epoch": 51.55,
640
+ "grad_norm": 1.2498500347137451,
641
+ "learning_rate": 2.6388888888888892e-05,
642
+ "loss": 0.5476,
643
+ "step": 1250
644
+ },
645
+ {
646
+ "epoch": 52.0,
647
+ "eval_accuracy": 0.7097902097902098,
648
+ "eval_loss": 0.7603952884674072,
649
+ "eval_runtime": 4.448,
650
+ "eval_samples_per_second": 64.299,
651
+ "eval_steps_per_second": 4.047,
652
+ "step": 1261
653
+ },
654
+ {
655
+ "epoch": 52.99,
656
+ "eval_accuracy": 0.7202797202797203,
657
+ "eval_loss": 0.7554137706756592,
658
+ "eval_runtime": 6.4218,
659
+ "eval_samples_per_second": 44.536,
660
+ "eval_steps_per_second": 2.803,
661
+ "step": 1285
662
+ },
663
+ {
664
+ "epoch": 53.61,
665
+ "grad_norm": 0.9919388890266418,
666
+ "learning_rate": 2.6116666666666667e-05,
667
+ "loss": 0.5357,
668
+ "step": 1300
669
+ },
670
+ {
671
+ "epoch": 53.98,
672
+ "eval_accuracy": 0.7307692307692307,
673
+ "eval_loss": 0.7458928227424622,
674
+ "eval_runtime": 5.3791,
675
+ "eval_samples_per_second": 53.168,
676
+ "eval_steps_per_second": 3.346,
677
+ "step": 1309
678
+ },
679
+ {
680
+ "epoch": 54.97,
681
+ "eval_accuracy": 0.7132867132867133,
682
+ "eval_loss": 0.7632877230644226,
683
+ "eval_runtime": 5.278,
684
+ "eval_samples_per_second": 54.187,
685
+ "eval_steps_per_second": 3.41,
686
+ "step": 1333
687
+ },
688
+ {
689
+ "epoch": 55.67,
690
+ "grad_norm": 1.688183307647705,
691
+ "learning_rate": 2.5838888888888892e-05,
692
+ "loss": 0.5335,
693
+ "step": 1350
694
+ },
695
+ {
696
+ "epoch": 56.0,
697
+ "eval_accuracy": 0.7167832167832168,
698
+ "eval_loss": 0.768308162689209,
699
+ "eval_runtime": 5.7022,
700
+ "eval_samples_per_second": 50.156,
701
+ "eval_steps_per_second": 3.157,
702
+ "step": 1358
703
+ },
704
+ {
705
+ "epoch": 56.99,
706
+ "eval_accuracy": 0.7307692307692307,
707
+ "eval_loss": 0.7380541563034058,
708
+ "eval_runtime": 4.522,
709
+ "eval_samples_per_second": 63.247,
710
+ "eval_steps_per_second": 3.981,
711
+ "step": 1382
712
+ },
713
+ {
714
+ "epoch": 57.73,
715
+ "grad_norm": 1.4895784854888916,
716
+ "learning_rate": 2.556111111111111e-05,
717
+ "loss": 0.5107,
718
+ "step": 1400
719
+ },
720
+ {
721
+ "epoch": 57.98,
722
+ "eval_accuracy": 0.7377622377622378,
723
+ "eval_loss": 0.7308338284492493,
724
+ "eval_runtime": 4.4787,
725
+ "eval_samples_per_second": 63.857,
726
+ "eval_steps_per_second": 4.019,
727
+ "step": 1406
728
+ },
729
+ {
730
+ "epoch": 58.97,
731
+ "eval_accuracy": 0.7237762237762237,
732
+ "eval_loss": 0.7441032528877258,
733
+ "eval_runtime": 5.8744,
734
+ "eval_samples_per_second": 48.685,
735
+ "eval_steps_per_second": 3.064,
736
+ "step": 1430
737
+ },
738
+ {
739
+ "epoch": 59.79,
740
+ "grad_norm": 1.4925004243850708,
741
+ "learning_rate": 2.5283333333333332e-05,
742
+ "loss": 0.5105,
743
+ "step": 1450
744
+ },
745
+ {
746
+ "epoch": 60.0,
747
+ "eval_accuracy": 0.7307692307692307,
748
+ "eval_loss": 0.7481815218925476,
749
+ "eval_runtime": 7.272,
750
+ "eval_samples_per_second": 39.329,
751
+ "eval_steps_per_second": 2.475,
752
+ "step": 1455
753
+ },
754
+ {
755
+ "epoch": 60.99,
756
+ "eval_accuracy": 0.7342657342657343,
757
+ "eval_loss": 0.733482301235199,
758
+ "eval_runtime": 4.6235,
759
+ "eval_samples_per_second": 61.858,
760
+ "eval_steps_per_second": 3.893,
761
+ "step": 1479
762
+ },
763
+ {
764
+ "epoch": 61.86,
765
+ "grad_norm": 1.3200663328170776,
766
+ "learning_rate": 2.5005555555555558e-05,
767
+ "loss": 0.4914,
768
+ "step": 1500
769
+ },
770
+ {
771
+ "epoch": 61.98,
772
+ "eval_accuracy": 0.7447552447552448,
773
+ "eval_loss": 0.7241908311843872,
774
+ "eval_runtime": 4.8198,
775
+ "eval_samples_per_second": 59.338,
776
+ "eval_steps_per_second": 3.735,
777
+ "step": 1503
778
+ },
779
+ {
780
+ "epoch": 62.97,
781
+ "eval_accuracy": 0.7377622377622378,
782
+ "eval_loss": 0.7321043014526367,
783
+ "eval_runtime": 5.8929,
784
+ "eval_samples_per_second": 48.533,
785
+ "eval_steps_per_second": 3.055,
786
+ "step": 1527
787
+ },
788
+ {
789
+ "epoch": 63.92,
790
+ "grad_norm": 1.1309747695922852,
791
+ "learning_rate": 2.472777777777778e-05,
792
+ "loss": 0.4839,
793
+ "step": 1550
794
+ },
795
+ {
796
+ "epoch": 64.0,
797
+ "eval_accuracy": 0.7342657342657343,
798
+ "eval_loss": 0.7220665216445923,
799
+ "eval_runtime": 5.8635,
800
+ "eval_samples_per_second": 48.776,
801
+ "eval_steps_per_second": 3.07,
802
+ "step": 1552
803
+ },
804
+ {
805
+ "epoch": 64.99,
806
+ "eval_accuracy": 0.7412587412587412,
807
+ "eval_loss": 0.7136482000350952,
808
+ "eval_runtime": 4.3102,
809
+ "eval_samples_per_second": 66.354,
810
+ "eval_steps_per_second": 4.176,
811
+ "step": 1576
812
+ },
813
+ {
814
+ "epoch": 65.98,
815
+ "grad_norm": 1.1314157247543335,
816
+ "learning_rate": 2.4449999999999998e-05,
817
+ "loss": 0.4751,
818
+ "step": 1600
819
+ },
820
+ {
821
+ "epoch": 65.98,
822
+ "eval_accuracy": 0.7412587412587412,
823
+ "eval_loss": 0.7198111414909363,
824
+ "eval_runtime": 4.7841,
825
+ "eval_samples_per_second": 59.781,
826
+ "eval_steps_per_second": 3.762,
827
+ "step": 1600
828
+ },
829
+ {
830
+ "epoch": 66.97,
831
+ "eval_accuracy": 0.7377622377622378,
832
+ "eval_loss": 0.7145721912384033,
833
+ "eval_runtime": 6.347,
834
+ "eval_samples_per_second": 45.061,
835
+ "eval_steps_per_second": 2.836,
836
+ "step": 1624
837
+ },
838
+ {
839
+ "epoch": 68.0,
840
+ "eval_accuracy": 0.7447552447552448,
841
+ "eval_loss": 0.6970916390419006,
842
+ "eval_runtime": 5.6871,
843
+ "eval_samples_per_second": 50.289,
844
+ "eval_steps_per_second": 3.165,
845
+ "step": 1649
846
+ },
847
+ {
848
+ "epoch": 68.04,
849
+ "grad_norm": 2.397585153579712,
850
+ "learning_rate": 2.4172222222222223e-05,
851
+ "loss": 0.4639,
852
+ "step": 1650
853
+ },
854
+ {
855
+ "epoch": 68.99,
856
+ "eval_accuracy": 0.7272727272727273,
857
+ "eval_loss": 0.7201464176177979,
858
+ "eval_runtime": 4.4157,
859
+ "eval_samples_per_second": 64.769,
860
+ "eval_steps_per_second": 4.076,
861
+ "step": 1673
862
+ },
863
+ {
864
+ "epoch": 69.98,
865
+ "eval_accuracy": 0.7307692307692307,
866
+ "eval_loss": 0.7244682312011719,
867
+ "eval_runtime": 5.4392,
868
+ "eval_samples_per_second": 52.581,
869
+ "eval_steps_per_second": 3.309,
870
+ "step": 1697
871
+ },
872
+ {
873
+ "epoch": 70.1,
874
+ "grad_norm": 2.062610387802124,
875
+ "learning_rate": 2.3894444444444445e-05,
876
+ "loss": 0.4581,
877
+ "step": 1700
878
+ },
879
+ {
880
+ "epoch": 70.97,
881
+ "eval_accuracy": 0.7447552447552448,
882
+ "eval_loss": 0.7077587842941284,
883
+ "eval_runtime": 5.1002,
884
+ "eval_samples_per_second": 56.076,
885
+ "eval_steps_per_second": 3.529,
886
+ "step": 1721
887
+ },
888
+ {
889
+ "epoch": 72.0,
890
+ "eval_accuracy": 0.7517482517482518,
891
+ "eval_loss": 0.6957913637161255,
892
+ "eval_runtime": 4.4485,
893
+ "eval_samples_per_second": 64.291,
894
+ "eval_steps_per_second": 4.046,
895
+ "step": 1746
896
+ },
897
+ {
898
+ "epoch": 72.16,
899
+ "grad_norm": 2.7808456420898438,
900
+ "learning_rate": 2.3616666666666667e-05,
901
+ "loss": 0.4643,
902
+ "step": 1750
903
+ },
904
+ {
905
+ "epoch": 72.99,
906
+ "eval_accuracy": 0.7447552447552448,
907
+ "eval_loss": 0.7036928534507751,
908
+ "eval_runtime": 5.9101,
909
+ "eval_samples_per_second": 48.392,
910
+ "eval_steps_per_second": 3.046,
911
+ "step": 1770
912
+ },
913
+ {
914
+ "epoch": 73.98,
915
+ "eval_accuracy": 0.7482517482517482,
916
+ "eval_loss": 0.71629399061203,
917
+ "eval_runtime": 6.0211,
918
+ "eval_samples_per_second": 47.5,
919
+ "eval_steps_per_second": 2.989,
920
+ "step": 1794
921
+ },
922
+ {
923
+ "epoch": 74.23,
924
+ "grad_norm": 1.78495192527771,
925
+ "learning_rate": 2.333888888888889e-05,
926
+ "loss": 0.442,
927
+ "step": 1800
928
+ },
929
+ {
930
+ "epoch": 74.97,
931
+ "eval_accuracy": 0.7377622377622378,
932
+ "eval_loss": 0.6997957229614258,
933
+ "eval_runtime": 4.4212,
934
+ "eval_samples_per_second": 64.688,
935
+ "eval_steps_per_second": 4.071,
936
+ "step": 1818
937
+ },
938
+ {
939
+ "epoch": 76.0,
940
+ "eval_accuracy": 0.7447552447552448,
941
+ "eval_loss": 0.6946483850479126,
942
+ "eval_runtime": 4.0507,
943
+ "eval_samples_per_second": 70.605,
944
+ "eval_steps_per_second": 4.444,
945
+ "step": 1843
946
+ },
947
+ {
948
+ "epoch": 76.29,
949
+ "grad_norm": 1.7383118867874146,
950
+ "learning_rate": 2.306111111111111e-05,
951
+ "loss": 0.4305,
952
+ "step": 1850
953
+ },
954
+ {
955
+ "epoch": 76.99,
956
+ "eval_accuracy": 0.7552447552447552,
957
+ "eval_loss": 0.6857091784477234,
958
+ "eval_runtime": 4.1718,
959
+ "eval_samples_per_second": 68.556,
960
+ "eval_steps_per_second": 4.315,
961
+ "step": 1867
962
+ },
963
+ {
964
+ "epoch": 77.98,
965
+ "eval_accuracy": 0.7447552447552448,
966
+ "eval_loss": 0.6936307549476624,
967
+ "eval_runtime": 3.8781,
968
+ "eval_samples_per_second": 73.747,
969
+ "eval_steps_per_second": 4.641,
970
+ "step": 1891
971
+ },
972
+ {
973
+ "epoch": 78.35,
974
+ "grad_norm": 1.047067403793335,
975
+ "learning_rate": 2.2783333333333336e-05,
976
+ "loss": 0.4416,
977
+ "step": 1900
978
+ },
979
+ {
980
+ "epoch": 78.97,
981
+ "eval_accuracy": 0.7517482517482518,
982
+ "eval_loss": 0.6965110301971436,
983
+ "eval_runtime": 5.1318,
984
+ "eval_samples_per_second": 55.731,
985
+ "eval_steps_per_second": 3.508,
986
+ "step": 1915
987
+ },
988
+ {
989
+ "epoch": 80.0,
990
+ "eval_accuracy": 0.7482517482517482,
991
+ "eval_loss": 0.7017127871513367,
992
+ "eval_runtime": 4.3418,
993
+ "eval_samples_per_second": 65.871,
994
+ "eval_steps_per_second": 4.146,
995
+ "step": 1940
996
+ },
997
+ {
998
+ "epoch": 80.41,
999
+ "grad_norm": 1.5354928970336914,
1000
+ "learning_rate": 2.2505555555555554e-05,
1001
+ "loss": 0.428,
1002
+ "step": 1950
1003
+ },
1004
+ {
1005
+ "epoch": 80.99,
1006
+ "eval_accuracy": 0.7552447552447552,
1007
+ "eval_loss": 0.6970596313476562,
1008
+ "eval_runtime": 5.973,
1009
+ "eval_samples_per_second": 47.882,
1010
+ "eval_steps_per_second": 3.014,
1011
+ "step": 1964
1012
+ },
1013
+ {
1014
+ "epoch": 81.98,
1015
+ "eval_accuracy": 0.7552447552447552,
1016
+ "eval_loss": 0.6897542476654053,
1017
+ "eval_runtime": 5.0481,
1018
+ "eval_samples_per_second": 56.655,
1019
+ "eval_steps_per_second": 3.566,
1020
+ "step": 1988
1021
+ },
1022
+ {
1023
+ "epoch": 82.47,
1024
+ "grad_norm": 1.7141317129135132,
1025
+ "learning_rate": 2.2227777777777776e-05,
1026
+ "loss": 0.4093,
1027
+ "step": 2000
1028
+ },
1029
+ {
1030
+ "epoch": 82.97,
1031
+ "eval_accuracy": 0.7482517482517482,
1032
+ "eval_loss": 0.7004020810127258,
1033
+ "eval_runtime": 4.1986,
1034
+ "eval_samples_per_second": 68.118,
1035
+ "eval_steps_per_second": 4.287,
1036
+ "step": 2012
1037
+ },
1038
+ {
1039
+ "epoch": 84.0,
1040
+ "eval_accuracy": 0.7552447552447552,
1041
+ "eval_loss": 0.6867479681968689,
1042
+ "eval_runtime": 4.6871,
1043
+ "eval_samples_per_second": 61.018,
1044
+ "eval_steps_per_second": 3.84,
1045
+ "step": 2037
1046
+ },
1047
+ {
1048
+ "epoch": 84.54,
1049
+ "grad_norm": 2.0219666957855225,
1050
+ "learning_rate": 2.195e-05,
1051
+ "loss": 0.4148,
1052
+ "step": 2050
1053
+ },
1054
+ {
1055
+ "epoch": 84.99,
1056
+ "eval_accuracy": 0.7377622377622378,
1057
+ "eval_loss": 0.7070020437240601,
1058
+ "eval_runtime": 5.9326,
1059
+ "eval_samples_per_second": 48.208,
1060
+ "eval_steps_per_second": 3.034,
1061
+ "step": 2061
1062
+ },
1063
+ {
1064
+ "epoch": 85.98,
1065
+ "eval_accuracy": 0.7447552447552448,
1066
+ "eval_loss": 0.7030305862426758,
1067
+ "eval_runtime": 5.3564,
1068
+ "eval_samples_per_second": 53.394,
1069
+ "eval_steps_per_second": 3.36,
1070
+ "step": 2085
1071
+ },
1072
+ {
1073
+ "epoch": 86.6,
1074
+ "grad_norm": 1.4678714275360107,
1075
+ "learning_rate": 2.1672222222222223e-05,
1076
+ "loss": 0.3923,
1077
+ "step": 2100
1078
+ },
1079
+ {
1080
+ "epoch": 86.97,
1081
+ "eval_accuracy": 0.7587412587412588,
1082
+ "eval_loss": 0.678174614906311,
1083
+ "eval_runtime": 3.9745,
1084
+ "eval_samples_per_second": 71.96,
1085
+ "eval_steps_per_second": 4.529,
1086
+ "step": 2109
1087
+ },
1088
+ {
1089
+ "epoch": 88.0,
1090
+ "eval_accuracy": 0.7412587412587412,
1091
+ "eval_loss": 0.7166118621826172,
1092
+ "eval_runtime": 4.0358,
1093
+ "eval_samples_per_second": 70.866,
1094
+ "eval_steps_per_second": 4.46,
1095
+ "step": 2134
1096
+ },
1097
+ {
1098
+ "epoch": 88.66,
1099
+ "grad_norm": 1.589543342590332,
1100
+ "learning_rate": 2.1394444444444445e-05,
1101
+ "loss": 0.3964,
1102
+ "step": 2150
1103
+ },
1104
+ {
1105
+ "epoch": 88.99,
1106
+ "eval_accuracy": 0.7482517482517482,
1107
+ "eval_loss": 0.7075912952423096,
1108
+ "eval_runtime": 5.0331,
1109
+ "eval_samples_per_second": 56.823,
1110
+ "eval_steps_per_second": 3.576,
1111
+ "step": 2158
1112
+ },
1113
+ {
1114
+ "epoch": 89.98,
1115
+ "eval_accuracy": 0.7657342657342657,
1116
+ "eval_loss": 0.6867172122001648,
1117
+ "eval_runtime": 5.386,
1118
+ "eval_samples_per_second": 53.101,
1119
+ "eval_steps_per_second": 3.342,
1120
+ "step": 2182
1121
+ },
1122
+ {
1123
+ "epoch": 90.72,
1124
+ "grad_norm": 1.3886605501174927,
1125
+ "learning_rate": 2.1116666666666667e-05,
1126
+ "loss": 0.3846,
1127
+ "step": 2200
1128
+ },
1129
+ {
1130
+ "epoch": 90.97,
1131
+ "eval_accuracy": 0.7517482517482518,
1132
+ "eval_loss": 0.6913285851478577,
1133
+ "eval_runtime": 5.5324,
1134
+ "eval_samples_per_second": 51.696,
1135
+ "eval_steps_per_second": 3.254,
1136
+ "step": 2206
1137
+ },
1138
+ {
1139
+ "epoch": 92.0,
1140
+ "eval_accuracy": 0.7482517482517482,
1141
+ "eval_loss": 0.7160294651985168,
1142
+ "eval_runtime": 5.2753,
1143
+ "eval_samples_per_second": 54.215,
1144
+ "eval_steps_per_second": 3.412,
1145
+ "step": 2231
1146
+ },
1147
+ {
1148
+ "epoch": 92.78,
1149
+ "grad_norm": 2.4106783866882324,
1150
+ "learning_rate": 2.083888888888889e-05,
1151
+ "loss": 0.3654,
1152
+ "step": 2250
1153
+ },
1154
+ {
1155
+ "epoch": 92.99,
1156
+ "eval_accuracy": 0.7517482517482518,
1157
+ "eval_loss": 0.6765207052230835,
1158
+ "eval_runtime": 5.5671,
1159
+ "eval_samples_per_second": 51.373,
1160
+ "eval_steps_per_second": 3.233,
1161
+ "step": 2255
1162
+ },
1163
+ {
1164
+ "epoch": 93.98,
1165
+ "eval_accuracy": 0.7657342657342657,
1166
+ "eval_loss": 0.6881967186927795,
1167
+ "eval_runtime": 3.8228,
1168
+ "eval_samples_per_second": 74.814,
1169
+ "eval_steps_per_second": 4.709,
1170
+ "step": 2279
1171
+ },
1172
+ {
1173
+ "epoch": 94.85,
1174
+ "grad_norm": 0.8871183395385742,
1175
+ "learning_rate": 2.0561111111111114e-05,
1176
+ "loss": 0.3577,
1177
+ "step": 2300
1178
+ },
1179
+ {
1180
+ "epoch": 94.97,
1181
+ "eval_accuracy": 0.7552447552447552,
1182
+ "eval_loss": 0.6852585673332214,
1183
+ "eval_runtime": 4.7228,
1184
+ "eval_samples_per_second": 60.557,
1185
+ "eval_steps_per_second": 3.811,
1186
+ "step": 2303
1187
+ },
1188
+ {
1189
+ "epoch": 96.0,
1190
+ "eval_accuracy": 0.7552447552447552,
1191
+ "eval_loss": 0.7158808708190918,
1192
+ "eval_runtime": 5.6504,
1193
+ "eval_samples_per_second": 50.616,
1194
+ "eval_steps_per_second": 3.186,
1195
+ "step": 2328
1196
+ },
1197
+ {
1198
+ "epoch": 96.91,
1199
+ "grad_norm": 1.0019863843917847,
1200
+ "learning_rate": 2.0283333333333333e-05,
1201
+ "loss": 0.37,
1202
+ "step": 2350
1203
+ },
1204
+ {
1205
+ "epoch": 96.99,
1206
+ "eval_accuracy": 0.7657342657342657,
1207
+ "eval_loss": 0.6943120360374451,
1208
+ "eval_runtime": 4.8337,
1209
+ "eval_samples_per_second": 59.168,
1210
+ "eval_steps_per_second": 3.724,
1211
+ "step": 2352
1212
+ },
1213
+ {
1214
+ "epoch": 97.98,
1215
+ "eval_accuracy": 0.7587412587412588,
1216
+ "eval_loss": 0.7010317444801331,
1217
+ "eval_runtime": 4.6874,
1218
+ "eval_samples_per_second": 61.015,
1219
+ "eval_steps_per_second": 3.84,
1220
+ "step": 2376
1221
+ },
1222
+ {
1223
+ "epoch": 98.97,
1224
+ "grad_norm": 1.2908928394317627,
1225
+ "learning_rate": 2.0005555555555555e-05,
1226
+ "loss": 0.3473,
1227
+ "step": 2400
1228
+ },
1229
+ {
1230
+ "epoch": 98.97,
1231
+ "eval_accuracy": 0.7727272727272727,
1232
+ "eval_loss": 0.693758487701416,
1233
+ "eval_runtime": 4.7585,
1234
+ "eval_samples_per_second": 60.103,
1235
+ "eval_steps_per_second": 3.783,
1236
+ "step": 2400
1237
+ },
1238
+ {
1239
+ "epoch": 100.0,
1240
+ "eval_accuracy": 0.7587412587412588,
1241
+ "eval_loss": 0.6918778419494629,
1242
+ "eval_runtime": 6.6891,
1243
+ "eval_samples_per_second": 42.756,
1244
+ "eval_steps_per_second": 2.691,
1245
+ "step": 2425
1246
+ },
1247
+ {
1248
+ "epoch": 100.99,
1249
+ "eval_accuracy": 0.7552447552447552,
1250
+ "eval_loss": 0.6849302053451538,
1251
+ "eval_runtime": 4.4685,
1252
+ "eval_samples_per_second": 64.003,
1253
+ "eval_steps_per_second": 4.028,
1254
+ "step": 2449
1255
+ },
1256
+ {
1257
+ "epoch": 101.03,
1258
+ "grad_norm": 1.1730871200561523,
1259
+ "learning_rate": 1.972777777777778e-05,
1260
+ "loss": 0.3587,
1261
+ "step": 2450
1262
+ },
1263
+ {
1264
+ "epoch": 101.98,
1265
+ "eval_accuracy": 0.7587412587412588,
1266
+ "eval_loss": 0.6855939030647278,
1267
+ "eval_runtime": 4.3434,
1268
+ "eval_samples_per_second": 65.847,
1269
+ "eval_steps_per_second": 4.144,
1270
+ "step": 2473
1271
+ },
1272
+ {
1273
+ "epoch": 102.97,
1274
+ "eval_accuracy": 0.7517482517482518,
1275
+ "eval_loss": 0.7046144604682922,
1276
+ "eval_runtime": 4.7166,
1277
+ "eval_samples_per_second": 60.637,
1278
+ "eval_steps_per_second": 3.816,
1279
+ "step": 2497
1280
+ },
1281
+ {
1282
+ "epoch": 103.09,
1283
+ "grad_norm": 1.3693217039108276,
1284
+ "learning_rate": 1.945e-05,
1285
+ "loss": 0.3429,
1286
+ "step": 2500
1287
+ },
1288
+ {
1289
+ "epoch": 104.0,
1290
+ "eval_accuracy": 0.7727272727272727,
1291
+ "eval_loss": 0.6892997622489929,
1292
+ "eval_runtime": 5.3868,
1293
+ "eval_samples_per_second": 53.092,
1294
+ "eval_steps_per_second": 3.341,
1295
+ "step": 2522
1296
+ },
1297
+ {
1298
+ "epoch": 104.99,
1299
+ "eval_accuracy": 0.7622377622377622,
1300
+ "eval_loss": 0.6913393139839172,
1301
+ "eval_runtime": 5.09,
1302
+ "eval_samples_per_second": 56.188,
1303
+ "eval_steps_per_second": 3.536,
1304
+ "step": 2546
1305
+ },
1306
+ {
1307
+ "epoch": 105.15,
1308
+ "grad_norm": 1.923829436302185,
1309
+ "learning_rate": 1.9172222222222224e-05,
1310
+ "loss": 0.3549,
1311
+ "step": 2550
1312
+ },
1313
+ {
1314
+ "epoch": 105.98,
1315
+ "eval_accuracy": 0.7762237762237763,
1316
+ "eval_loss": 0.6880810856819153,
1317
+ "eval_runtime": 4.6668,
1318
+ "eval_samples_per_second": 61.283,
1319
+ "eval_steps_per_second": 3.857,
1320
+ "step": 2570
1321
+ },
1322
+ {
1323
+ "epoch": 106.97,
1324
+ "eval_accuracy": 0.7692307692307693,
1325
+ "eval_loss": 0.7097887396812439,
1326
+ "eval_runtime": 6.4652,
1327
+ "eval_samples_per_second": 44.237,
1328
+ "eval_steps_per_second": 2.784,
1329
+ "step": 2594
1330
+ },
1331
+ {
1332
+ "epoch": 107.22,
1333
+ "grad_norm": 2.702012062072754,
1334
+ "learning_rate": 1.8894444444444446e-05,
1335
+ "loss": 0.3403,
1336
+ "step": 2600
1337
+ },
1338
+ {
1339
+ "epoch": 108.0,
1340
+ "eval_accuracy": 0.7762237762237763,
1341
+ "eval_loss": 0.6878336668014526,
1342
+ "eval_runtime": 4.6923,
1343
+ "eval_samples_per_second": 60.951,
1344
+ "eval_steps_per_second": 3.836,
1345
+ "step": 2619
1346
+ },
1347
+ {
1348
+ "epoch": 108.99,
1349
+ "eval_accuracy": 0.7762237762237763,
1350
+ "eval_loss": 0.695954442024231,
1351
+ "eval_runtime": 4.4809,
1352
+ "eval_samples_per_second": 63.827,
1353
+ "eval_steps_per_second": 4.017,
1354
+ "step": 2643
1355
+ },
1356
+ {
1357
+ "epoch": 109.28,
1358
+ "grad_norm": 2.3427536487579346,
1359
+ "learning_rate": 1.8616666666666667e-05,
1360
+ "loss": 0.3253,
1361
+ "step": 2650
1362
+ },
1363
+ {
1364
+ "epoch": 109.98,
1365
+ "eval_accuracy": 0.7727272727272727,
1366
+ "eval_loss": 0.7005948424339294,
1367
+ "eval_runtime": 4.8882,
1368
+ "eval_samples_per_second": 58.508,
1369
+ "eval_steps_per_second": 3.682,
1370
+ "step": 2667
1371
+ },
1372
+ {
1373
+ "epoch": 110.97,
1374
+ "eval_accuracy": 0.7692307692307693,
1375
+ "eval_loss": 0.6916196346282959,
1376
+ "eval_runtime": 5.2891,
1377
+ "eval_samples_per_second": 54.073,
1378
+ "eval_steps_per_second": 3.403,
1379
+ "step": 2691
1380
+ },
1381
+ {
1382
+ "epoch": 111.34,
1383
+ "grad_norm": 2.178089141845703,
1384
+ "learning_rate": 1.833888888888889e-05,
1385
+ "loss": 0.3332,
1386
+ "step": 2700
1387
+ },
1388
+ {
1389
+ "epoch": 112.0,
1390
+ "eval_accuracy": 0.7657342657342657,
1391
+ "eval_loss": 0.7059447765350342,
1392
+ "eval_runtime": 4.7437,
1393
+ "eval_samples_per_second": 60.291,
1394
+ "eval_steps_per_second": 3.795,
1395
+ "step": 2716
1396
+ },
1397
+ {
1398
+ "epoch": 112.99,
1399
+ "eval_accuracy": 0.7867132867132867,
1400
+ "eval_loss": 0.6904045939445496,
1401
+ "eval_runtime": 4.9942,
1402
+ "eval_samples_per_second": 57.267,
1403
+ "eval_steps_per_second": 3.604,
1404
+ "step": 2740
1405
+ },
1406
+ {
1407
+ "epoch": 113.4,
1408
+ "grad_norm": 1.1625444889068604,
1409
+ "learning_rate": 1.806111111111111e-05,
1410
+ "loss": 0.3188,
1411
+ "step": 2750
1412
+ },
1413
+ {
1414
+ "epoch": 113.98,
1415
+ "eval_accuracy": 0.7727272727272727,
1416
+ "eval_loss": 0.6970774531364441,
1417
+ "eval_runtime": 6.4809,
1418
+ "eval_samples_per_second": 44.13,
1419
+ "eval_steps_per_second": 2.777,
1420
+ "step": 2764
1421
+ },
1422
+ {
1423
+ "epoch": 114.97,
1424
+ "eval_accuracy": 0.7797202797202797,
1425
+ "eval_loss": 0.700820803642273,
1426
+ "eval_runtime": 5.2617,
1427
+ "eval_samples_per_second": 54.355,
1428
+ "eval_steps_per_second": 3.421,
1429
+ "step": 2788
1430
+ },
1431
+ {
1432
+ "epoch": 115.46,
1433
+ "grad_norm": 1.2394715547561646,
1434
+ "learning_rate": 1.7783333333333333e-05,
1435
+ "loss": 0.3112,
1436
+ "step": 2800
1437
+ },
1438
+ {
1439
+ "epoch": 116.0,
1440
+ "eval_accuracy": 0.7797202797202797,
1441
+ "eval_loss": 0.7002130150794983,
1442
+ "eval_runtime": 5.0937,
1443
+ "eval_samples_per_second": 56.147,
1444
+ "eval_steps_per_second": 3.534,
1445
+ "step": 2813
1446
+ },
1447
+ {
1448
+ "epoch": 116.99,
1449
+ "eval_accuracy": 0.7692307692307693,
1450
+ "eval_loss": 0.6909505724906921,
1451
+ "eval_runtime": 4.7575,
1452
+ "eval_samples_per_second": 60.116,
1453
+ "eval_steps_per_second": 3.784,
1454
+ "step": 2837
1455
+ },
1456
+ {
1457
+ "epoch": 117.53,
1458
+ "grad_norm": 2.4334964752197266,
1459
+ "learning_rate": 1.7505555555555558e-05,
1460
+ "loss": 0.3153,
1461
+ "step": 2850
1462
+ },
1463
+ {
1464
+ "epoch": 117.98,
1465
+ "eval_accuracy": 0.7797202797202797,
1466
+ "eval_loss": 0.6957750916481018,
1467
+ "eval_runtime": 4.8105,
1468
+ "eval_samples_per_second": 59.453,
1469
+ "eval_steps_per_second": 3.742,
1470
+ "step": 2861
1471
+ },
1472
+ {
1473
+ "epoch": 118.97,
1474
+ "eval_accuracy": 0.7762237762237763,
1475
+ "eval_loss": 0.6867520213127136,
1476
+ "eval_runtime": 4.5411,
1477
+ "eval_samples_per_second": 62.98,
1478
+ "eval_steps_per_second": 3.964,
1479
+ "step": 2885
1480
+ },
1481
+ {
1482
+ "epoch": 119.59,
1483
+ "grad_norm": 0.769097089767456,
1484
+ "learning_rate": 1.7227777777777777e-05,
1485
+ "loss": 0.3006,
1486
+ "step": 2900
1487
+ },
1488
+ {
1489
+ "epoch": 120.0,
1490
+ "eval_accuracy": 0.7727272727272727,
1491
+ "eval_loss": 0.6890790462493896,
1492
+ "eval_runtime": 4.5864,
1493
+ "eval_samples_per_second": 62.358,
1494
+ "eval_steps_per_second": 3.925,
1495
+ "step": 2910
1496
+ },
1497
+ {
1498
+ "epoch": 120.99,
1499
+ "eval_accuracy": 0.7657342657342657,
1500
+ "eval_loss": 0.6889089941978455,
1501
+ "eval_runtime": 6.5804,
1502
+ "eval_samples_per_second": 43.462,
1503
+ "eval_steps_per_second": 2.735,
1504
+ "step": 2934
1505
+ },
1506
+ {
1507
+ "epoch": 121.65,
1508
+ "grad_norm": 1.8714542388916016,
1509
+ "learning_rate": 1.695e-05,
1510
+ "loss": 0.2967,
1511
+ "step": 2950
1512
+ },
1513
+ {
1514
+ "epoch": 121.98,
1515
+ "eval_accuracy": 0.7657342657342657,
1516
+ "eval_loss": 0.6935350894927979,
1517
+ "eval_runtime": 4.7491,
1518
+ "eval_samples_per_second": 60.223,
1519
+ "eval_steps_per_second": 3.79,
1520
+ "step": 2958
1521
+ },
1522
+ {
1523
+ "epoch": 122.97,
1524
+ "eval_accuracy": 0.7692307692307693,
1525
+ "eval_loss": 0.7058219909667969,
1526
+ "eval_runtime": 4.8941,
1527
+ "eval_samples_per_second": 58.438,
1528
+ "eval_steps_per_second": 3.678,
1529
+ "step": 2982
1530
+ },
1531
+ {
1532
+ "epoch": 123.71,
1533
+ "grad_norm": 2.062924385070801,
1534
+ "learning_rate": 1.6672222222222224e-05,
1535
+ "loss": 0.2939,
1536
+ "step": 3000
1537
+ },
1538
+ {
1539
+ "epoch": 124.0,
1540
+ "eval_accuracy": 0.7657342657342657,
1541
+ "eval_loss": 0.7220865488052368,
1542
+ "eval_runtime": 5.0487,
1543
+ "eval_samples_per_second": 56.648,
1544
+ "eval_steps_per_second": 3.565,
1545
+ "step": 3007
1546
+ },
1547
+ {
1548
+ "epoch": 124.99,
1549
+ "eval_accuracy": 0.7727272727272727,
1550
+ "eval_loss": 0.6857044696807861,
1551
+ "eval_runtime": 5.6134,
1552
+ "eval_samples_per_second": 50.95,
1553
+ "eval_steps_per_second": 3.207,
1554
+ "step": 3031
1555
+ },
1556
+ {
1557
+ "epoch": 125.77,
1558
+ "grad_norm": 1.7039302587509155,
1559
+ "learning_rate": 1.6394444444444446e-05,
1560
+ "loss": 0.3101,
1561
+ "step": 3050
1562
+ },
1563
+ {
1564
+ "epoch": 125.98,
1565
+ "eval_accuracy": 0.7762237762237763,
1566
+ "eval_loss": 0.6742061972618103,
1567
+ "eval_runtime": 5.3609,
1568
+ "eval_samples_per_second": 53.349,
1569
+ "eval_steps_per_second": 3.358,
1570
+ "step": 3055
1571
+ },
1572
+ {
1573
+ "epoch": 126.97,
1574
+ "eval_accuracy": 0.7727272727272727,
1575
+ "eval_loss": 0.7029407620429993,
1576
+ "eval_runtime": 5.8891,
1577
+ "eval_samples_per_second": 48.564,
1578
+ "eval_steps_per_second": 3.056,
1579
+ "step": 3079
1580
+ },
1581
+ {
1582
+ "epoch": 127.84,
1583
+ "grad_norm": 1.434970736503601,
1584
+ "learning_rate": 1.6116666666666668e-05,
1585
+ "loss": 0.284,
1586
+ "step": 3100
1587
+ },
1588
+ {
1589
+ "epoch": 128.0,
1590
+ "eval_accuracy": 0.7762237762237763,
1591
+ "eval_loss": 0.682050347328186,
1592
+ "eval_runtime": 5.1437,
1593
+ "eval_samples_per_second": 55.602,
1594
+ "eval_steps_per_second": 3.499,
1595
+ "step": 3104
1596
+ },
1597
+ {
1598
+ "epoch": 128.99,
1599
+ "eval_accuracy": 0.7762237762237763,
1600
+ "eval_loss": 0.68370121717453,
1601
+ "eval_runtime": 4.2733,
1602
+ "eval_samples_per_second": 66.927,
1603
+ "eval_steps_per_second": 4.212,
1604
+ "step": 3128
1605
+ },
1606
+ {
1607
+ "epoch": 129.9,
1608
+ "grad_norm": 1.320789098739624,
1609
+ "learning_rate": 1.583888888888889e-05,
1610
+ "loss": 0.2902,
1611
+ "step": 3150
1612
+ },
1613
+ {
1614
+ "epoch": 129.98,
1615
+ "eval_accuracy": 0.7727272727272727,
1616
+ "eval_loss": 0.6823462843894958,
1617
+ "eval_runtime": 5.7566,
1618
+ "eval_samples_per_second": 49.682,
1619
+ "eval_steps_per_second": 3.127,
1620
+ "step": 3152
1621
+ },
1622
+ {
1623
+ "epoch": 130.97,
1624
+ "eval_accuracy": 0.7762237762237763,
1625
+ "eval_loss": 0.6950440406799316,
1626
+ "eval_runtime": 4.9248,
1627
+ "eval_samples_per_second": 58.074,
1628
+ "eval_steps_per_second": 3.655,
1629
+ "step": 3176
1630
+ },
1631
+ {
1632
+ "epoch": 131.96,
1633
+ "grad_norm": 2.1280930042266846,
1634
+ "learning_rate": 1.556111111111111e-05,
1635
+ "loss": 0.301,
1636
+ "step": 3200
1637
+ },
1638
+ {
1639
+ "epoch": 132.0,
1640
+ "eval_accuracy": 0.7727272727272727,
1641
+ "eval_loss": 0.6800761818885803,
1642
+ "eval_runtime": 8.1328,
1643
+ "eval_samples_per_second": 35.166,
1644
+ "eval_steps_per_second": 2.213,
1645
+ "step": 3201
1646
+ },
1647
+ {
1648
+ "epoch": 132.99,
1649
+ "eval_accuracy": 0.7762237762237763,
1650
+ "eval_loss": 0.6867505311965942,
1651
+ "eval_runtime": 4.2532,
1652
+ "eval_samples_per_second": 67.244,
1653
+ "eval_steps_per_second": 4.232,
1654
+ "step": 3225
1655
+ },
1656
+ {
1657
+ "epoch": 133.98,
1658
+ "eval_accuracy": 0.7797202797202797,
1659
+ "eval_loss": 0.7061284184455872,
1660
+ "eval_runtime": 5.3031,
1661
+ "eval_samples_per_second": 53.93,
1662
+ "eval_steps_per_second": 3.394,
1663
+ "step": 3249
1664
+ },
1665
+ {
1666
+ "epoch": 134.02,
1667
+ "grad_norm": 1.532638669013977,
1668
+ "learning_rate": 1.5283333333333333e-05,
1669
+ "loss": 0.2736,
1670
+ "step": 3250
1671
+ },
1672
+ {
1673
+ "epoch": 134.97,
1674
+ "eval_accuracy": 0.7727272727272727,
1675
+ "eval_loss": 0.7114368677139282,
1676
+ "eval_runtime": 4.6536,
1677
+ "eval_samples_per_second": 61.458,
1678
+ "eval_steps_per_second": 3.868,
1679
+ "step": 3273
1680
+ },
1681
+ {
1682
+ "epoch": 136.0,
1683
+ "eval_accuracy": 0.7762237762237763,
1684
+ "eval_loss": 0.6914551854133606,
1685
+ "eval_runtime": 4.5505,
1686
+ "eval_samples_per_second": 62.851,
1687
+ "eval_steps_per_second": 3.956,
1688
+ "step": 3298
1689
+ },
1690
+ {
1691
+ "epoch": 136.08,
1692
+ "grad_norm": 2.0108492374420166,
1693
+ "learning_rate": 1.5005555555555555e-05,
1694
+ "loss": 0.2931,
1695
+ "step": 3300
1696
+ },
1697
+ {
1698
+ "epoch": 136.99,
1699
+ "eval_accuracy": 0.7797202797202797,
1700
+ "eval_loss": 0.7055917978286743,
1701
+ "eval_runtime": 5.3067,
1702
+ "eval_samples_per_second": 53.894,
1703
+ "eval_steps_per_second": 3.392,
1704
+ "step": 3322
1705
+ },
1706
+ {
1707
+ "epoch": 137.98,
1708
+ "eval_accuracy": 0.7727272727272727,
1709
+ "eval_loss": 0.7026935815811157,
1710
+ "eval_runtime": 5.186,
1711
+ "eval_samples_per_second": 55.149,
1712
+ "eval_steps_per_second": 3.471,
1713
+ "step": 3346
1714
+ },
1715
+ {
1716
+ "epoch": 138.14,
1717
+ "grad_norm": 1.0804469585418701,
1718
+ "learning_rate": 1.4727777777777779e-05,
1719
+ "loss": 0.2864,
1720
+ "step": 3350
1721
+ },
1722
+ {
1723
+ "epoch": 138.97,
1724
+ "eval_accuracy": 0.7657342657342657,
1725
+ "eval_loss": 0.6983500719070435,
1726
+ "eval_runtime": 6.955,
1727
+ "eval_samples_per_second": 41.122,
1728
+ "eval_steps_per_second": 2.588,
1729
+ "step": 3370
1730
+ },
1731
+ {
1732
+ "epoch": 140.0,
1733
+ "eval_accuracy": 0.7657342657342657,
1734
+ "eval_loss": 0.7168787121772766,
1735
+ "eval_runtime": 4.234,
1736
+ "eval_samples_per_second": 67.548,
1737
+ "eval_steps_per_second": 4.251,
1738
+ "step": 3395
1739
+ },
1740
+ {
1741
+ "epoch": 140.21,
1742
+ "grad_norm": 2.370694637298584,
1743
+ "learning_rate": 1.445e-05,
1744
+ "loss": 0.2765,
1745
+ "step": 3400
1746
+ },
1747
+ {
1748
+ "epoch": 140.99,
1749
+ "eval_accuracy": 0.7762237762237763,
1750
+ "eval_loss": 0.6960318088531494,
1751
+ "eval_runtime": 5.0294,
1752
+ "eval_samples_per_second": 56.865,
1753
+ "eval_steps_per_second": 3.579,
1754
+ "step": 3419
1755
+ },
1756
+ {
1757
+ "epoch": 141.98,
1758
+ "eval_accuracy": 0.7762237762237763,
1759
+ "eval_loss": 0.6990492343902588,
1760
+ "eval_runtime": 5.2727,
1761
+ "eval_samples_per_second": 54.242,
1762
+ "eval_steps_per_second": 3.414,
1763
+ "step": 3443
1764
+ },
1765
+ {
1766
+ "epoch": 142.27,
1767
+ "grad_norm": 1.6676194667816162,
1768
+ "learning_rate": 1.4172222222222222e-05,
1769
+ "loss": 0.2808,
1770
+ "step": 3450
1771
+ },
1772
+ {
1773
+ "epoch": 142.97,
1774
+ "eval_accuracy": 0.7797202797202797,
1775
+ "eval_loss": 0.706200897693634,
1776
+ "eval_runtime": 4.5273,
1777
+ "eval_samples_per_second": 63.173,
1778
+ "eval_steps_per_second": 3.976,
1779
+ "step": 3467
1780
+ },
1781
+ {
1782
+ "epoch": 144.0,
1783
+ "eval_accuracy": 0.7657342657342657,
1784
+ "eval_loss": 0.6821764707565308,
1785
+ "eval_runtime": 5.3614,
1786
+ "eval_samples_per_second": 53.344,
1787
+ "eval_steps_per_second": 3.357,
1788
+ "step": 3492
1789
+ },
1790
+ {
1791
+ "epoch": 144.33,
1792
+ "grad_norm": 1.9151145219802856,
1793
+ "learning_rate": 1.3894444444444444e-05,
1794
+ "loss": 0.2712,
1795
+ "step": 3500
1796
+ },
1797
+ {
1798
+ "epoch": 144.99,
1799
+ "eval_accuracy": 0.7762237762237763,
1800
+ "eval_loss": 0.7063603401184082,
1801
+ "eval_runtime": 4.9088,
1802
+ "eval_samples_per_second": 58.263,
1803
+ "eval_steps_per_second": 3.667,
1804
+ "step": 3516
1805
+ },
1806
+ {
1807
+ "epoch": 145.98,
1808
+ "eval_accuracy": 0.7692307692307693,
1809
+ "eval_loss": 0.7150112390518188,
1810
+ "eval_runtime": 7.2044,
1811
+ "eval_samples_per_second": 39.698,
1812
+ "eval_steps_per_second": 2.498,
1813
+ "step": 3540
1814
+ },
1815
+ {
1816
+ "epoch": 146.39,
1817
+ "grad_norm": 1.5093848705291748,
1818
+ "learning_rate": 1.3622222222222223e-05,
1819
+ "loss": 0.2726,
1820
+ "step": 3550
1821
+ },
1822
+ {
1823
+ "epoch": 146.97,
1824
+ "eval_accuracy": 0.7797202797202797,
1825
+ "eval_loss": 0.696849524974823,
1826
+ "eval_runtime": 4.9386,
1827
+ "eval_samples_per_second": 57.911,
1828
+ "eval_steps_per_second": 3.645,
1829
+ "step": 3564
1830
+ },
1831
+ {
1832
+ "epoch": 148.0,
1833
+ "eval_accuracy": 0.7727272727272727,
1834
+ "eval_loss": 0.7086759209632874,
1835
+ "eval_runtime": 4.4363,
1836
+ "eval_samples_per_second": 64.468,
1837
+ "eval_steps_per_second": 4.057,
1838
+ "step": 3589
1839
+ },
1840
+ {
1841
+ "epoch": 148.45,
1842
+ "grad_norm": 1.4403679370880127,
1843
+ "learning_rate": 1.3344444444444444e-05,
1844
+ "loss": 0.2607,
1845
+ "step": 3600
1846
+ },
1847
+ {
1848
+ "epoch": 148.99,
1849
+ "eval_accuracy": 0.7692307692307693,
1850
+ "eval_loss": 0.7129560112953186,
1851
+ "eval_runtime": 5.3809,
1852
+ "eval_samples_per_second": 53.15,
1853
+ "eval_steps_per_second": 3.345,
1854
+ "step": 3613
1855
+ },
1856
+ {
1857
+ "epoch": 149.98,
1858
+ "eval_accuracy": 0.7902097902097902,
1859
+ "eval_loss": 0.7080287933349609,
1860
+ "eval_runtime": 5.8187,
1861
+ "eval_samples_per_second": 49.152,
1862
+ "eval_steps_per_second": 3.093,
1863
+ "step": 3637
1864
+ },
1865
+ {
1866
+ "epoch": 150.52,
1867
+ "grad_norm": 2.036515235900879,
1868
+ "learning_rate": 1.3066666666666666e-05,
1869
+ "loss": 0.2546,
1870
+ "step": 3650
1871
+ },
1872
+ {
1873
+ "epoch": 150.97,
1874
+ "eval_accuracy": 0.7762237762237763,
1875
+ "eval_loss": 0.7088435888290405,
1876
+ "eval_runtime": 4.8742,
1877
+ "eval_samples_per_second": 58.677,
1878
+ "eval_steps_per_second": 3.693,
1879
+ "step": 3661
1880
+ },
1881
+ {
1882
+ "epoch": 152.0,
1883
+ "eval_accuracy": 0.7797202797202797,
1884
+ "eval_loss": 0.7030193209648132,
1885
+ "eval_runtime": 4.9492,
1886
+ "eval_samples_per_second": 57.787,
1887
+ "eval_steps_per_second": 3.637,
1888
+ "step": 3686
1889
+ },
1890
+ {
1891
+ "epoch": 152.58,
1892
+ "grad_norm": 1.200052261352539,
1893
+ "learning_rate": 1.2788888888888888e-05,
1894
+ "loss": 0.2563,
1895
+ "step": 3700
1896
+ },
1897
+ {
1898
+ "epoch": 152.99,
1899
+ "eval_accuracy": 0.7692307692307693,
1900
+ "eval_loss": 0.7077969908714294,
1901
+ "eval_runtime": 4.614,
1902
+ "eval_samples_per_second": 61.985,
1903
+ "eval_steps_per_second": 3.901,
1904
+ "step": 3710
1905
+ },
1906
+ {
1907
+ "epoch": 153.98,
1908
+ "eval_accuracy": 0.7727272727272727,
1909
+ "eval_loss": 0.700455904006958,
1910
+ "eval_runtime": 5.7657,
1911
+ "eval_samples_per_second": 49.604,
1912
+ "eval_steps_per_second": 3.122,
1913
+ "step": 3734
1914
+ },
1915
+ {
1916
+ "epoch": 154.64,
1917
+ "grad_norm": 2.2751214504241943,
1918
+ "learning_rate": 1.2511111111111112e-05,
1919
+ "loss": 0.2531,
1920
+ "step": 3750
1921
+ },
1922
+ {
1923
+ "epoch": 154.97,
1924
+ "eval_accuracy": 0.7727272727272727,
1925
+ "eval_loss": 0.7160292267799377,
1926
+ "eval_runtime": 5.1079,
1927
+ "eval_samples_per_second": 55.992,
1928
+ "eval_steps_per_second": 3.524,
1929
+ "step": 3758
1930
+ },
1931
+ {
1932
+ "epoch": 156.0,
1933
+ "eval_accuracy": 0.7797202797202797,
1934
+ "eval_loss": 0.7175909876823425,
1935
+ "eval_runtime": 5.4035,
1936
+ "eval_samples_per_second": 52.929,
1937
+ "eval_steps_per_second": 3.331,
1938
+ "step": 3783
1939
+ },
1940
+ {
1941
+ "epoch": 156.7,
1942
+ "grad_norm": 1.9024412631988525,
1943
+ "learning_rate": 1.2233333333333334e-05,
1944
+ "loss": 0.2446,
1945
+ "step": 3800
1946
+ },
1947
+ {
1948
+ "epoch": 156.99,
1949
+ "eval_accuracy": 0.7762237762237763,
1950
+ "eval_loss": 0.7190600037574768,
1951
+ "eval_runtime": 4.3633,
1952
+ "eval_samples_per_second": 65.546,
1953
+ "eval_steps_per_second": 4.125,
1954
+ "step": 3807
1955
+ },
1956
+ {
1957
+ "epoch": 157.98,
1958
+ "eval_accuracy": 0.7797202797202797,
1959
+ "eval_loss": 0.719641387462616,
1960
+ "eval_runtime": 5.0426,
1961
+ "eval_samples_per_second": 56.717,
1962
+ "eval_steps_per_second": 3.57,
1963
+ "step": 3831
1964
+ },
1965
+ {
1966
+ "epoch": 158.76,
1967
+ "grad_norm": 3.471806287765503,
1968
+ "learning_rate": 1.1955555555555556e-05,
1969
+ "loss": 0.2479,
1970
+ "step": 3850
1971
+ },
1972
+ {
1973
+ "epoch": 158.97,
1974
+ "eval_accuracy": 0.7797202797202797,
1975
+ "eval_loss": 0.7073430418968201,
1976
+ "eval_runtime": 3.6336,
1977
+ "eval_samples_per_second": 78.711,
1978
+ "eval_steps_per_second": 4.954,
1979
+ "step": 3855
1980
+ },
1981
+ {
1982
+ "epoch": 160.0,
1983
+ "eval_accuracy": 0.7797202797202797,
1984
+ "eval_loss": 0.7328661680221558,
1985
+ "eval_runtime": 5.2625,
1986
+ "eval_samples_per_second": 54.347,
1987
+ "eval_steps_per_second": 3.42,
1988
+ "step": 3880
1989
+ },
1990
+ {
1991
+ "epoch": 160.82,
1992
+ "grad_norm": 2.1171793937683105,
1993
+ "learning_rate": 1.1677777777777777e-05,
1994
+ "loss": 0.2523,
1995
+ "step": 3900
1996
+ },
1997
+ {
1998
+ "epoch": 160.99,
1999
+ "eval_accuracy": 0.7832167832167832,
2000
+ "eval_loss": 0.7158821821212769,
2001
+ "eval_runtime": 6.5877,
2002
+ "eval_samples_per_second": 43.414,
2003
+ "eval_steps_per_second": 2.732,
2004
+ "step": 3904
2005
+ },
2006
+ {
2007
+ "epoch": 161.98,
2008
+ "eval_accuracy": 0.7692307692307693,
2009
+ "eval_loss": 0.719171404838562,
2010
+ "eval_runtime": 4.5674,
2011
+ "eval_samples_per_second": 62.618,
2012
+ "eval_steps_per_second": 3.941,
2013
+ "step": 3928
2014
+ },
2015
+ {
2016
+ "epoch": 162.89,
2017
+ "grad_norm": 1.7515395879745483,
2018
+ "learning_rate": 1.1400000000000001e-05,
2019
+ "loss": 0.2523,
2020
+ "step": 3950
2021
+ },
2022
+ {
2023
+ "epoch": 162.97,
2024
+ "eval_accuracy": 0.7762237762237763,
2025
+ "eval_loss": 0.7281435132026672,
2026
+ "eval_runtime": 4.4866,
2027
+ "eval_samples_per_second": 63.746,
2028
+ "eval_steps_per_second": 4.012,
2029
+ "step": 3952
2030
+ },
2031
+ {
2032
+ "epoch": 164.0,
2033
+ "eval_accuracy": 0.7832167832167832,
2034
+ "eval_loss": 0.7078841328620911,
2035
+ "eval_runtime": 4.4241,
2036
+ "eval_samples_per_second": 64.645,
2037
+ "eval_steps_per_second": 4.069,
2038
+ "step": 3977
2039
+ },
2040
+ {
2041
+ "epoch": 164.95,
2042
+ "grad_norm": 1.456335186958313,
2043
+ "learning_rate": 1.1122222222222223e-05,
2044
+ "loss": 0.2422,
2045
+ "step": 4000
2046
+ },
2047
+ {
2048
+ "epoch": 164.99,
2049
+ "eval_accuracy": 0.7762237762237763,
2050
+ "eval_loss": 0.7161521911621094,
2051
+ "eval_runtime": 5.1239,
2052
+ "eval_samples_per_second": 55.817,
2053
+ "eval_steps_per_second": 3.513,
2054
+ "step": 4001
2055
+ },
2056
+ {
2057
+ "epoch": 165.98,
2058
+ "eval_accuracy": 0.7832167832167832,
2059
+ "eval_loss": 0.7190020084381104,
2060
+ "eval_runtime": 3.4488,
2061
+ "eval_samples_per_second": 82.926,
2062
+ "eval_steps_per_second": 5.219,
2063
+ "step": 4025
2064
+ },
2065
+ {
2066
+ "epoch": 166.97,
2067
+ "eval_accuracy": 0.7762237762237763,
2068
+ "eval_loss": 0.7311248779296875,
2069
+ "eval_runtime": 5.0389,
2070
+ "eval_samples_per_second": 56.759,
2071
+ "eval_steps_per_second": 3.572,
2072
+ "step": 4049
2073
+ },
2074
+ {
2075
+ "epoch": 167.01,
2076
+ "grad_norm": 1.2554075717926025,
2077
+ "learning_rate": 1.0844444444444445e-05,
2078
+ "loss": 0.242,
2079
+ "step": 4050
2080
+ },
2081
+ {
2082
+ "epoch": 168.0,
2083
+ "eval_accuracy": 0.7902097902097902,
2084
+ "eval_loss": 0.7110462188720703,
2085
+ "eval_runtime": 4.4612,
2086
+ "eval_samples_per_second": 64.108,
2087
+ "eval_steps_per_second": 4.035,
2088
+ "step": 4074
2089
+ },
2090
+ {
2091
+ "epoch": 168.99,
2092
+ "eval_accuracy": 0.7867132867132867,
2093
+ "eval_loss": 0.7028501629829407,
2094
+ "eval_runtime": 6.955,
2095
+ "eval_samples_per_second": 41.122,
2096
+ "eval_steps_per_second": 2.588,
2097
+ "step": 4098
2098
+ },
2099
+ {
2100
+ "epoch": 169.07,
2101
+ "grad_norm": 2.8003265857696533,
2102
+ "learning_rate": 1.0566666666666667e-05,
2103
+ "loss": 0.2392,
2104
+ "step": 4100
2105
+ },
2106
+ {
2107
+ "epoch": 169.98,
2108
+ "eval_accuracy": 0.7937062937062938,
2109
+ "eval_loss": 0.7108554840087891,
2110
+ "eval_runtime": 5.0033,
2111
+ "eval_samples_per_second": 57.162,
2112
+ "eval_steps_per_second": 3.598,
2113
+ "step": 4122
2114
+ },
2115
+ {
2116
+ "epoch": 170.97,
2117
+ "eval_accuracy": 0.7902097902097902,
2118
+ "eval_loss": 0.7106384634971619,
2119
+ "eval_runtime": 5.1984,
2120
+ "eval_samples_per_second": 55.017,
2121
+ "eval_steps_per_second": 3.463,
2122
+ "step": 4146
2123
+ },
2124
+ {
2125
+ "epoch": 171.13,
2126
+ "grad_norm": 2.1897969245910645,
2127
+ "learning_rate": 1.028888888888889e-05,
2128
+ "loss": 0.247,
2129
+ "step": 4150
2130
+ },
2131
+ {
2132
+ "epoch": 172.0,
2133
+ "eval_accuracy": 0.7867132867132867,
2134
+ "eval_loss": 0.7151694297790527,
2135
+ "eval_runtime": 5.1963,
2136
+ "eval_samples_per_second": 55.039,
2137
+ "eval_steps_per_second": 3.464,
2138
+ "step": 4171
2139
+ },
2140
+ {
2141
+ "epoch": 172.99,
2142
+ "eval_accuracy": 0.7657342657342657,
2143
+ "eval_loss": 0.7254167795181274,
2144
+ "eval_runtime": 4.4466,
2145
+ "eval_samples_per_second": 64.319,
2146
+ "eval_steps_per_second": 4.048,
2147
+ "step": 4195
2148
+ },
2149
+ {
2150
+ "epoch": 173.2,
2151
+ "grad_norm": 2.769357681274414,
2152
+ "learning_rate": 1.0011111111111112e-05,
2153
+ "loss": 0.2341,
2154
+ "step": 4200
2155
+ },
2156
+ {
2157
+ "epoch": 173.98,
2158
+ "eval_accuracy": 0.7832167832167832,
2159
+ "eval_loss": 0.7290962338447571,
2160
+ "eval_runtime": 6.2221,
2161
+ "eval_samples_per_second": 45.965,
2162
+ "eval_steps_per_second": 2.893,
2163
+ "step": 4219
2164
+ },
2165
+ {
2166
+ "epoch": 174.97,
2167
+ "eval_accuracy": 0.7867132867132867,
2168
+ "eval_loss": 0.7088623046875,
2169
+ "eval_runtime": 4.3709,
2170
+ "eval_samples_per_second": 65.433,
2171
+ "eval_steps_per_second": 4.118,
2172
+ "step": 4243
2173
+ },
2174
+ {
2175
+ "epoch": 175.26,
2176
+ "grad_norm": 2.044703483581543,
2177
+ "learning_rate": 9.733333333333332e-06,
2178
+ "loss": 0.2317,
2179
+ "step": 4250
2180
+ },
2181
+ {
2182
+ "epoch": 176.0,
2183
+ "eval_accuracy": 0.7902097902097902,
2184
+ "eval_loss": 0.7185826897621155,
2185
+ "eval_runtime": 5.4095,
2186
+ "eval_samples_per_second": 52.87,
2187
+ "eval_steps_per_second": 3.327,
2188
+ "step": 4268
2189
+ },
2190
+ {
2191
+ "epoch": 176.99,
2192
+ "eval_accuracy": 0.7797202797202797,
2193
+ "eval_loss": 0.7167823314666748,
2194
+ "eval_runtime": 4.9506,
2195
+ "eval_samples_per_second": 57.77,
2196
+ "eval_steps_per_second": 3.636,
2197
+ "step": 4292
2198
+ },
2199
+ {
2200
+ "epoch": 177.32,
2201
+ "grad_norm": 1.078834056854248,
2202
+ "learning_rate": 9.455555555555556e-06,
2203
+ "loss": 0.2269,
2204
+ "step": 4300
2205
+ },
2206
+ {
2207
+ "epoch": 177.98,
2208
+ "eval_accuracy": 0.7902097902097902,
2209
+ "eval_loss": 0.7237738966941833,
2210
+ "eval_runtime": 4.781,
2211
+ "eval_samples_per_second": 59.82,
2212
+ "eval_steps_per_second": 3.765,
2213
+ "step": 4316
2214
+ },
2215
+ {
2216
+ "epoch": 178.97,
2217
+ "eval_accuracy": 0.7867132867132867,
2218
+ "eval_loss": 0.7131801247596741,
2219
+ "eval_runtime": 4.6869,
2220
+ "eval_samples_per_second": 61.022,
2221
+ "eval_steps_per_second": 3.841,
2222
+ "step": 4340
2223
+ },
2224
+ {
2225
+ "epoch": 179.38,
2226
+ "grad_norm": 2.008120536804199,
2227
+ "learning_rate": 9.177777777777778e-06,
2228
+ "loss": 0.2283,
2229
+ "step": 4350
2230
+ },
2231
+ {
2232
+ "epoch": 180.0,
2233
+ "eval_accuracy": 0.7797202797202797,
2234
+ "eval_loss": 0.7384253144264221,
2235
+ "eval_runtime": 4.5879,
2236
+ "eval_samples_per_second": 62.338,
2237
+ "eval_steps_per_second": 3.923,
2238
+ "step": 4365
2239
+ },
2240
+ {
2241
+ "epoch": 180.99,
2242
+ "eval_accuracy": 0.7902097902097902,
2243
+ "eval_loss": 0.7002861499786377,
2244
+ "eval_runtime": 5.3238,
2245
+ "eval_samples_per_second": 53.721,
2246
+ "eval_steps_per_second": 3.381,
2247
+ "step": 4389
2248
+ },
2249
+ {
2250
+ "epoch": 181.44,
2251
+ "grad_norm": 1.9518792629241943,
2252
+ "learning_rate": 8.900000000000001e-06,
2253
+ "loss": 0.2303,
2254
+ "step": 4400
2255
+ },
2256
+ {
2257
+ "epoch": 181.98,
2258
+ "eval_accuracy": 0.7797202797202797,
2259
+ "eval_loss": 0.7278482913970947,
2260
+ "eval_runtime": 5.8358,
2261
+ "eval_samples_per_second": 49.008,
2262
+ "eval_steps_per_second": 3.084,
2263
+ "step": 4413
2264
+ },
2265
+ {
2266
+ "epoch": 182.97,
2267
+ "eval_accuracy": 0.7832167832167832,
2268
+ "eval_loss": 0.7143127918243408,
2269
+ "eval_runtime": 6.1229,
2270
+ "eval_samples_per_second": 46.71,
2271
+ "eval_steps_per_second": 2.94,
2272
+ "step": 4437
2273
+ },
2274
+ {
2275
+ "epoch": 183.51,
2276
+ "grad_norm": 1.0936890840530396,
2277
+ "learning_rate": 8.622222222222221e-06,
2278
+ "loss": 0.2109,
2279
+ "step": 4450
2280
+ },
2281
+ {
2282
+ "epoch": 184.0,
2283
+ "eval_accuracy": 0.7797202797202797,
2284
+ "eval_loss": 0.7406834363937378,
2285
+ "eval_runtime": 5.0467,
2286
+ "eval_samples_per_second": 56.671,
2287
+ "eval_steps_per_second": 3.567,
2288
+ "step": 4462
2289
+ },
2290
+ {
2291
+ "epoch": 184.99,
2292
+ "eval_accuracy": 0.7797202797202797,
2293
+ "eval_loss": 0.7053534388542175,
2294
+ "eval_runtime": 5.279,
2295
+ "eval_samples_per_second": 54.177,
2296
+ "eval_steps_per_second": 3.41,
2297
+ "step": 4486
2298
+ },
2299
+ {
2300
+ "epoch": 185.57,
2301
+ "grad_norm": 2.9350059032440186,
2302
+ "learning_rate": 8.344444444444445e-06,
2303
+ "loss": 0.2261,
2304
+ "step": 4500
2305
+ },
2306
+ {
2307
+ "epoch": 185.98,
2308
+ "eval_accuracy": 0.7727272727272727,
2309
+ "eval_loss": 0.7260809540748596,
2310
+ "eval_runtime": 5.4165,
2311
+ "eval_samples_per_second": 52.802,
2312
+ "eval_steps_per_second": 3.323,
2313
+ "step": 4510
2314
+ },
2315
+ {
2316
+ "epoch": 186.97,
2317
+ "eval_accuracy": 0.7902097902097902,
2318
+ "eval_loss": 0.7240064144134521,
2319
+ "eval_runtime": 5.4866,
2320
+ "eval_samples_per_second": 52.127,
2321
+ "eval_steps_per_second": 3.281,
2322
+ "step": 4534
2323
+ },
2324
+ {
2325
+ "epoch": 187.63,
2326
+ "grad_norm": 1.8322782516479492,
2327
+ "learning_rate": 8.066666666666667e-06,
2328
+ "loss": 0.2282,
2329
+ "step": 4550
2330
+ },
2331
+ {
2332
+ "epoch": 188.0,
2333
+ "eval_accuracy": 0.7867132867132867,
2334
+ "eval_loss": 0.7199599146842957,
2335
+ "eval_runtime": 4.6736,
2336
+ "eval_samples_per_second": 61.195,
2337
+ "eval_steps_per_second": 3.851,
2338
+ "step": 4559
2339
+ },
2340
+ {
2341
+ "epoch": 188.99,
2342
+ "eval_accuracy": 0.7797202797202797,
2343
+ "eval_loss": 0.7102844715118408,
2344
+ "eval_runtime": 5.4219,
2345
+ "eval_samples_per_second": 52.749,
2346
+ "eval_steps_per_second": 3.32,
2347
+ "step": 4583
2348
+ },
2349
+ {
2350
+ "epoch": 189.69,
2351
+ "grad_norm": 1.8777916431427002,
2352
+ "learning_rate": 7.78888888888889e-06,
2353
+ "loss": 0.2321,
2354
+ "step": 4600
2355
+ },
2356
+ {
2357
+ "epoch": 189.98,
2358
+ "eval_accuracy": 0.7797202797202797,
2359
+ "eval_loss": 0.7083376049995422,
2360
+ "eval_runtime": 5.9634,
2361
+ "eval_samples_per_second": 47.959,
2362
+ "eval_steps_per_second": 3.018,
2363
+ "step": 4607
2364
+ },
2365
+ {
2366
+ "epoch": 190.97,
2367
+ "eval_accuracy": 0.7832167832167832,
2368
+ "eval_loss": 0.7244677543640137,
2369
+ "eval_runtime": 5.2078,
2370
+ "eval_samples_per_second": 54.918,
2371
+ "eval_steps_per_second": 3.456,
2372
+ "step": 4631
2373
+ },
2374
+ {
2375
+ "epoch": 191.75,
2376
+ "grad_norm": 1.5277408361434937,
2377
+ "learning_rate": 7.5111111111111105e-06,
2378
+ "loss": 0.2261,
2379
+ "step": 4650
2380
+ },
2381
+ {
2382
+ "epoch": 192.0,
2383
+ "eval_accuracy": 0.7867132867132867,
2384
+ "eval_loss": 0.7124583721160889,
2385
+ "eval_runtime": 5.7079,
2386
+ "eval_samples_per_second": 50.106,
2387
+ "eval_steps_per_second": 3.154,
2388
+ "step": 4656
2389
+ },
2390
+ {
2391
+ "epoch": 192.99,
2392
+ "eval_accuracy": 0.7867132867132867,
2393
+ "eval_loss": 0.7308976054191589,
2394
+ "eval_runtime": 5.3404,
2395
+ "eval_samples_per_second": 53.554,
2396
+ "eval_steps_per_second": 3.371,
2397
+ "step": 4680
2398
+ },
2399
+ {
2400
+ "epoch": 193.81,
2401
+ "grad_norm": 2.095749616622925,
2402
+ "learning_rate": 7.233333333333333e-06,
2403
+ "loss": 0.2231,
2404
+ "step": 4700
2405
+ },
2406
+ {
2407
+ "epoch": 193.98,
2408
+ "eval_accuracy": 0.7832167832167832,
2409
+ "eval_loss": 0.7237818837165833,
2410
+ "eval_runtime": 4.6666,
2411
+ "eval_samples_per_second": 61.286,
2412
+ "eval_steps_per_second": 3.857,
2413
+ "step": 4704
2414
+ },
2415
+ {
2416
+ "epoch": 194.97,
2417
+ "eval_accuracy": 0.7832167832167832,
2418
+ "eval_loss": 0.7253320217132568,
2419
+ "eval_runtime": 5.8059,
2420
+ "eval_samples_per_second": 49.261,
2421
+ "eval_steps_per_second": 3.1,
2422
+ "step": 4728
2423
+ },
2424
+ {
2425
+ "epoch": 195.88,
2426
+ "grad_norm": 1.6955636739730835,
2427
+ "learning_rate": 6.955555555555556e-06,
2428
+ "loss": 0.2083,
2429
+ "step": 4750
2430
+ },
2431
+ {
2432
+ "epoch": 196.0,
2433
+ "eval_accuracy": 0.7832167832167832,
2434
+ "eval_loss": 0.7240011692047119,
2435
+ "eval_runtime": 6.0767,
2436
+ "eval_samples_per_second": 47.065,
2437
+ "eval_steps_per_second": 2.962,
2438
+ "step": 4753
2439
+ },
2440
+ {
2441
+ "epoch": 196.99,
2442
+ "eval_accuracy": 0.7832167832167832,
2443
+ "eval_loss": 0.7131750583648682,
2444
+ "eval_runtime": 5.3063,
2445
+ "eval_samples_per_second": 53.898,
2446
+ "eval_steps_per_second": 3.392,
2447
+ "step": 4777
2448
+ },
2449
+ {
2450
+ "epoch": 197.94,
2451
+ "grad_norm": 0.8933289051055908,
2452
+ "learning_rate": 6.677777777777778e-06,
2453
+ "loss": 0.2116,
2454
+ "step": 4800
2455
+ },
2456
+ {
2457
+ "epoch": 197.98,
2458
+ "eval_accuracy": 0.7867132867132867,
2459
+ "eval_loss": 0.7169559597969055,
2460
+ "eval_runtime": 5.5713,
2461
+ "eval_samples_per_second": 51.335,
2462
+ "eval_steps_per_second": 3.231,
2463
+ "step": 4801
2464
+ },
2465
+ {
2466
+ "epoch": 198.97,
2467
+ "eval_accuracy": 0.7832167832167832,
2468
+ "eval_loss": 0.7265609502792358,
2469
+ "eval_runtime": 4.1397,
2470
+ "eval_samples_per_second": 69.087,
2471
+ "eval_steps_per_second": 4.348,
2472
+ "step": 4825
2473
+ },
2474
+ {
2475
+ "epoch": 200.0,
2476
+ "grad_norm": 2.175414562225342,
2477
+ "learning_rate": 6.4000000000000006e-06,
2478
+ "loss": 0.2219,
2479
+ "step": 4850
2480
+ },
2481
+ {
2482
+ "epoch": 200.0,
2483
+ "eval_accuracy": 0.7832167832167832,
2484
+ "eval_loss": 0.7162622213363647,
2485
+ "eval_runtime": 5.2016,
2486
+ "eval_samples_per_second": 54.984,
2487
+ "eval_steps_per_second": 3.461,
2488
+ "step": 4850
2489
+ },
2490
+ {
2491
+ "epoch": 200.99,
2492
+ "eval_accuracy": 0.7797202797202797,
2493
+ "eval_loss": 0.7302048802375793,
2494
+ "eval_runtime": 4.9222,
2495
+ "eval_samples_per_second": 58.104,
2496
+ "eval_steps_per_second": 3.657,
2497
+ "step": 4874
2498
+ },
2499
+ {
2500
+ "epoch": 201.98,
2501
+ "eval_accuracy": 0.7832167832167832,
2502
+ "eval_loss": 0.7223746180534363,
2503
+ "eval_runtime": 4.6884,
2504
+ "eval_samples_per_second": 61.002,
2505
+ "eval_steps_per_second": 3.839,
2506
+ "step": 4898
2507
+ },
2508
+ {
2509
+ "epoch": 202.06,
2510
+ "grad_norm": 2.053739309310913,
2511
+ "learning_rate": 6.1222222222222224e-06,
2512
+ "loss": 0.2183,
2513
+ "step": 4900
2514
+ },
2515
+ {
2516
+ "epoch": 202.97,
2517
+ "eval_accuracy": 0.7797202797202797,
2518
+ "eval_loss": 0.7179226279258728,
2519
+ "eval_runtime": 4.5556,
2520
+ "eval_samples_per_second": 62.78,
2521
+ "eval_steps_per_second": 3.951,
2522
+ "step": 4922
2523
+ },
2524
+ {
2525
+ "epoch": 204.0,
2526
+ "eval_accuracy": 0.7797202797202797,
2527
+ "eval_loss": 0.7245286107063293,
2528
+ "eval_runtime": 5.7474,
2529
+ "eval_samples_per_second": 49.762,
2530
+ "eval_steps_per_second": 3.132,
2531
+ "step": 4947
2532
+ },
2533
+ {
2534
+ "epoch": 204.12,
2535
+ "grad_norm": 1.1081063747406006,
2536
+ "learning_rate": 5.844444444444444e-06,
2537
+ "loss": 0.2053,
2538
+ "step": 4950
2539
+ },
2540
+ {
2541
+ "epoch": 204.99,
2542
+ "eval_accuracy": 0.7832167832167832,
2543
+ "eval_loss": 0.7344977259635925,
2544
+ "eval_runtime": 5.4178,
2545
+ "eval_samples_per_second": 52.789,
2546
+ "eval_steps_per_second": 3.322,
2547
+ "step": 4971
2548
+ },
2549
+ {
2550
+ "epoch": 205.98,
2551
+ "eval_accuracy": 0.7832167832167832,
2552
+ "eval_loss": 0.7249557971954346,
2553
+ "eval_runtime": 5.6352,
2554
+ "eval_samples_per_second": 50.753,
2555
+ "eval_steps_per_second": 3.194,
2556
+ "step": 4995
2557
+ },
2558
+ {
2559
+ "epoch": 206.19,
2560
+ "grad_norm": 1.09213125705719,
2561
+ "learning_rate": 5.566666666666667e-06,
2562
+ "loss": 0.2113,
2563
+ "step": 5000
2564
+ },
2565
+ {
2566
+ "epoch": 206.97,
2567
+ "eval_accuracy": 0.7832167832167832,
2568
+ "eval_loss": 0.7246001958847046,
2569
+ "eval_runtime": 4.9071,
2570
+ "eval_samples_per_second": 58.283,
2571
+ "eval_steps_per_second": 3.668,
2572
+ "step": 5019
2573
+ },
2574
+ {
2575
+ "epoch": 208.0,
2576
+ "eval_accuracy": 0.7867132867132867,
2577
+ "eval_loss": 0.7270117998123169,
2578
+ "eval_runtime": 5.8385,
2579
+ "eval_samples_per_second": 48.985,
2580
+ "eval_steps_per_second": 3.083,
2581
+ "step": 5044
2582
+ },
2583
+ {
2584
+ "epoch": 208.25,
2585
+ "grad_norm": 1.6693130731582642,
2586
+ "learning_rate": 5.288888888888889e-06,
2587
+ "loss": 0.2152,
2588
+ "step": 5050
2589
+ },
2590
+ {
2591
+ "epoch": 208.99,
2592
+ "eval_accuracy": 0.7867132867132867,
2593
+ "eval_loss": 0.7285901308059692,
2594
+ "eval_runtime": 5.489,
2595
+ "eval_samples_per_second": 52.104,
2596
+ "eval_steps_per_second": 3.279,
2597
+ "step": 5068
2598
+ },
2599
+ {
2600
+ "epoch": 209.98,
2601
+ "eval_accuracy": 0.7797202797202797,
2602
+ "eval_loss": 0.7332947254180908,
2603
+ "eval_runtime": 5.3017,
2604
+ "eval_samples_per_second": 53.945,
2605
+ "eval_steps_per_second": 3.395,
2606
+ "step": 5092
2607
+ },
2608
+ {
2609
+ "epoch": 210.31,
2610
+ "grad_norm": 2.0511515140533447,
2611
+ "learning_rate": 5.011111111111112e-06,
2612
+ "loss": 0.2129,
2613
+ "step": 5100
2614
+ },
2615
+ {
2616
+ "epoch": 210.97,
2617
+ "eval_accuracy": 0.7797202797202797,
2618
+ "eval_loss": 0.7307863831520081,
2619
+ "eval_runtime": 5.2991,
2620
+ "eval_samples_per_second": 53.971,
2621
+ "eval_steps_per_second": 3.397,
2622
+ "step": 5116
2623
+ },
2624
+ {
2625
+ "epoch": 212.0,
2626
+ "eval_accuracy": 0.7797202797202797,
2627
+ "eval_loss": 0.7176437973976135,
2628
+ "eval_runtime": 4.9452,
2629
+ "eval_samples_per_second": 57.834,
2630
+ "eval_steps_per_second": 3.64,
2631
+ "step": 5141
2632
+ },
2633
+ {
2634
+ "epoch": 212.37,
2635
+ "grad_norm": 1.8491023778915405,
2636
+ "learning_rate": 4.7333333333333335e-06,
2637
+ "loss": 0.2173,
2638
+ "step": 5150
2639
+ },
2640
+ {
2641
+ "epoch": 212.99,
2642
+ "eval_accuracy": 0.7832167832167832,
2643
+ "eval_loss": 0.7334882020950317,
2644
+ "eval_runtime": 4.9602,
2645
+ "eval_samples_per_second": 57.659,
2646
+ "eval_steps_per_second": 3.629,
2647
+ "step": 5165
2648
+ },
2649
+ {
2650
+ "epoch": 213.98,
2651
+ "eval_accuracy": 0.7797202797202797,
2652
+ "eval_loss": 0.7268483638763428,
2653
+ "eval_runtime": 5.885,
2654
+ "eval_samples_per_second": 48.598,
2655
+ "eval_steps_per_second": 3.059,
2656
+ "step": 5189
2657
+ },
2658
+ {
2659
+ "epoch": 214.43,
2660
+ "grad_norm": 1.2067769765853882,
2661
+ "learning_rate": 4.455555555555556e-06,
2662
+ "loss": 0.2042,
2663
+ "step": 5200
2664
+ },
2665
+ {
2666
+ "epoch": 214.97,
2667
+ "eval_accuracy": 0.7902097902097902,
2668
+ "eval_loss": 0.7299237847328186,
2669
+ "eval_runtime": 5.7645,
2670
+ "eval_samples_per_second": 49.614,
2671
+ "eval_steps_per_second": 3.123,
2672
+ "step": 5213
2673
+ },
2674
+ {
2675
+ "epoch": 216.0,
2676
+ "eval_accuracy": 0.7902097902097902,
2677
+ "eval_loss": 0.7360625863075256,
2678
+ "eval_runtime": 4.7143,
2679
+ "eval_samples_per_second": 60.667,
2680
+ "eval_steps_per_second": 3.818,
2681
+ "step": 5238
2682
+ },
2683
+ {
2684
+ "epoch": 216.49,
2685
+ "grad_norm": 1.3863427639007568,
2686
+ "learning_rate": 4.177777777777777e-06,
2687
+ "loss": 0.2112,
2688
+ "step": 5250
2689
+ },
2690
+ {
2691
+ "epoch": 216.99,
2692
+ "eval_accuracy": 0.7902097902097902,
2693
+ "eval_loss": 0.723866879940033,
2694
+ "eval_runtime": 5.3445,
2695
+ "eval_samples_per_second": 53.513,
2696
+ "eval_steps_per_second": 3.368,
2697
+ "step": 5262
2698
+ },
2699
+ {
2700
+ "epoch": 217.98,
2701
+ "eval_accuracy": 0.7832167832167832,
2702
+ "eval_loss": 0.7252445220947266,
2703
+ "eval_runtime": 4.6314,
2704
+ "eval_samples_per_second": 61.753,
2705
+ "eval_steps_per_second": 3.887,
2706
+ "step": 5286
2707
+ },
2708
+ {
2709
+ "epoch": 218.56,
2710
+ "grad_norm": 1.1177924871444702,
2711
+ "learning_rate": 3.9e-06,
2712
+ "loss": 0.2007,
2713
+ "step": 5300
2714
+ },
2715
+ {
2716
+ "epoch": 218.97,
2717
+ "eval_accuracy": 0.7867132867132867,
2718
+ "eval_loss": 0.719983696937561,
2719
+ "eval_runtime": 4.865,
2720
+ "eval_samples_per_second": 58.787,
2721
+ "eval_steps_per_second": 3.7,
2722
+ "step": 5310
2723
+ },
2724
+ {
2725
+ "epoch": 220.0,
2726
+ "eval_accuracy": 0.7867132867132867,
2727
+ "eval_loss": 0.7195786237716675,
2728
+ "eval_runtime": 5.5422,
2729
+ "eval_samples_per_second": 51.604,
2730
+ "eval_steps_per_second": 3.248,
2731
+ "step": 5335
2732
+ },
2733
+ {
2734
+ "epoch": 220.62,
2735
+ "grad_norm": 1.413304090499878,
2736
+ "learning_rate": 3.6222222222222226e-06,
2737
+ "loss": 0.2163,
2738
+ "step": 5350
2739
+ },
2740
+ {
2741
+ "epoch": 220.99,
2742
+ "eval_accuracy": 0.7902097902097902,
2743
+ "eval_loss": 0.7309580445289612,
2744
+ "eval_runtime": 5.2512,
2745
+ "eval_samples_per_second": 54.463,
2746
+ "eval_steps_per_second": 3.428,
2747
+ "step": 5359
2748
+ },
2749
+ {
2750
+ "epoch": 221.98,
2751
+ "eval_accuracy": 0.7867132867132867,
2752
+ "eval_loss": 0.7313971519470215,
2753
+ "eval_runtime": 5.1151,
2754
+ "eval_samples_per_second": 55.913,
2755
+ "eval_steps_per_second": 3.519,
2756
+ "step": 5383
2757
+ },
2758
+ {
2759
+ "epoch": 222.68,
2760
+ "grad_norm": 3.0471901893615723,
2761
+ "learning_rate": 3.3444444444444445e-06,
2762
+ "loss": 0.2141,
2763
+ "step": 5400
2764
+ },
2765
+ {
2766
+ "epoch": 222.97,
2767
+ "eval_accuracy": 0.7832167832167832,
2768
+ "eval_loss": 0.727938175201416,
2769
+ "eval_runtime": 4.6405,
2770
+ "eval_samples_per_second": 61.631,
2771
+ "eval_steps_per_second": 3.879,
2772
+ "step": 5407
2773
+ },
2774
+ {
2775
+ "epoch": 224.0,
2776
+ "eval_accuracy": 0.7902097902097902,
2777
+ "eval_loss": 0.725923478603363,
2778
+ "eval_runtime": 5.0906,
2779
+ "eval_samples_per_second": 56.182,
2780
+ "eval_steps_per_second": 3.536,
2781
+ "step": 5432
2782
+ }
2783
+ ],
2784
+ "logging_steps": 50,
2785
+ "max_steps": 6000,
2786
+ "num_input_tokens_seen": 0,
2787
+ "num_train_epochs": 250,
2788
+ "save_steps": 500,
2789
+ "total_flos": 3.037085846065152e+18,
2790
+ "train_batch_size": 16,
2791
+ "trial_name": null,
2792
+ "trial_params": null
2793
+ }
checkpoint-5432/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3218c866c7b253f5d3295edcd44b4197864747f68600a16bb0f3d6f506131fb
3
+ size 4984
config.json ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "anderloh/Hugginhface-master-wav2vec-pretreined-5-class-train-test",
3
+ "activation_dropout": 0.0,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForSequenceClassification"
11
+ ],
12
+ "attention_dropout": 0.0,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 128,
15
+ "codevector_dim": 128,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": true,
18
+ "conv_dim": [
19
+ 256,
20
+ 256,
21
+ 256,
22
+ 256,
23
+ 256,
24
+ 256,
25
+ 256
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "sum",
46
+ "ctc_zero_infinity": false,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": true,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_dropout": 0.0,
52
+ "feat_extract_norm": "layer",
53
+ "feat_proj_dropout": 0.0,
54
+ "feat_quantizer_dropout": 0.0,
55
+ "final_dropout": 0.0,
56
+ "finetuning_task": "audio-classification",
57
+ "hidden_act": "gelu",
58
+ "hidden_dropout": 0.0,
59
+ "hidden_dropout_prob": 0.0,
60
+ "hidden_size": 384,
61
+ "id2label": {
62
+ "0": "Helicopter",
63
+ "1": "Jet",
64
+ "2": "Racecar",
65
+ "3": "Rail",
66
+ "4": "Truck"
67
+ },
68
+ "initializer_range": 0.02,
69
+ "intermediate_size": 1536,
70
+ "label2id": {
71
+ "Helicopter": "0",
72
+ "Jet": "1",
73
+ "Racecar": "2",
74
+ "Rail": "3",
75
+ "Truck": "4"
76
+ },
77
+ "layer_norm_eps": 1e-05,
78
+ "layerdrop": 0.0,
79
+ "mask_feature_length": 10,
80
+ "mask_feature_min_masks": 0,
81
+ "mask_feature_prob": 0.0,
82
+ "mask_time_length": 10,
83
+ "mask_time_min_masks": 2,
84
+ "mask_time_prob": 0.65,
85
+ "model_type": "wav2vec2",
86
+ "num_adapter_layers": 3,
87
+ "num_attention_heads": 6,
88
+ "num_codevector_groups": 2,
89
+ "num_codevectors_per_group": 320,
90
+ "num_conv_pos_embedding_groups": 16,
91
+ "num_conv_pos_embeddings": 128,
92
+ "num_feat_extract_layers": 7,
93
+ "num_hidden_layers": 6,
94
+ "num_negatives": 100,
95
+ "output_hidden_size": 384,
96
+ "pad_token_id": 0,
97
+ "proj_codevector_dim": 128,
98
+ "tdnn_dilation": [
99
+ 1,
100
+ 2,
101
+ 3,
102
+ 1,
103
+ 1
104
+ ],
105
+ "tdnn_dim": [
106
+ 512,
107
+ 512,
108
+ 512,
109
+ 512,
110
+ 1500
111
+ ],
112
+ "tdnn_kernel": [
113
+ 5,
114
+ 3,
115
+ 3,
116
+ 1,
117
+ 1
118
+ ],
119
+ "torch_dtype": "float32",
120
+ "transformers_version": "4.39.0.dev0",
121
+ "use_weighted_layer_sum": false,
122
+ "vocab_size": 32,
123
+ "xvector_output_dim": 512
124
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12b72afca351d5838b740f0ec6003c2e1e2a8c0f5156e6629ad3e2ef735bb540
3
+ size 52151348
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd83ae850a6f38e955a9a0ecf3728e21f005733ae6ae7f948544118441a4714b
3
+ size 95909946
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0524d4c5bb50cb3a888e246202453f6e7f310c8e7d978c2791811f64716d2d2c
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66b1ff4c92709ee514d60150ad1c67a13e001dac71642548c74f443c1156d358
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,2793 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7937062937062938,
3
+ "best_model_checkpoint": "wav2vec2-5Class-train-test-finetune/checkpoint-4122",
4
+ "epoch": 224.0,
5
+ "eval_steps": 500,
6
+ "global_step": 5432,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.99,
13
+ "eval_accuracy": 0.34265734265734266,
14
+ "eval_loss": 1.5984586477279663,
15
+ "eval_runtime": 5.3437,
16
+ "eval_samples_per_second": 53.521,
17
+ "eval_steps_per_second": 3.368,
18
+ "step": 24
19
+ },
20
+ {
21
+ "epoch": 1.98,
22
+ "eval_accuracy": 0.33916083916083917,
23
+ "eval_loss": 1.5969289541244507,
24
+ "eval_runtime": 3.8653,
25
+ "eval_samples_per_second": 73.992,
26
+ "eval_steps_per_second": 4.657,
27
+ "step": 48
28
+ },
29
+ {
30
+ "epoch": 2.06,
31
+ "grad_norm": 1.0544973611831665,
32
+ "learning_rate": 2.4999999999999998e-06,
33
+ "loss": 1.5969,
34
+ "step": 50
35
+ },
36
+ {
37
+ "epoch": 2.97,
38
+ "eval_accuracy": 0.32867132867132864,
39
+ "eval_loss": 1.5943816900253296,
40
+ "eval_runtime": 6.1748,
41
+ "eval_samples_per_second": 46.317,
42
+ "eval_steps_per_second": 2.915,
43
+ "step": 72
44
+ },
45
+ {
46
+ "epoch": 4.0,
47
+ "eval_accuracy": 0.3146853146853147,
48
+ "eval_loss": 1.5906767845153809,
49
+ "eval_runtime": 5.1678,
50
+ "eval_samples_per_second": 55.343,
51
+ "eval_steps_per_second": 3.483,
52
+ "step": 97
53
+ },
54
+ {
55
+ "epoch": 4.12,
56
+ "grad_norm": 0.8443157076835632,
57
+ "learning_rate": 4.9999999999999996e-06,
58
+ "loss": 1.5896,
59
+ "step": 100
60
+ },
61
+ {
62
+ "epoch": 4.99,
63
+ "eval_accuracy": 0.2972027972027972,
64
+ "eval_loss": 1.5860023498535156,
65
+ "eval_runtime": 4.9416,
66
+ "eval_samples_per_second": 57.876,
67
+ "eval_steps_per_second": 3.643,
68
+ "step": 121
69
+ },
70
+ {
71
+ "epoch": 5.98,
72
+ "eval_accuracy": 0.2692307692307692,
73
+ "eval_loss": 1.5806005001068115,
74
+ "eval_runtime": 4.1837,
75
+ "eval_samples_per_second": 68.36,
76
+ "eval_steps_per_second": 4.302,
77
+ "step": 145
78
+ },
79
+ {
80
+ "epoch": 6.19,
81
+ "grad_norm": 1.0938074588775635,
82
+ "learning_rate": 7.5e-06,
83
+ "loss": 1.5743,
84
+ "step": 150
85
+ },
86
+ {
87
+ "epoch": 6.97,
88
+ "eval_accuracy": 0.25874125874125875,
89
+ "eval_loss": 1.5742768049240112,
90
+ "eval_runtime": 7.1914,
91
+ "eval_samples_per_second": 39.77,
92
+ "eval_steps_per_second": 2.503,
93
+ "step": 169
94
+ },
95
+ {
96
+ "epoch": 8.0,
97
+ "eval_accuracy": 0.23426573426573427,
98
+ "eval_loss": 1.5664165019989014,
99
+ "eval_runtime": 5.6489,
100
+ "eval_samples_per_second": 50.629,
101
+ "eval_steps_per_second": 3.186,
102
+ "step": 194
103
+ },
104
+ {
105
+ "epoch": 8.25,
106
+ "grad_norm": 0.9692079424858093,
107
+ "learning_rate": 9.999999999999999e-06,
108
+ "loss": 1.5508,
109
+ "step": 200
110
+ },
111
+ {
112
+ "epoch": 8.99,
113
+ "eval_accuracy": 0.22727272727272727,
114
+ "eval_loss": 1.557572841644287,
115
+ "eval_runtime": 5.5182,
116
+ "eval_samples_per_second": 51.828,
117
+ "eval_steps_per_second": 3.262,
118
+ "step": 218
119
+ },
120
+ {
121
+ "epoch": 9.98,
122
+ "eval_accuracy": 0.22727272727272727,
123
+ "eval_loss": 1.5482373237609863,
124
+ "eval_runtime": 5.3205,
125
+ "eval_samples_per_second": 53.754,
126
+ "eval_steps_per_second": 3.383,
127
+ "step": 242
128
+ },
129
+ {
130
+ "epoch": 10.31,
131
+ "grad_norm": 1.02046799659729,
132
+ "learning_rate": 1.25e-05,
133
+ "loss": 1.5157,
134
+ "step": 250
135
+ },
136
+ {
137
+ "epoch": 10.97,
138
+ "eval_accuracy": 0.22727272727272727,
139
+ "eval_loss": 1.539355993270874,
140
+ "eval_runtime": 6.3116,
141
+ "eval_samples_per_second": 45.313,
142
+ "eval_steps_per_second": 2.852,
143
+ "step": 266
144
+ },
145
+ {
146
+ "epoch": 12.0,
147
+ "eval_accuracy": 0.22727272727272727,
148
+ "eval_loss": 1.5350520610809326,
149
+ "eval_runtime": 4.3422,
150
+ "eval_samples_per_second": 65.865,
151
+ "eval_steps_per_second": 4.145,
152
+ "step": 291
153
+ },
154
+ {
155
+ "epoch": 12.37,
156
+ "grad_norm": 1.6058833599090576,
157
+ "learning_rate": 1.5e-05,
158
+ "loss": 1.4534,
159
+ "step": 300
160
+ },
161
+ {
162
+ "epoch": 12.99,
163
+ "eval_accuracy": 0.22727272727272727,
164
+ "eval_loss": 1.5525730848312378,
165
+ "eval_runtime": 5.245,
166
+ "eval_samples_per_second": 54.528,
167
+ "eval_steps_per_second": 3.432,
168
+ "step": 315
169
+ },
170
+ {
171
+ "epoch": 13.98,
172
+ "eval_accuracy": 0.22727272727272727,
173
+ "eval_loss": 1.599926471710205,
174
+ "eval_runtime": 6.0088,
175
+ "eval_samples_per_second": 47.597,
176
+ "eval_steps_per_second": 2.996,
177
+ "step": 339
178
+ },
179
+ {
180
+ "epoch": 14.43,
181
+ "grad_norm": 0.8243080377578735,
182
+ "learning_rate": 1.7500000000000002e-05,
183
+ "loss": 1.3638,
184
+ "step": 350
185
+ },
186
+ {
187
+ "epoch": 14.97,
188
+ "eval_accuracy": 0.22727272727272727,
189
+ "eval_loss": 1.5896875858306885,
190
+ "eval_runtime": 4.8752,
191
+ "eval_samples_per_second": 58.664,
192
+ "eval_steps_per_second": 3.692,
193
+ "step": 363
194
+ },
195
+ {
196
+ "epoch": 16.0,
197
+ "eval_accuracy": 0.26573426573426573,
198
+ "eval_loss": 1.560091495513916,
199
+ "eval_runtime": 5.5082,
200
+ "eval_samples_per_second": 51.922,
201
+ "eval_steps_per_second": 3.268,
202
+ "step": 388
203
+ },
204
+ {
205
+ "epoch": 16.49,
206
+ "grad_norm": 0.7977257370948792,
207
+ "learning_rate": 1.9999999999999998e-05,
208
+ "loss": 1.2951,
209
+ "step": 400
210
+ },
211
+ {
212
+ "epoch": 16.99,
213
+ "eval_accuracy": 0.2937062937062937,
214
+ "eval_loss": 1.5349317789077759,
215
+ "eval_runtime": 4.7526,
216
+ "eval_samples_per_second": 60.178,
217
+ "eval_steps_per_second": 3.787,
218
+ "step": 412
219
+ },
220
+ {
221
+ "epoch": 17.98,
222
+ "eval_accuracy": 0.34265734265734266,
223
+ "eval_loss": 1.5053907632827759,
224
+ "eval_runtime": 4.8638,
225
+ "eval_samples_per_second": 58.801,
226
+ "eval_steps_per_second": 3.701,
227
+ "step": 436
228
+ },
229
+ {
230
+ "epoch": 18.56,
231
+ "grad_norm": 0.7064552903175354,
232
+ "learning_rate": 2.25e-05,
233
+ "loss": 1.2369,
234
+ "step": 450
235
+ },
236
+ {
237
+ "epoch": 18.97,
238
+ "eval_accuracy": 0.3741258741258741,
239
+ "eval_loss": 1.4689087867736816,
240
+ "eval_runtime": 4.3712,
241
+ "eval_samples_per_second": 65.428,
242
+ "eval_steps_per_second": 4.118,
243
+ "step": 460
244
+ },
245
+ {
246
+ "epoch": 20.0,
247
+ "eval_accuracy": 0.4370629370629371,
248
+ "eval_loss": 1.404613971710205,
249
+ "eval_runtime": 4.7203,
250
+ "eval_samples_per_second": 60.59,
251
+ "eval_steps_per_second": 3.813,
252
+ "step": 485
253
+ },
254
+ {
255
+ "epoch": 20.62,
256
+ "grad_norm": 0.598238468170166,
257
+ "learning_rate": 2.5e-05,
258
+ "loss": 1.1566,
259
+ "step": 500
260
+ },
261
+ {
262
+ "epoch": 20.99,
263
+ "eval_accuracy": 0.4405594405594406,
264
+ "eval_loss": 1.3691043853759766,
265
+ "eval_runtime": 6.6443,
266
+ "eval_samples_per_second": 43.044,
267
+ "eval_steps_per_second": 2.709,
268
+ "step": 509
269
+ },
270
+ {
271
+ "epoch": 21.98,
272
+ "eval_accuracy": 0.4825174825174825,
273
+ "eval_loss": 1.3120107650756836,
274
+ "eval_runtime": 4.9585,
275
+ "eval_samples_per_second": 57.679,
276
+ "eval_steps_per_second": 3.63,
277
+ "step": 533
278
+ },
279
+ {
280
+ "epoch": 22.68,
281
+ "grad_norm": 0.682925820350647,
282
+ "learning_rate": 2.75e-05,
283
+ "loss": 1.0676,
284
+ "step": 550
285
+ },
286
+ {
287
+ "epoch": 22.97,
288
+ "eval_accuracy": 0.486013986013986,
289
+ "eval_loss": 1.2839338779449463,
290
+ "eval_runtime": 4.0382,
291
+ "eval_samples_per_second": 70.824,
292
+ "eval_steps_per_second": 4.457,
293
+ "step": 557
294
+ },
295
+ {
296
+ "epoch": 24.0,
297
+ "eval_accuracy": 0.5104895104895105,
298
+ "eval_loss": 1.2549891471862793,
299
+ "eval_runtime": 5.1896,
300
+ "eval_samples_per_second": 55.11,
301
+ "eval_steps_per_second": 3.468,
302
+ "step": 582
303
+ },
304
+ {
305
+ "epoch": 24.74,
306
+ "grad_norm": 1.1368101835250854,
307
+ "learning_rate": 3e-05,
308
+ "loss": 0.992,
309
+ "step": 600
310
+ },
311
+ {
312
+ "epoch": 24.99,
313
+ "eval_accuracy": 0.5209790209790209,
314
+ "eval_loss": 1.2106566429138184,
315
+ "eval_runtime": 6.8941,
316
+ "eval_samples_per_second": 41.485,
317
+ "eval_steps_per_second": 2.611,
318
+ "step": 606
319
+ },
320
+ {
321
+ "epoch": 25.98,
322
+ "eval_accuracy": 0.5384615384615384,
323
+ "eval_loss": 1.1711338758468628,
324
+ "eval_runtime": 4.9707,
325
+ "eval_samples_per_second": 57.537,
326
+ "eval_steps_per_second": 3.621,
327
+ "step": 630
328
+ },
329
+ {
330
+ "epoch": 26.8,
331
+ "grad_norm": 0.9649831056594849,
332
+ "learning_rate": 2.9722222222222223e-05,
333
+ "loss": 0.9272,
334
+ "step": 650
335
+ },
336
+ {
337
+ "epoch": 26.97,
338
+ "eval_accuracy": 0.5594405594405595,
339
+ "eval_loss": 1.1318116188049316,
340
+ "eval_runtime": 5.5564,
341
+ "eval_samples_per_second": 51.472,
342
+ "eval_steps_per_second": 3.24,
343
+ "step": 654
344
+ },
345
+ {
346
+ "epoch": 28.0,
347
+ "eval_accuracy": 0.6153846153846154,
348
+ "eval_loss": 1.0594333410263062,
349
+ "eval_runtime": 4.6773,
350
+ "eval_samples_per_second": 61.147,
351
+ "eval_steps_per_second": 3.848,
352
+ "step": 679
353
+ },
354
+ {
355
+ "epoch": 28.87,
356
+ "grad_norm": 0.883937418460846,
357
+ "learning_rate": 2.9444444444444445e-05,
358
+ "loss": 0.8478,
359
+ "step": 700
360
+ },
361
+ {
362
+ "epoch": 28.99,
363
+ "eval_accuracy": 0.6013986013986014,
364
+ "eval_loss": 1.054669737815857,
365
+ "eval_runtime": 4.9219,
366
+ "eval_samples_per_second": 58.108,
367
+ "eval_steps_per_second": 3.657,
368
+ "step": 703
369
+ },
370
+ {
371
+ "epoch": 29.98,
372
+ "eval_accuracy": 0.6363636363636364,
373
+ "eval_loss": 0.9822685122489929,
374
+ "eval_runtime": 6.3133,
375
+ "eval_samples_per_second": 45.302,
376
+ "eval_steps_per_second": 2.851,
377
+ "step": 727
378
+ },
379
+ {
380
+ "epoch": 30.93,
381
+ "grad_norm": 1.3742878437042236,
382
+ "learning_rate": 2.9166666666666666e-05,
383
+ "loss": 0.7627,
384
+ "step": 750
385
+ },
386
+ {
387
+ "epoch": 30.97,
388
+ "eval_accuracy": 0.6398601398601399,
389
+ "eval_loss": 1.00295090675354,
390
+ "eval_runtime": 6.154,
391
+ "eval_samples_per_second": 46.473,
392
+ "eval_steps_per_second": 2.925,
393
+ "step": 751
394
+ },
395
+ {
396
+ "epoch": 32.0,
397
+ "eval_accuracy": 0.6608391608391608,
398
+ "eval_loss": 0.930969774723053,
399
+ "eval_runtime": 5.6747,
400
+ "eval_samples_per_second": 50.399,
401
+ "eval_steps_per_second": 3.172,
402
+ "step": 776
403
+ },
404
+ {
405
+ "epoch": 32.99,
406
+ "grad_norm": 1.329268217086792,
407
+ "learning_rate": 2.8888888888888888e-05,
408
+ "loss": 0.7266,
409
+ "step": 800
410
+ },
411
+ {
412
+ "epoch": 32.99,
413
+ "eval_accuracy": 0.6678321678321678,
414
+ "eval_loss": 0.9228739738464355,
415
+ "eval_runtime": 5.382,
416
+ "eval_samples_per_second": 53.14,
417
+ "eval_steps_per_second": 3.344,
418
+ "step": 800
419
+ },
420
+ {
421
+ "epoch": 33.98,
422
+ "eval_accuracy": 0.6958041958041958,
423
+ "eval_loss": 0.8684509992599487,
424
+ "eval_runtime": 4.8497,
425
+ "eval_samples_per_second": 58.973,
426
+ "eval_steps_per_second": 3.712,
427
+ "step": 824
428
+ },
429
+ {
430
+ "epoch": 34.97,
431
+ "eval_accuracy": 0.6643356643356644,
432
+ "eval_loss": 0.8954732418060303,
433
+ "eval_runtime": 5.2083,
434
+ "eval_samples_per_second": 54.912,
435
+ "eval_steps_per_second": 3.456,
436
+ "step": 848
437
+ },
438
+ {
439
+ "epoch": 35.05,
440
+ "grad_norm": 1.3892701864242554,
441
+ "learning_rate": 2.8611111111111113e-05,
442
+ "loss": 0.6906,
443
+ "step": 850
444
+ },
445
+ {
446
+ "epoch": 36.0,
447
+ "eval_accuracy": 0.6713286713286714,
448
+ "eval_loss": 0.9125654101371765,
449
+ "eval_runtime": 5.3068,
450
+ "eval_samples_per_second": 53.894,
451
+ "eval_steps_per_second": 3.392,
452
+ "step": 873
453
+ },
454
+ {
455
+ "epoch": 36.99,
456
+ "eval_accuracy": 0.6923076923076923,
457
+ "eval_loss": 0.8543534874916077,
458
+ "eval_runtime": 4.3351,
459
+ "eval_samples_per_second": 65.974,
460
+ "eval_steps_per_second": 4.152,
461
+ "step": 897
462
+ },
463
+ {
464
+ "epoch": 37.11,
465
+ "grad_norm": 0.836291491985321,
466
+ "learning_rate": 2.8333333333333332e-05,
467
+ "loss": 0.6721,
468
+ "step": 900
469
+ },
470
+ {
471
+ "epoch": 37.98,
472
+ "eval_accuracy": 0.6923076923076923,
473
+ "eval_loss": 0.8480322957038879,
474
+ "eval_runtime": 5.1861,
475
+ "eval_samples_per_second": 55.147,
476
+ "eval_steps_per_second": 3.471,
477
+ "step": 921
478
+ },
479
+ {
480
+ "epoch": 38.97,
481
+ "eval_accuracy": 0.7097902097902098,
482
+ "eval_loss": 0.8354606628417969,
483
+ "eval_runtime": 6.3247,
484
+ "eval_samples_per_second": 45.22,
485
+ "eval_steps_per_second": 2.846,
486
+ "step": 945
487
+ },
488
+ {
489
+ "epoch": 39.18,
490
+ "grad_norm": 1.6499431133270264,
491
+ "learning_rate": 2.8055555555555557e-05,
492
+ "loss": 0.6442,
493
+ "step": 950
494
+ },
495
+ {
496
+ "epoch": 40.0,
497
+ "eval_accuracy": 0.6958041958041958,
498
+ "eval_loss": 0.8412452340126038,
499
+ "eval_runtime": 5.2281,
500
+ "eval_samples_per_second": 54.704,
501
+ "eval_steps_per_second": 3.443,
502
+ "step": 970
503
+ },
504
+ {
505
+ "epoch": 40.99,
506
+ "eval_accuracy": 0.6888111888111889,
507
+ "eval_loss": 0.8356389999389648,
508
+ "eval_runtime": 4.8326,
509
+ "eval_samples_per_second": 59.181,
510
+ "eval_steps_per_second": 3.725,
511
+ "step": 994
512
+ },
513
+ {
514
+ "epoch": 41.24,
515
+ "grad_norm": 1.1766818761825562,
516
+ "learning_rate": 2.777777777777778e-05,
517
+ "loss": 0.6465,
518
+ "step": 1000
519
+ },
520
+ {
521
+ "epoch": 41.98,
522
+ "eval_accuracy": 0.7062937062937062,
523
+ "eval_loss": 0.8180016875267029,
524
+ "eval_runtime": 5.7926,
525
+ "eval_samples_per_second": 49.374,
526
+ "eval_steps_per_second": 3.107,
527
+ "step": 1018
528
+ },
529
+ {
530
+ "epoch": 42.97,
531
+ "eval_accuracy": 0.7027972027972028,
532
+ "eval_loss": 0.8103991150856018,
533
+ "eval_runtime": 5.5185,
534
+ "eval_samples_per_second": 51.825,
535
+ "eval_steps_per_second": 3.262,
536
+ "step": 1042
537
+ },
538
+ {
539
+ "epoch": 43.3,
540
+ "grad_norm": 0.9722403287887573,
541
+ "learning_rate": 2.75e-05,
542
+ "loss": 0.6086,
543
+ "step": 1050
544
+ },
545
+ {
546
+ "epoch": 44.0,
547
+ "eval_accuracy": 0.6958041958041958,
548
+ "eval_loss": 0.8162235617637634,
549
+ "eval_runtime": 4.9174,
550
+ "eval_samples_per_second": 58.161,
551
+ "eval_steps_per_second": 3.66,
552
+ "step": 1067
553
+ },
554
+ {
555
+ "epoch": 44.99,
556
+ "eval_accuracy": 0.7027972027972028,
557
+ "eval_loss": 0.7957289218902588,
558
+ "eval_runtime": 4.6891,
559
+ "eval_samples_per_second": 60.992,
560
+ "eval_steps_per_second": 3.839,
561
+ "step": 1091
562
+ },
563
+ {
564
+ "epoch": 45.36,
565
+ "grad_norm": 1.269113302230835,
566
+ "learning_rate": 2.7222222222222223e-05,
567
+ "loss": 0.5863,
568
+ "step": 1100
569
+ },
570
+ {
571
+ "epoch": 45.98,
572
+ "eval_accuracy": 0.6958041958041958,
573
+ "eval_loss": 0.8143528699874878,
574
+ "eval_runtime": 6.6805,
575
+ "eval_samples_per_second": 42.811,
576
+ "eval_steps_per_second": 2.694,
577
+ "step": 1115
578
+ },
579
+ {
580
+ "epoch": 46.97,
581
+ "eval_accuracy": 0.7027972027972028,
582
+ "eval_loss": 0.78568434715271,
583
+ "eval_runtime": 4.7422,
584
+ "eval_samples_per_second": 60.31,
585
+ "eval_steps_per_second": 3.796,
586
+ "step": 1139
587
+ },
588
+ {
589
+ "epoch": 47.42,
590
+ "grad_norm": 0.9775255918502808,
591
+ "learning_rate": 2.6944444444444445e-05,
592
+ "loss": 0.5877,
593
+ "step": 1150
594
+ },
595
+ {
596
+ "epoch": 48.0,
597
+ "eval_accuracy": 0.7132867132867133,
598
+ "eval_loss": 0.7764595150947571,
599
+ "eval_runtime": 5.76,
600
+ "eval_samples_per_second": 49.653,
601
+ "eval_steps_per_second": 3.125,
602
+ "step": 1164
603
+ },
604
+ {
605
+ "epoch": 48.99,
606
+ "eval_accuracy": 0.6993006993006993,
607
+ "eval_loss": 0.7881478071212769,
608
+ "eval_runtime": 5.4965,
609
+ "eval_samples_per_second": 52.033,
610
+ "eval_steps_per_second": 3.275,
611
+ "step": 1188
612
+ },
613
+ {
614
+ "epoch": 49.48,
615
+ "grad_norm": 1.540124773979187,
616
+ "learning_rate": 2.6666666666666667e-05,
617
+ "loss": 0.5629,
618
+ "step": 1200
619
+ },
620
+ {
621
+ "epoch": 49.98,
622
+ "eval_accuracy": 0.7097902097902098,
623
+ "eval_loss": 0.7658265829086304,
624
+ "eval_runtime": 4.731,
625
+ "eval_samples_per_second": 60.452,
626
+ "eval_steps_per_second": 3.805,
627
+ "step": 1212
628
+ },
629
+ {
630
+ "epoch": 50.97,
631
+ "eval_accuracy": 0.7132867132867133,
632
+ "eval_loss": 0.7723098397254944,
633
+ "eval_runtime": 5.8352,
634
+ "eval_samples_per_second": 49.013,
635
+ "eval_steps_per_second": 3.085,
636
+ "step": 1236
637
+ },
638
+ {
639
+ "epoch": 51.55,
640
+ "grad_norm": 1.2498500347137451,
641
+ "learning_rate": 2.6388888888888892e-05,
642
+ "loss": 0.5476,
643
+ "step": 1250
644
+ },
645
+ {
646
+ "epoch": 52.0,
647
+ "eval_accuracy": 0.7097902097902098,
648
+ "eval_loss": 0.7603952884674072,
649
+ "eval_runtime": 4.448,
650
+ "eval_samples_per_second": 64.299,
651
+ "eval_steps_per_second": 4.047,
652
+ "step": 1261
653
+ },
654
+ {
655
+ "epoch": 52.99,
656
+ "eval_accuracy": 0.7202797202797203,
657
+ "eval_loss": 0.7554137706756592,
658
+ "eval_runtime": 6.4218,
659
+ "eval_samples_per_second": 44.536,
660
+ "eval_steps_per_second": 2.803,
661
+ "step": 1285
662
+ },
663
+ {
664
+ "epoch": 53.61,
665
+ "grad_norm": 0.9919388890266418,
666
+ "learning_rate": 2.6116666666666667e-05,
667
+ "loss": 0.5357,
668
+ "step": 1300
669
+ },
670
+ {
671
+ "epoch": 53.98,
672
+ "eval_accuracy": 0.7307692307692307,
673
+ "eval_loss": 0.7458928227424622,
674
+ "eval_runtime": 5.3791,
675
+ "eval_samples_per_second": 53.168,
676
+ "eval_steps_per_second": 3.346,
677
+ "step": 1309
678
+ },
679
+ {
680
+ "epoch": 54.97,
681
+ "eval_accuracy": 0.7132867132867133,
682
+ "eval_loss": 0.7632877230644226,
683
+ "eval_runtime": 5.278,
684
+ "eval_samples_per_second": 54.187,
685
+ "eval_steps_per_second": 3.41,
686
+ "step": 1333
687
+ },
688
+ {
689
+ "epoch": 55.67,
690
+ "grad_norm": 1.688183307647705,
691
+ "learning_rate": 2.5838888888888892e-05,
692
+ "loss": 0.5335,
693
+ "step": 1350
694
+ },
695
+ {
696
+ "epoch": 56.0,
697
+ "eval_accuracy": 0.7167832167832168,
698
+ "eval_loss": 0.768308162689209,
699
+ "eval_runtime": 5.7022,
700
+ "eval_samples_per_second": 50.156,
701
+ "eval_steps_per_second": 3.157,
702
+ "step": 1358
703
+ },
704
+ {
705
+ "epoch": 56.99,
706
+ "eval_accuracy": 0.7307692307692307,
707
+ "eval_loss": 0.7380541563034058,
708
+ "eval_runtime": 4.522,
709
+ "eval_samples_per_second": 63.247,
710
+ "eval_steps_per_second": 3.981,
711
+ "step": 1382
712
+ },
713
+ {
714
+ "epoch": 57.73,
715
+ "grad_norm": 1.4895784854888916,
716
+ "learning_rate": 2.556111111111111e-05,
717
+ "loss": 0.5107,
718
+ "step": 1400
719
+ },
720
+ {
721
+ "epoch": 57.98,
722
+ "eval_accuracy": 0.7377622377622378,
723
+ "eval_loss": 0.7308338284492493,
724
+ "eval_runtime": 4.4787,
725
+ "eval_samples_per_second": 63.857,
726
+ "eval_steps_per_second": 4.019,
727
+ "step": 1406
728
+ },
729
+ {
730
+ "epoch": 58.97,
731
+ "eval_accuracy": 0.7237762237762237,
732
+ "eval_loss": 0.7441032528877258,
733
+ "eval_runtime": 5.8744,
734
+ "eval_samples_per_second": 48.685,
735
+ "eval_steps_per_second": 3.064,
736
+ "step": 1430
737
+ },
738
+ {
739
+ "epoch": 59.79,
740
+ "grad_norm": 1.4925004243850708,
741
+ "learning_rate": 2.5283333333333332e-05,
742
+ "loss": 0.5105,
743
+ "step": 1450
744
+ },
745
+ {
746
+ "epoch": 60.0,
747
+ "eval_accuracy": 0.7307692307692307,
748
+ "eval_loss": 0.7481815218925476,
749
+ "eval_runtime": 7.272,
750
+ "eval_samples_per_second": 39.329,
751
+ "eval_steps_per_second": 2.475,
752
+ "step": 1455
753
+ },
754
+ {
755
+ "epoch": 60.99,
756
+ "eval_accuracy": 0.7342657342657343,
757
+ "eval_loss": 0.733482301235199,
758
+ "eval_runtime": 4.6235,
759
+ "eval_samples_per_second": 61.858,
760
+ "eval_steps_per_second": 3.893,
761
+ "step": 1479
762
+ },
763
+ {
764
+ "epoch": 61.86,
765
+ "grad_norm": 1.3200663328170776,
766
+ "learning_rate": 2.5005555555555558e-05,
767
+ "loss": 0.4914,
768
+ "step": 1500
769
+ },
770
+ {
771
+ "epoch": 61.98,
772
+ "eval_accuracy": 0.7447552447552448,
773
+ "eval_loss": 0.7241908311843872,
774
+ "eval_runtime": 4.8198,
775
+ "eval_samples_per_second": 59.338,
776
+ "eval_steps_per_second": 3.735,
777
+ "step": 1503
778
+ },
779
+ {
780
+ "epoch": 62.97,
781
+ "eval_accuracy": 0.7377622377622378,
782
+ "eval_loss": 0.7321043014526367,
783
+ "eval_runtime": 5.8929,
784
+ "eval_samples_per_second": 48.533,
785
+ "eval_steps_per_second": 3.055,
786
+ "step": 1527
787
+ },
788
+ {
789
+ "epoch": 63.92,
790
+ "grad_norm": 1.1309747695922852,
791
+ "learning_rate": 2.472777777777778e-05,
792
+ "loss": 0.4839,
793
+ "step": 1550
794
+ },
795
+ {
796
+ "epoch": 64.0,
797
+ "eval_accuracy": 0.7342657342657343,
798
+ "eval_loss": 0.7220665216445923,
799
+ "eval_runtime": 5.8635,
800
+ "eval_samples_per_second": 48.776,
801
+ "eval_steps_per_second": 3.07,
802
+ "step": 1552
803
+ },
804
+ {
805
+ "epoch": 64.99,
806
+ "eval_accuracy": 0.7412587412587412,
807
+ "eval_loss": 0.7136482000350952,
808
+ "eval_runtime": 4.3102,
809
+ "eval_samples_per_second": 66.354,
810
+ "eval_steps_per_second": 4.176,
811
+ "step": 1576
812
+ },
813
+ {
814
+ "epoch": 65.98,
815
+ "grad_norm": 1.1314157247543335,
816
+ "learning_rate": 2.4449999999999998e-05,
817
+ "loss": 0.4751,
818
+ "step": 1600
819
+ },
820
+ {
821
+ "epoch": 65.98,
822
+ "eval_accuracy": 0.7412587412587412,
823
+ "eval_loss": 0.7198111414909363,
824
+ "eval_runtime": 4.7841,
825
+ "eval_samples_per_second": 59.781,
826
+ "eval_steps_per_second": 3.762,
827
+ "step": 1600
828
+ },
829
+ {
830
+ "epoch": 66.97,
831
+ "eval_accuracy": 0.7377622377622378,
832
+ "eval_loss": 0.7145721912384033,
833
+ "eval_runtime": 6.347,
834
+ "eval_samples_per_second": 45.061,
835
+ "eval_steps_per_second": 2.836,
836
+ "step": 1624
837
+ },
838
+ {
839
+ "epoch": 68.0,
840
+ "eval_accuracy": 0.7447552447552448,
841
+ "eval_loss": 0.6970916390419006,
842
+ "eval_runtime": 5.6871,
843
+ "eval_samples_per_second": 50.289,
844
+ "eval_steps_per_second": 3.165,
845
+ "step": 1649
846
+ },
847
+ {
848
+ "epoch": 68.04,
849
+ "grad_norm": 2.397585153579712,
850
+ "learning_rate": 2.4172222222222223e-05,
851
+ "loss": 0.4639,
852
+ "step": 1650
853
+ },
854
+ {
855
+ "epoch": 68.99,
856
+ "eval_accuracy": 0.7272727272727273,
857
+ "eval_loss": 0.7201464176177979,
858
+ "eval_runtime": 4.4157,
859
+ "eval_samples_per_second": 64.769,
860
+ "eval_steps_per_second": 4.076,
861
+ "step": 1673
862
+ },
863
+ {
864
+ "epoch": 69.98,
865
+ "eval_accuracy": 0.7307692307692307,
866
+ "eval_loss": 0.7244682312011719,
867
+ "eval_runtime": 5.4392,
868
+ "eval_samples_per_second": 52.581,
869
+ "eval_steps_per_second": 3.309,
870
+ "step": 1697
871
+ },
872
+ {
873
+ "epoch": 70.1,
874
+ "grad_norm": 2.062610387802124,
875
+ "learning_rate": 2.3894444444444445e-05,
876
+ "loss": 0.4581,
877
+ "step": 1700
878
+ },
879
+ {
880
+ "epoch": 70.97,
881
+ "eval_accuracy": 0.7447552447552448,
882
+ "eval_loss": 0.7077587842941284,
883
+ "eval_runtime": 5.1002,
884
+ "eval_samples_per_second": 56.076,
885
+ "eval_steps_per_second": 3.529,
886
+ "step": 1721
887
+ },
888
+ {
889
+ "epoch": 72.0,
890
+ "eval_accuracy": 0.7517482517482518,
891
+ "eval_loss": 0.6957913637161255,
892
+ "eval_runtime": 4.4485,
893
+ "eval_samples_per_second": 64.291,
894
+ "eval_steps_per_second": 4.046,
895
+ "step": 1746
896
+ },
897
+ {
898
+ "epoch": 72.16,
899
+ "grad_norm": 2.7808456420898438,
900
+ "learning_rate": 2.3616666666666667e-05,
901
+ "loss": 0.4643,
902
+ "step": 1750
903
+ },
904
+ {
905
+ "epoch": 72.99,
906
+ "eval_accuracy": 0.7447552447552448,
907
+ "eval_loss": 0.7036928534507751,
908
+ "eval_runtime": 5.9101,
909
+ "eval_samples_per_second": 48.392,
910
+ "eval_steps_per_second": 3.046,
911
+ "step": 1770
912
+ },
913
+ {
914
+ "epoch": 73.98,
915
+ "eval_accuracy": 0.7482517482517482,
916
+ "eval_loss": 0.71629399061203,
917
+ "eval_runtime": 6.0211,
918
+ "eval_samples_per_second": 47.5,
919
+ "eval_steps_per_second": 2.989,
920
+ "step": 1794
921
+ },
922
+ {
923
+ "epoch": 74.23,
924
+ "grad_norm": 1.78495192527771,
925
+ "learning_rate": 2.333888888888889e-05,
926
+ "loss": 0.442,
927
+ "step": 1800
928
+ },
929
+ {
930
+ "epoch": 74.97,
931
+ "eval_accuracy": 0.7377622377622378,
932
+ "eval_loss": 0.6997957229614258,
933
+ "eval_runtime": 4.4212,
934
+ "eval_samples_per_second": 64.688,
935
+ "eval_steps_per_second": 4.071,
936
+ "step": 1818
937
+ },
938
+ {
939
+ "epoch": 76.0,
940
+ "eval_accuracy": 0.7447552447552448,
941
+ "eval_loss": 0.6946483850479126,
942
+ "eval_runtime": 4.0507,
943
+ "eval_samples_per_second": 70.605,
944
+ "eval_steps_per_second": 4.444,
945
+ "step": 1843
946
+ },
947
+ {
948
+ "epoch": 76.29,
949
+ "grad_norm": 1.7383118867874146,
950
+ "learning_rate": 2.306111111111111e-05,
951
+ "loss": 0.4305,
952
+ "step": 1850
953
+ },
954
+ {
955
+ "epoch": 76.99,
956
+ "eval_accuracy": 0.7552447552447552,
957
+ "eval_loss": 0.6857091784477234,
958
+ "eval_runtime": 4.1718,
959
+ "eval_samples_per_second": 68.556,
960
+ "eval_steps_per_second": 4.315,
961
+ "step": 1867
962
+ },
963
+ {
964
+ "epoch": 77.98,
965
+ "eval_accuracy": 0.7447552447552448,
966
+ "eval_loss": 0.6936307549476624,
967
+ "eval_runtime": 3.8781,
968
+ "eval_samples_per_second": 73.747,
969
+ "eval_steps_per_second": 4.641,
970
+ "step": 1891
971
+ },
972
+ {
973
+ "epoch": 78.35,
974
+ "grad_norm": 1.047067403793335,
975
+ "learning_rate": 2.2783333333333336e-05,
976
+ "loss": 0.4416,
977
+ "step": 1900
978
+ },
979
+ {
980
+ "epoch": 78.97,
981
+ "eval_accuracy": 0.7517482517482518,
982
+ "eval_loss": 0.6965110301971436,
983
+ "eval_runtime": 5.1318,
984
+ "eval_samples_per_second": 55.731,
985
+ "eval_steps_per_second": 3.508,
986
+ "step": 1915
987
+ },
988
+ {
989
+ "epoch": 80.0,
990
+ "eval_accuracy": 0.7482517482517482,
991
+ "eval_loss": 0.7017127871513367,
992
+ "eval_runtime": 4.3418,
993
+ "eval_samples_per_second": 65.871,
994
+ "eval_steps_per_second": 4.146,
995
+ "step": 1940
996
+ },
997
+ {
998
+ "epoch": 80.41,
999
+ "grad_norm": 1.5354928970336914,
1000
+ "learning_rate": 2.2505555555555554e-05,
1001
+ "loss": 0.428,
1002
+ "step": 1950
1003
+ },
1004
+ {
1005
+ "epoch": 80.99,
1006
+ "eval_accuracy": 0.7552447552447552,
1007
+ "eval_loss": 0.6970596313476562,
1008
+ "eval_runtime": 5.973,
1009
+ "eval_samples_per_second": 47.882,
1010
+ "eval_steps_per_second": 3.014,
1011
+ "step": 1964
1012
+ },
1013
+ {
1014
+ "epoch": 81.98,
1015
+ "eval_accuracy": 0.7552447552447552,
1016
+ "eval_loss": 0.6897542476654053,
1017
+ "eval_runtime": 5.0481,
1018
+ "eval_samples_per_second": 56.655,
1019
+ "eval_steps_per_second": 3.566,
1020
+ "step": 1988
1021
+ },
1022
+ {
1023
+ "epoch": 82.47,
1024
+ "grad_norm": 1.7141317129135132,
1025
+ "learning_rate": 2.2227777777777776e-05,
1026
+ "loss": 0.4093,
1027
+ "step": 2000
1028
+ },
1029
+ {
1030
+ "epoch": 82.97,
1031
+ "eval_accuracy": 0.7482517482517482,
1032
+ "eval_loss": 0.7004020810127258,
1033
+ "eval_runtime": 4.1986,
1034
+ "eval_samples_per_second": 68.118,
1035
+ "eval_steps_per_second": 4.287,
1036
+ "step": 2012
1037
+ },
1038
+ {
1039
+ "epoch": 84.0,
1040
+ "eval_accuracy": 0.7552447552447552,
1041
+ "eval_loss": 0.6867479681968689,
1042
+ "eval_runtime": 4.6871,
1043
+ "eval_samples_per_second": 61.018,
1044
+ "eval_steps_per_second": 3.84,
1045
+ "step": 2037
1046
+ },
1047
+ {
1048
+ "epoch": 84.54,
1049
+ "grad_norm": 2.0219666957855225,
1050
+ "learning_rate": 2.195e-05,
1051
+ "loss": 0.4148,
1052
+ "step": 2050
1053
+ },
1054
+ {
1055
+ "epoch": 84.99,
1056
+ "eval_accuracy": 0.7377622377622378,
1057
+ "eval_loss": 0.7070020437240601,
1058
+ "eval_runtime": 5.9326,
1059
+ "eval_samples_per_second": 48.208,
1060
+ "eval_steps_per_second": 3.034,
1061
+ "step": 2061
1062
+ },
1063
+ {
1064
+ "epoch": 85.98,
1065
+ "eval_accuracy": 0.7447552447552448,
1066
+ "eval_loss": 0.7030305862426758,
1067
+ "eval_runtime": 5.3564,
1068
+ "eval_samples_per_second": 53.394,
1069
+ "eval_steps_per_second": 3.36,
1070
+ "step": 2085
1071
+ },
1072
+ {
1073
+ "epoch": 86.6,
1074
+ "grad_norm": 1.4678714275360107,
1075
+ "learning_rate": 2.1672222222222223e-05,
1076
+ "loss": 0.3923,
1077
+ "step": 2100
1078
+ },
1079
+ {
1080
+ "epoch": 86.97,
1081
+ "eval_accuracy": 0.7587412587412588,
1082
+ "eval_loss": 0.678174614906311,
1083
+ "eval_runtime": 3.9745,
1084
+ "eval_samples_per_second": 71.96,
1085
+ "eval_steps_per_second": 4.529,
1086
+ "step": 2109
1087
+ },
1088
+ {
1089
+ "epoch": 88.0,
1090
+ "eval_accuracy": 0.7412587412587412,
1091
+ "eval_loss": 0.7166118621826172,
1092
+ "eval_runtime": 4.0358,
1093
+ "eval_samples_per_second": 70.866,
1094
+ "eval_steps_per_second": 4.46,
1095
+ "step": 2134
1096
+ },
1097
+ {
1098
+ "epoch": 88.66,
1099
+ "grad_norm": 1.589543342590332,
1100
+ "learning_rate": 2.1394444444444445e-05,
1101
+ "loss": 0.3964,
1102
+ "step": 2150
1103
+ },
1104
+ {
1105
+ "epoch": 88.99,
1106
+ "eval_accuracy": 0.7482517482517482,
1107
+ "eval_loss": 0.7075912952423096,
1108
+ "eval_runtime": 5.0331,
1109
+ "eval_samples_per_second": 56.823,
1110
+ "eval_steps_per_second": 3.576,
1111
+ "step": 2158
1112
+ },
1113
+ {
1114
+ "epoch": 89.98,
1115
+ "eval_accuracy": 0.7657342657342657,
1116
+ "eval_loss": 0.6867172122001648,
1117
+ "eval_runtime": 5.386,
1118
+ "eval_samples_per_second": 53.101,
1119
+ "eval_steps_per_second": 3.342,
1120
+ "step": 2182
1121
+ },
1122
+ {
1123
+ "epoch": 90.72,
1124
+ "grad_norm": 1.3886605501174927,
1125
+ "learning_rate": 2.1116666666666667e-05,
1126
+ "loss": 0.3846,
1127
+ "step": 2200
1128
+ },
1129
+ {
1130
+ "epoch": 90.97,
1131
+ "eval_accuracy": 0.7517482517482518,
1132
+ "eval_loss": 0.6913285851478577,
1133
+ "eval_runtime": 5.5324,
1134
+ "eval_samples_per_second": 51.696,
1135
+ "eval_steps_per_second": 3.254,
1136
+ "step": 2206
1137
+ },
1138
+ {
1139
+ "epoch": 92.0,
1140
+ "eval_accuracy": 0.7482517482517482,
1141
+ "eval_loss": 0.7160294651985168,
1142
+ "eval_runtime": 5.2753,
1143
+ "eval_samples_per_second": 54.215,
1144
+ "eval_steps_per_second": 3.412,
1145
+ "step": 2231
1146
+ },
1147
+ {
1148
+ "epoch": 92.78,
1149
+ "grad_norm": 2.4106783866882324,
1150
+ "learning_rate": 2.083888888888889e-05,
1151
+ "loss": 0.3654,
1152
+ "step": 2250
1153
+ },
1154
+ {
1155
+ "epoch": 92.99,
1156
+ "eval_accuracy": 0.7517482517482518,
1157
+ "eval_loss": 0.6765207052230835,
1158
+ "eval_runtime": 5.5671,
1159
+ "eval_samples_per_second": 51.373,
1160
+ "eval_steps_per_second": 3.233,
1161
+ "step": 2255
1162
+ },
1163
+ {
1164
+ "epoch": 93.98,
1165
+ "eval_accuracy": 0.7657342657342657,
1166
+ "eval_loss": 0.6881967186927795,
1167
+ "eval_runtime": 3.8228,
1168
+ "eval_samples_per_second": 74.814,
1169
+ "eval_steps_per_second": 4.709,
1170
+ "step": 2279
1171
+ },
1172
+ {
1173
+ "epoch": 94.85,
1174
+ "grad_norm": 0.8871183395385742,
1175
+ "learning_rate": 2.0561111111111114e-05,
1176
+ "loss": 0.3577,
1177
+ "step": 2300
1178
+ },
1179
+ {
1180
+ "epoch": 94.97,
1181
+ "eval_accuracy": 0.7552447552447552,
1182
+ "eval_loss": 0.6852585673332214,
1183
+ "eval_runtime": 4.7228,
1184
+ "eval_samples_per_second": 60.557,
1185
+ "eval_steps_per_second": 3.811,
1186
+ "step": 2303
1187
+ },
1188
+ {
1189
+ "epoch": 96.0,
1190
+ "eval_accuracy": 0.7552447552447552,
1191
+ "eval_loss": 0.7158808708190918,
1192
+ "eval_runtime": 5.6504,
1193
+ "eval_samples_per_second": 50.616,
1194
+ "eval_steps_per_second": 3.186,
1195
+ "step": 2328
1196
+ },
1197
+ {
1198
+ "epoch": 96.91,
1199
+ "grad_norm": 1.0019863843917847,
1200
+ "learning_rate": 2.0283333333333333e-05,
1201
+ "loss": 0.37,
1202
+ "step": 2350
1203
+ },
1204
+ {
1205
+ "epoch": 96.99,
1206
+ "eval_accuracy": 0.7657342657342657,
1207
+ "eval_loss": 0.6943120360374451,
1208
+ "eval_runtime": 4.8337,
1209
+ "eval_samples_per_second": 59.168,
1210
+ "eval_steps_per_second": 3.724,
1211
+ "step": 2352
1212
+ },
1213
+ {
1214
+ "epoch": 97.98,
1215
+ "eval_accuracy": 0.7587412587412588,
1216
+ "eval_loss": 0.7010317444801331,
1217
+ "eval_runtime": 4.6874,
1218
+ "eval_samples_per_second": 61.015,
1219
+ "eval_steps_per_second": 3.84,
1220
+ "step": 2376
1221
+ },
1222
+ {
1223
+ "epoch": 98.97,
1224
+ "grad_norm": 1.2908928394317627,
1225
+ "learning_rate": 2.0005555555555555e-05,
1226
+ "loss": 0.3473,
1227
+ "step": 2400
1228
+ },
1229
+ {
1230
+ "epoch": 98.97,
1231
+ "eval_accuracy": 0.7727272727272727,
1232
+ "eval_loss": 0.693758487701416,
1233
+ "eval_runtime": 4.7585,
1234
+ "eval_samples_per_second": 60.103,
1235
+ "eval_steps_per_second": 3.783,
1236
+ "step": 2400
1237
+ },
1238
+ {
1239
+ "epoch": 100.0,
1240
+ "eval_accuracy": 0.7587412587412588,
1241
+ "eval_loss": 0.6918778419494629,
1242
+ "eval_runtime": 6.6891,
1243
+ "eval_samples_per_second": 42.756,
1244
+ "eval_steps_per_second": 2.691,
1245
+ "step": 2425
1246
+ },
1247
+ {
1248
+ "epoch": 100.99,
1249
+ "eval_accuracy": 0.7552447552447552,
1250
+ "eval_loss": 0.6849302053451538,
1251
+ "eval_runtime": 4.4685,
1252
+ "eval_samples_per_second": 64.003,
1253
+ "eval_steps_per_second": 4.028,
1254
+ "step": 2449
1255
+ },
1256
+ {
1257
+ "epoch": 101.03,
1258
+ "grad_norm": 1.1730871200561523,
1259
+ "learning_rate": 1.972777777777778e-05,
1260
+ "loss": 0.3587,
1261
+ "step": 2450
1262
+ },
1263
+ {
1264
+ "epoch": 101.98,
1265
+ "eval_accuracy": 0.7587412587412588,
1266
+ "eval_loss": 0.6855939030647278,
1267
+ "eval_runtime": 4.3434,
1268
+ "eval_samples_per_second": 65.847,
1269
+ "eval_steps_per_second": 4.144,
1270
+ "step": 2473
1271
+ },
1272
+ {
1273
+ "epoch": 102.97,
1274
+ "eval_accuracy": 0.7517482517482518,
1275
+ "eval_loss": 0.7046144604682922,
1276
+ "eval_runtime": 4.7166,
1277
+ "eval_samples_per_second": 60.637,
1278
+ "eval_steps_per_second": 3.816,
1279
+ "step": 2497
1280
+ },
1281
+ {
1282
+ "epoch": 103.09,
1283
+ "grad_norm": 1.3693217039108276,
1284
+ "learning_rate": 1.945e-05,
1285
+ "loss": 0.3429,
1286
+ "step": 2500
1287
+ },
1288
+ {
1289
+ "epoch": 104.0,
1290
+ "eval_accuracy": 0.7727272727272727,
1291
+ "eval_loss": 0.6892997622489929,
1292
+ "eval_runtime": 5.3868,
1293
+ "eval_samples_per_second": 53.092,
1294
+ "eval_steps_per_second": 3.341,
1295
+ "step": 2522
1296
+ },
1297
+ {
1298
+ "epoch": 104.99,
1299
+ "eval_accuracy": 0.7622377622377622,
1300
+ "eval_loss": 0.6913393139839172,
1301
+ "eval_runtime": 5.09,
1302
+ "eval_samples_per_second": 56.188,
1303
+ "eval_steps_per_second": 3.536,
1304
+ "step": 2546
1305
+ },
1306
+ {
1307
+ "epoch": 105.15,
1308
+ "grad_norm": 1.923829436302185,
1309
+ "learning_rate": 1.9172222222222224e-05,
1310
+ "loss": 0.3549,
1311
+ "step": 2550
1312
+ },
1313
+ {
1314
+ "epoch": 105.98,
1315
+ "eval_accuracy": 0.7762237762237763,
1316
+ "eval_loss": 0.6880810856819153,
1317
+ "eval_runtime": 4.6668,
1318
+ "eval_samples_per_second": 61.283,
1319
+ "eval_steps_per_second": 3.857,
1320
+ "step": 2570
1321
+ },
1322
+ {
1323
+ "epoch": 106.97,
1324
+ "eval_accuracy": 0.7692307692307693,
1325
+ "eval_loss": 0.7097887396812439,
1326
+ "eval_runtime": 6.4652,
1327
+ "eval_samples_per_second": 44.237,
1328
+ "eval_steps_per_second": 2.784,
1329
+ "step": 2594
1330
+ },
1331
+ {
1332
+ "epoch": 107.22,
1333
+ "grad_norm": 2.702012062072754,
1334
+ "learning_rate": 1.8894444444444446e-05,
1335
+ "loss": 0.3403,
1336
+ "step": 2600
1337
+ },
1338
+ {
1339
+ "epoch": 108.0,
1340
+ "eval_accuracy": 0.7762237762237763,
1341
+ "eval_loss": 0.6878336668014526,
1342
+ "eval_runtime": 4.6923,
1343
+ "eval_samples_per_second": 60.951,
1344
+ "eval_steps_per_second": 3.836,
1345
+ "step": 2619
1346
+ },
1347
+ {
1348
+ "epoch": 108.99,
1349
+ "eval_accuracy": 0.7762237762237763,
1350
+ "eval_loss": 0.695954442024231,
1351
+ "eval_runtime": 4.4809,
1352
+ "eval_samples_per_second": 63.827,
1353
+ "eval_steps_per_second": 4.017,
1354
+ "step": 2643
1355
+ },
1356
+ {
1357
+ "epoch": 109.28,
1358
+ "grad_norm": 2.3427536487579346,
1359
+ "learning_rate": 1.8616666666666667e-05,
1360
+ "loss": 0.3253,
1361
+ "step": 2650
1362
+ },
1363
+ {
1364
+ "epoch": 109.98,
1365
+ "eval_accuracy": 0.7727272727272727,
1366
+ "eval_loss": 0.7005948424339294,
1367
+ "eval_runtime": 4.8882,
1368
+ "eval_samples_per_second": 58.508,
1369
+ "eval_steps_per_second": 3.682,
1370
+ "step": 2667
1371
+ },
1372
+ {
1373
+ "epoch": 110.97,
1374
+ "eval_accuracy": 0.7692307692307693,
1375
+ "eval_loss": 0.6916196346282959,
1376
+ "eval_runtime": 5.2891,
1377
+ "eval_samples_per_second": 54.073,
1378
+ "eval_steps_per_second": 3.403,
1379
+ "step": 2691
1380
+ },
1381
+ {
1382
+ "epoch": 111.34,
1383
+ "grad_norm": 2.178089141845703,
1384
+ "learning_rate": 1.833888888888889e-05,
1385
+ "loss": 0.3332,
1386
+ "step": 2700
1387
+ },
1388
+ {
1389
+ "epoch": 112.0,
1390
+ "eval_accuracy": 0.7657342657342657,
1391
+ "eval_loss": 0.7059447765350342,
1392
+ "eval_runtime": 4.7437,
1393
+ "eval_samples_per_second": 60.291,
1394
+ "eval_steps_per_second": 3.795,
1395
+ "step": 2716
1396
+ },
1397
+ {
1398
+ "epoch": 112.99,
1399
+ "eval_accuracy": 0.7867132867132867,
1400
+ "eval_loss": 0.6904045939445496,
1401
+ "eval_runtime": 4.9942,
1402
+ "eval_samples_per_second": 57.267,
1403
+ "eval_steps_per_second": 3.604,
1404
+ "step": 2740
1405
+ },
1406
+ {
1407
+ "epoch": 113.4,
1408
+ "grad_norm": 1.1625444889068604,
1409
+ "learning_rate": 1.806111111111111e-05,
1410
+ "loss": 0.3188,
1411
+ "step": 2750
1412
+ },
1413
+ {
1414
+ "epoch": 113.98,
1415
+ "eval_accuracy": 0.7727272727272727,
1416
+ "eval_loss": 0.6970774531364441,
1417
+ "eval_runtime": 6.4809,
1418
+ "eval_samples_per_second": 44.13,
1419
+ "eval_steps_per_second": 2.777,
1420
+ "step": 2764
1421
+ },
1422
+ {
1423
+ "epoch": 114.97,
1424
+ "eval_accuracy": 0.7797202797202797,
1425
+ "eval_loss": 0.700820803642273,
1426
+ "eval_runtime": 5.2617,
1427
+ "eval_samples_per_second": 54.355,
1428
+ "eval_steps_per_second": 3.421,
1429
+ "step": 2788
1430
+ },
1431
+ {
1432
+ "epoch": 115.46,
1433
+ "grad_norm": 1.2394715547561646,
1434
+ "learning_rate": 1.7783333333333333e-05,
1435
+ "loss": 0.3112,
1436
+ "step": 2800
1437
+ },
1438
+ {
1439
+ "epoch": 116.0,
1440
+ "eval_accuracy": 0.7797202797202797,
1441
+ "eval_loss": 0.7002130150794983,
1442
+ "eval_runtime": 5.0937,
1443
+ "eval_samples_per_second": 56.147,
1444
+ "eval_steps_per_second": 3.534,
1445
+ "step": 2813
1446
+ },
1447
+ {
1448
+ "epoch": 116.99,
1449
+ "eval_accuracy": 0.7692307692307693,
1450
+ "eval_loss": 0.6909505724906921,
1451
+ "eval_runtime": 4.7575,
1452
+ "eval_samples_per_second": 60.116,
1453
+ "eval_steps_per_second": 3.784,
1454
+ "step": 2837
1455
+ },
1456
+ {
1457
+ "epoch": 117.53,
1458
+ "grad_norm": 2.4334964752197266,
1459
+ "learning_rate": 1.7505555555555558e-05,
1460
+ "loss": 0.3153,
1461
+ "step": 2850
1462
+ },
1463
+ {
1464
+ "epoch": 117.98,
1465
+ "eval_accuracy": 0.7797202797202797,
1466
+ "eval_loss": 0.6957750916481018,
1467
+ "eval_runtime": 4.8105,
1468
+ "eval_samples_per_second": 59.453,
1469
+ "eval_steps_per_second": 3.742,
1470
+ "step": 2861
1471
+ },
1472
+ {
1473
+ "epoch": 118.97,
1474
+ "eval_accuracy": 0.7762237762237763,
1475
+ "eval_loss": 0.6867520213127136,
1476
+ "eval_runtime": 4.5411,
1477
+ "eval_samples_per_second": 62.98,
1478
+ "eval_steps_per_second": 3.964,
1479
+ "step": 2885
1480
+ },
1481
+ {
1482
+ "epoch": 119.59,
1483
+ "grad_norm": 0.769097089767456,
1484
+ "learning_rate": 1.7227777777777777e-05,
1485
+ "loss": 0.3006,
1486
+ "step": 2900
1487
+ },
1488
+ {
1489
+ "epoch": 120.0,
1490
+ "eval_accuracy": 0.7727272727272727,
1491
+ "eval_loss": 0.6890790462493896,
1492
+ "eval_runtime": 4.5864,
1493
+ "eval_samples_per_second": 62.358,
1494
+ "eval_steps_per_second": 3.925,
1495
+ "step": 2910
1496
+ },
1497
+ {
1498
+ "epoch": 120.99,
1499
+ "eval_accuracy": 0.7657342657342657,
1500
+ "eval_loss": 0.6889089941978455,
1501
+ "eval_runtime": 6.5804,
1502
+ "eval_samples_per_second": 43.462,
1503
+ "eval_steps_per_second": 2.735,
1504
+ "step": 2934
1505
+ },
1506
+ {
1507
+ "epoch": 121.65,
1508
+ "grad_norm": 1.8714542388916016,
1509
+ "learning_rate": 1.695e-05,
1510
+ "loss": 0.2967,
1511
+ "step": 2950
1512
+ },
1513
+ {
1514
+ "epoch": 121.98,
1515
+ "eval_accuracy": 0.7657342657342657,
1516
+ "eval_loss": 0.6935350894927979,
1517
+ "eval_runtime": 4.7491,
1518
+ "eval_samples_per_second": 60.223,
1519
+ "eval_steps_per_second": 3.79,
1520
+ "step": 2958
1521
+ },
1522
+ {
1523
+ "epoch": 122.97,
1524
+ "eval_accuracy": 0.7692307692307693,
1525
+ "eval_loss": 0.7058219909667969,
1526
+ "eval_runtime": 4.8941,
1527
+ "eval_samples_per_second": 58.438,
1528
+ "eval_steps_per_second": 3.678,
1529
+ "step": 2982
1530
+ },
1531
+ {
1532
+ "epoch": 123.71,
1533
+ "grad_norm": 2.062924385070801,
1534
+ "learning_rate": 1.6672222222222224e-05,
1535
+ "loss": 0.2939,
1536
+ "step": 3000
1537
+ },
1538
+ {
1539
+ "epoch": 124.0,
1540
+ "eval_accuracy": 0.7657342657342657,
1541
+ "eval_loss": 0.7220865488052368,
1542
+ "eval_runtime": 5.0487,
1543
+ "eval_samples_per_second": 56.648,
1544
+ "eval_steps_per_second": 3.565,
1545
+ "step": 3007
1546
+ },
1547
+ {
1548
+ "epoch": 124.99,
1549
+ "eval_accuracy": 0.7727272727272727,
1550
+ "eval_loss": 0.6857044696807861,
1551
+ "eval_runtime": 5.6134,
1552
+ "eval_samples_per_second": 50.95,
1553
+ "eval_steps_per_second": 3.207,
1554
+ "step": 3031
1555
+ },
1556
+ {
1557
+ "epoch": 125.77,
1558
+ "grad_norm": 1.7039302587509155,
1559
+ "learning_rate": 1.6394444444444446e-05,
1560
+ "loss": 0.3101,
1561
+ "step": 3050
1562
+ },
1563
+ {
1564
+ "epoch": 125.98,
1565
+ "eval_accuracy": 0.7762237762237763,
1566
+ "eval_loss": 0.6742061972618103,
1567
+ "eval_runtime": 5.3609,
1568
+ "eval_samples_per_second": 53.349,
1569
+ "eval_steps_per_second": 3.358,
1570
+ "step": 3055
1571
+ },
1572
+ {
1573
+ "epoch": 126.97,
1574
+ "eval_accuracy": 0.7727272727272727,
1575
+ "eval_loss": 0.7029407620429993,
1576
+ "eval_runtime": 5.8891,
1577
+ "eval_samples_per_second": 48.564,
1578
+ "eval_steps_per_second": 3.056,
1579
+ "step": 3079
1580
+ },
1581
+ {
1582
+ "epoch": 127.84,
1583
+ "grad_norm": 1.434970736503601,
1584
+ "learning_rate": 1.6116666666666668e-05,
1585
+ "loss": 0.284,
1586
+ "step": 3100
1587
+ },
1588
+ {
1589
+ "epoch": 128.0,
1590
+ "eval_accuracy": 0.7762237762237763,
1591
+ "eval_loss": 0.682050347328186,
1592
+ "eval_runtime": 5.1437,
1593
+ "eval_samples_per_second": 55.602,
1594
+ "eval_steps_per_second": 3.499,
1595
+ "step": 3104
1596
+ },
1597
+ {
1598
+ "epoch": 128.99,
1599
+ "eval_accuracy": 0.7762237762237763,
1600
+ "eval_loss": 0.68370121717453,
1601
+ "eval_runtime": 4.2733,
1602
+ "eval_samples_per_second": 66.927,
1603
+ "eval_steps_per_second": 4.212,
1604
+ "step": 3128
1605
+ },
1606
+ {
1607
+ "epoch": 129.9,
1608
+ "grad_norm": 1.320789098739624,
1609
+ "learning_rate": 1.583888888888889e-05,
1610
+ "loss": 0.2902,
1611
+ "step": 3150
1612
+ },
1613
+ {
1614
+ "epoch": 129.98,
1615
+ "eval_accuracy": 0.7727272727272727,
1616
+ "eval_loss": 0.6823462843894958,
1617
+ "eval_runtime": 5.7566,
1618
+ "eval_samples_per_second": 49.682,
1619
+ "eval_steps_per_second": 3.127,
1620
+ "step": 3152
1621
+ },
1622
+ {
1623
+ "epoch": 130.97,
1624
+ "eval_accuracy": 0.7762237762237763,
1625
+ "eval_loss": 0.6950440406799316,
1626
+ "eval_runtime": 4.9248,
1627
+ "eval_samples_per_second": 58.074,
1628
+ "eval_steps_per_second": 3.655,
1629
+ "step": 3176
1630
+ },
1631
+ {
1632
+ "epoch": 131.96,
1633
+ "grad_norm": 2.1280930042266846,
1634
+ "learning_rate": 1.556111111111111e-05,
1635
+ "loss": 0.301,
1636
+ "step": 3200
1637
+ },
1638
+ {
1639
+ "epoch": 132.0,
1640
+ "eval_accuracy": 0.7727272727272727,
1641
+ "eval_loss": 0.6800761818885803,
1642
+ "eval_runtime": 8.1328,
1643
+ "eval_samples_per_second": 35.166,
1644
+ "eval_steps_per_second": 2.213,
1645
+ "step": 3201
1646
+ },
1647
+ {
1648
+ "epoch": 132.99,
1649
+ "eval_accuracy": 0.7762237762237763,
1650
+ "eval_loss": 0.6867505311965942,
1651
+ "eval_runtime": 4.2532,
1652
+ "eval_samples_per_second": 67.244,
1653
+ "eval_steps_per_second": 4.232,
1654
+ "step": 3225
1655
+ },
1656
+ {
1657
+ "epoch": 133.98,
1658
+ "eval_accuracy": 0.7797202797202797,
1659
+ "eval_loss": 0.7061284184455872,
1660
+ "eval_runtime": 5.3031,
1661
+ "eval_samples_per_second": 53.93,
1662
+ "eval_steps_per_second": 3.394,
1663
+ "step": 3249
1664
+ },
1665
+ {
1666
+ "epoch": 134.02,
1667
+ "grad_norm": 1.532638669013977,
1668
+ "learning_rate": 1.5283333333333333e-05,
1669
+ "loss": 0.2736,
1670
+ "step": 3250
1671
+ },
1672
+ {
1673
+ "epoch": 134.97,
1674
+ "eval_accuracy": 0.7727272727272727,
1675
+ "eval_loss": 0.7114368677139282,
1676
+ "eval_runtime": 4.6536,
1677
+ "eval_samples_per_second": 61.458,
1678
+ "eval_steps_per_second": 3.868,
1679
+ "step": 3273
1680
+ },
1681
+ {
1682
+ "epoch": 136.0,
1683
+ "eval_accuracy": 0.7762237762237763,
1684
+ "eval_loss": 0.6914551854133606,
1685
+ "eval_runtime": 4.5505,
1686
+ "eval_samples_per_second": 62.851,
1687
+ "eval_steps_per_second": 3.956,
1688
+ "step": 3298
1689
+ },
1690
+ {
1691
+ "epoch": 136.08,
1692
+ "grad_norm": 2.0108492374420166,
1693
+ "learning_rate": 1.5005555555555555e-05,
1694
+ "loss": 0.2931,
1695
+ "step": 3300
1696
+ },
1697
+ {
1698
+ "epoch": 136.99,
1699
+ "eval_accuracy": 0.7797202797202797,
1700
+ "eval_loss": 0.7055917978286743,
1701
+ "eval_runtime": 5.3067,
1702
+ "eval_samples_per_second": 53.894,
1703
+ "eval_steps_per_second": 3.392,
1704
+ "step": 3322
1705
+ },
1706
+ {
1707
+ "epoch": 137.98,
1708
+ "eval_accuracy": 0.7727272727272727,
1709
+ "eval_loss": 0.7026935815811157,
1710
+ "eval_runtime": 5.186,
1711
+ "eval_samples_per_second": 55.149,
1712
+ "eval_steps_per_second": 3.471,
1713
+ "step": 3346
1714
+ },
1715
+ {
1716
+ "epoch": 138.14,
1717
+ "grad_norm": 1.0804469585418701,
1718
+ "learning_rate": 1.4727777777777779e-05,
1719
+ "loss": 0.2864,
1720
+ "step": 3350
1721
+ },
1722
+ {
1723
+ "epoch": 138.97,
1724
+ "eval_accuracy": 0.7657342657342657,
1725
+ "eval_loss": 0.6983500719070435,
1726
+ "eval_runtime": 6.955,
1727
+ "eval_samples_per_second": 41.122,
1728
+ "eval_steps_per_second": 2.588,
1729
+ "step": 3370
1730
+ },
1731
+ {
1732
+ "epoch": 140.0,
1733
+ "eval_accuracy": 0.7657342657342657,
1734
+ "eval_loss": 0.7168787121772766,
1735
+ "eval_runtime": 4.234,
1736
+ "eval_samples_per_second": 67.548,
1737
+ "eval_steps_per_second": 4.251,
1738
+ "step": 3395
1739
+ },
1740
+ {
1741
+ "epoch": 140.21,
1742
+ "grad_norm": 2.370694637298584,
1743
+ "learning_rate": 1.445e-05,
1744
+ "loss": 0.2765,
1745
+ "step": 3400
1746
+ },
1747
+ {
1748
+ "epoch": 140.99,
1749
+ "eval_accuracy": 0.7762237762237763,
1750
+ "eval_loss": 0.6960318088531494,
1751
+ "eval_runtime": 5.0294,
1752
+ "eval_samples_per_second": 56.865,
1753
+ "eval_steps_per_second": 3.579,
1754
+ "step": 3419
1755
+ },
1756
+ {
1757
+ "epoch": 141.98,
1758
+ "eval_accuracy": 0.7762237762237763,
1759
+ "eval_loss": 0.6990492343902588,
1760
+ "eval_runtime": 5.2727,
1761
+ "eval_samples_per_second": 54.242,
1762
+ "eval_steps_per_second": 3.414,
1763
+ "step": 3443
1764
+ },
1765
+ {
1766
+ "epoch": 142.27,
1767
+ "grad_norm": 1.6676194667816162,
1768
+ "learning_rate": 1.4172222222222222e-05,
1769
+ "loss": 0.2808,
1770
+ "step": 3450
1771
+ },
1772
+ {
1773
+ "epoch": 142.97,
1774
+ "eval_accuracy": 0.7797202797202797,
1775
+ "eval_loss": 0.706200897693634,
1776
+ "eval_runtime": 4.5273,
1777
+ "eval_samples_per_second": 63.173,
1778
+ "eval_steps_per_second": 3.976,
1779
+ "step": 3467
1780
+ },
1781
+ {
1782
+ "epoch": 144.0,
1783
+ "eval_accuracy": 0.7657342657342657,
1784
+ "eval_loss": 0.6821764707565308,
1785
+ "eval_runtime": 5.3614,
1786
+ "eval_samples_per_second": 53.344,
1787
+ "eval_steps_per_second": 3.357,
1788
+ "step": 3492
1789
+ },
1790
+ {
1791
+ "epoch": 144.33,
1792
+ "grad_norm": 1.9151145219802856,
1793
+ "learning_rate": 1.3894444444444444e-05,
1794
+ "loss": 0.2712,
1795
+ "step": 3500
1796
+ },
1797
+ {
1798
+ "epoch": 144.99,
1799
+ "eval_accuracy": 0.7762237762237763,
1800
+ "eval_loss": 0.7063603401184082,
1801
+ "eval_runtime": 4.9088,
1802
+ "eval_samples_per_second": 58.263,
1803
+ "eval_steps_per_second": 3.667,
1804
+ "step": 3516
1805
+ },
1806
+ {
1807
+ "epoch": 145.98,
1808
+ "eval_accuracy": 0.7692307692307693,
1809
+ "eval_loss": 0.7150112390518188,
1810
+ "eval_runtime": 7.2044,
1811
+ "eval_samples_per_second": 39.698,
1812
+ "eval_steps_per_second": 2.498,
1813
+ "step": 3540
1814
+ },
1815
+ {
1816
+ "epoch": 146.39,
1817
+ "grad_norm": 1.5093848705291748,
1818
+ "learning_rate": 1.3622222222222223e-05,
1819
+ "loss": 0.2726,
1820
+ "step": 3550
1821
+ },
1822
+ {
1823
+ "epoch": 146.97,
1824
+ "eval_accuracy": 0.7797202797202797,
1825
+ "eval_loss": 0.696849524974823,
1826
+ "eval_runtime": 4.9386,
1827
+ "eval_samples_per_second": 57.911,
1828
+ "eval_steps_per_second": 3.645,
1829
+ "step": 3564
1830
+ },
1831
+ {
1832
+ "epoch": 148.0,
1833
+ "eval_accuracy": 0.7727272727272727,
1834
+ "eval_loss": 0.7086759209632874,
1835
+ "eval_runtime": 4.4363,
1836
+ "eval_samples_per_second": 64.468,
1837
+ "eval_steps_per_second": 4.057,
1838
+ "step": 3589
1839
+ },
1840
+ {
1841
+ "epoch": 148.45,
1842
+ "grad_norm": 1.4403679370880127,
1843
+ "learning_rate": 1.3344444444444444e-05,
1844
+ "loss": 0.2607,
1845
+ "step": 3600
1846
+ },
1847
+ {
1848
+ "epoch": 148.99,
1849
+ "eval_accuracy": 0.7692307692307693,
1850
+ "eval_loss": 0.7129560112953186,
1851
+ "eval_runtime": 5.3809,
1852
+ "eval_samples_per_second": 53.15,
1853
+ "eval_steps_per_second": 3.345,
1854
+ "step": 3613
1855
+ },
1856
+ {
1857
+ "epoch": 149.98,
1858
+ "eval_accuracy": 0.7902097902097902,
1859
+ "eval_loss": 0.7080287933349609,
1860
+ "eval_runtime": 5.8187,
1861
+ "eval_samples_per_second": 49.152,
1862
+ "eval_steps_per_second": 3.093,
1863
+ "step": 3637
1864
+ },
1865
+ {
1866
+ "epoch": 150.52,
1867
+ "grad_norm": 2.036515235900879,
1868
+ "learning_rate": 1.3066666666666666e-05,
1869
+ "loss": 0.2546,
1870
+ "step": 3650
1871
+ },
1872
+ {
1873
+ "epoch": 150.97,
1874
+ "eval_accuracy": 0.7762237762237763,
1875
+ "eval_loss": 0.7088435888290405,
1876
+ "eval_runtime": 4.8742,
1877
+ "eval_samples_per_second": 58.677,
1878
+ "eval_steps_per_second": 3.693,
1879
+ "step": 3661
1880
+ },
1881
+ {
1882
+ "epoch": 152.0,
1883
+ "eval_accuracy": 0.7797202797202797,
1884
+ "eval_loss": 0.7030193209648132,
1885
+ "eval_runtime": 4.9492,
1886
+ "eval_samples_per_second": 57.787,
1887
+ "eval_steps_per_second": 3.637,
1888
+ "step": 3686
1889
+ },
1890
+ {
1891
+ "epoch": 152.58,
1892
+ "grad_norm": 1.200052261352539,
1893
+ "learning_rate": 1.2788888888888888e-05,
1894
+ "loss": 0.2563,
1895
+ "step": 3700
1896
+ },
1897
+ {
1898
+ "epoch": 152.99,
1899
+ "eval_accuracy": 0.7692307692307693,
1900
+ "eval_loss": 0.7077969908714294,
1901
+ "eval_runtime": 4.614,
1902
+ "eval_samples_per_second": 61.985,
1903
+ "eval_steps_per_second": 3.901,
1904
+ "step": 3710
1905
+ },
1906
+ {
1907
+ "epoch": 153.98,
1908
+ "eval_accuracy": 0.7727272727272727,
1909
+ "eval_loss": 0.700455904006958,
1910
+ "eval_runtime": 5.7657,
1911
+ "eval_samples_per_second": 49.604,
1912
+ "eval_steps_per_second": 3.122,
1913
+ "step": 3734
1914
+ },
1915
+ {
1916
+ "epoch": 154.64,
1917
+ "grad_norm": 2.2751214504241943,
1918
+ "learning_rate": 1.2511111111111112e-05,
1919
+ "loss": 0.2531,
1920
+ "step": 3750
1921
+ },
1922
+ {
1923
+ "epoch": 154.97,
1924
+ "eval_accuracy": 0.7727272727272727,
1925
+ "eval_loss": 0.7160292267799377,
1926
+ "eval_runtime": 5.1079,
1927
+ "eval_samples_per_second": 55.992,
1928
+ "eval_steps_per_second": 3.524,
1929
+ "step": 3758
1930
+ },
1931
+ {
1932
+ "epoch": 156.0,
1933
+ "eval_accuracy": 0.7797202797202797,
1934
+ "eval_loss": 0.7175909876823425,
1935
+ "eval_runtime": 5.4035,
1936
+ "eval_samples_per_second": 52.929,
1937
+ "eval_steps_per_second": 3.331,
1938
+ "step": 3783
1939
+ },
1940
+ {
1941
+ "epoch": 156.7,
1942
+ "grad_norm": 1.9024412631988525,
1943
+ "learning_rate": 1.2233333333333334e-05,
1944
+ "loss": 0.2446,
1945
+ "step": 3800
1946
+ },
1947
+ {
1948
+ "epoch": 156.99,
1949
+ "eval_accuracy": 0.7762237762237763,
1950
+ "eval_loss": 0.7190600037574768,
1951
+ "eval_runtime": 4.3633,
1952
+ "eval_samples_per_second": 65.546,
1953
+ "eval_steps_per_second": 4.125,
1954
+ "step": 3807
1955
+ },
1956
+ {
1957
+ "epoch": 157.98,
1958
+ "eval_accuracy": 0.7797202797202797,
1959
+ "eval_loss": 0.719641387462616,
1960
+ "eval_runtime": 5.0426,
1961
+ "eval_samples_per_second": 56.717,
1962
+ "eval_steps_per_second": 3.57,
1963
+ "step": 3831
1964
+ },
1965
+ {
1966
+ "epoch": 158.76,
1967
+ "grad_norm": 3.471806287765503,
1968
+ "learning_rate": 1.1955555555555556e-05,
1969
+ "loss": 0.2479,
1970
+ "step": 3850
1971
+ },
1972
+ {
1973
+ "epoch": 158.97,
1974
+ "eval_accuracy": 0.7797202797202797,
1975
+ "eval_loss": 0.7073430418968201,
1976
+ "eval_runtime": 3.6336,
1977
+ "eval_samples_per_second": 78.711,
1978
+ "eval_steps_per_second": 4.954,
1979
+ "step": 3855
1980
+ },
1981
+ {
1982
+ "epoch": 160.0,
1983
+ "eval_accuracy": 0.7797202797202797,
1984
+ "eval_loss": 0.7328661680221558,
1985
+ "eval_runtime": 5.2625,
1986
+ "eval_samples_per_second": 54.347,
1987
+ "eval_steps_per_second": 3.42,
1988
+ "step": 3880
1989
+ },
1990
+ {
1991
+ "epoch": 160.82,
1992
+ "grad_norm": 2.1171793937683105,
1993
+ "learning_rate": 1.1677777777777777e-05,
1994
+ "loss": 0.2523,
1995
+ "step": 3900
1996
+ },
1997
+ {
1998
+ "epoch": 160.99,
1999
+ "eval_accuracy": 0.7832167832167832,
2000
+ "eval_loss": 0.7158821821212769,
2001
+ "eval_runtime": 6.5877,
2002
+ "eval_samples_per_second": 43.414,
2003
+ "eval_steps_per_second": 2.732,
2004
+ "step": 3904
2005
+ },
2006
+ {
2007
+ "epoch": 161.98,
2008
+ "eval_accuracy": 0.7692307692307693,
2009
+ "eval_loss": 0.719171404838562,
2010
+ "eval_runtime": 4.5674,
2011
+ "eval_samples_per_second": 62.618,
2012
+ "eval_steps_per_second": 3.941,
2013
+ "step": 3928
2014
+ },
2015
+ {
2016
+ "epoch": 162.89,
2017
+ "grad_norm": 1.7515395879745483,
2018
+ "learning_rate": 1.1400000000000001e-05,
2019
+ "loss": 0.2523,
2020
+ "step": 3950
2021
+ },
2022
+ {
2023
+ "epoch": 162.97,
2024
+ "eval_accuracy": 0.7762237762237763,
2025
+ "eval_loss": 0.7281435132026672,
2026
+ "eval_runtime": 4.4866,
2027
+ "eval_samples_per_second": 63.746,
2028
+ "eval_steps_per_second": 4.012,
2029
+ "step": 3952
2030
+ },
2031
+ {
2032
+ "epoch": 164.0,
2033
+ "eval_accuracy": 0.7832167832167832,
2034
+ "eval_loss": 0.7078841328620911,
2035
+ "eval_runtime": 4.4241,
2036
+ "eval_samples_per_second": 64.645,
2037
+ "eval_steps_per_second": 4.069,
2038
+ "step": 3977
2039
+ },
2040
+ {
2041
+ "epoch": 164.95,
2042
+ "grad_norm": 1.456335186958313,
2043
+ "learning_rate": 1.1122222222222223e-05,
2044
+ "loss": 0.2422,
2045
+ "step": 4000
2046
+ },
2047
+ {
2048
+ "epoch": 164.99,
2049
+ "eval_accuracy": 0.7762237762237763,
2050
+ "eval_loss": 0.7161521911621094,
2051
+ "eval_runtime": 5.1239,
2052
+ "eval_samples_per_second": 55.817,
2053
+ "eval_steps_per_second": 3.513,
2054
+ "step": 4001
2055
+ },
2056
+ {
2057
+ "epoch": 165.98,
2058
+ "eval_accuracy": 0.7832167832167832,
2059
+ "eval_loss": 0.7190020084381104,
2060
+ "eval_runtime": 3.4488,
2061
+ "eval_samples_per_second": 82.926,
2062
+ "eval_steps_per_second": 5.219,
2063
+ "step": 4025
2064
+ },
2065
+ {
2066
+ "epoch": 166.97,
2067
+ "eval_accuracy": 0.7762237762237763,
2068
+ "eval_loss": 0.7311248779296875,
2069
+ "eval_runtime": 5.0389,
2070
+ "eval_samples_per_second": 56.759,
2071
+ "eval_steps_per_second": 3.572,
2072
+ "step": 4049
2073
+ },
2074
+ {
2075
+ "epoch": 167.01,
2076
+ "grad_norm": 1.2554075717926025,
2077
+ "learning_rate": 1.0844444444444445e-05,
2078
+ "loss": 0.242,
2079
+ "step": 4050
2080
+ },
2081
+ {
2082
+ "epoch": 168.0,
2083
+ "eval_accuracy": 0.7902097902097902,
2084
+ "eval_loss": 0.7110462188720703,
2085
+ "eval_runtime": 4.4612,
2086
+ "eval_samples_per_second": 64.108,
2087
+ "eval_steps_per_second": 4.035,
2088
+ "step": 4074
2089
+ },
2090
+ {
2091
+ "epoch": 168.99,
2092
+ "eval_accuracy": 0.7867132867132867,
2093
+ "eval_loss": 0.7028501629829407,
2094
+ "eval_runtime": 6.955,
2095
+ "eval_samples_per_second": 41.122,
2096
+ "eval_steps_per_second": 2.588,
2097
+ "step": 4098
2098
+ },
2099
+ {
2100
+ "epoch": 169.07,
2101
+ "grad_norm": 2.8003265857696533,
2102
+ "learning_rate": 1.0566666666666667e-05,
2103
+ "loss": 0.2392,
2104
+ "step": 4100
2105
+ },
2106
+ {
2107
+ "epoch": 169.98,
2108
+ "eval_accuracy": 0.7937062937062938,
2109
+ "eval_loss": 0.7108554840087891,
2110
+ "eval_runtime": 5.0033,
2111
+ "eval_samples_per_second": 57.162,
2112
+ "eval_steps_per_second": 3.598,
2113
+ "step": 4122
2114
+ },
2115
+ {
2116
+ "epoch": 170.97,
2117
+ "eval_accuracy": 0.7902097902097902,
2118
+ "eval_loss": 0.7106384634971619,
2119
+ "eval_runtime": 5.1984,
2120
+ "eval_samples_per_second": 55.017,
2121
+ "eval_steps_per_second": 3.463,
2122
+ "step": 4146
2123
+ },
2124
+ {
2125
+ "epoch": 171.13,
2126
+ "grad_norm": 2.1897969245910645,
2127
+ "learning_rate": 1.028888888888889e-05,
2128
+ "loss": 0.247,
2129
+ "step": 4150
2130
+ },
2131
+ {
2132
+ "epoch": 172.0,
2133
+ "eval_accuracy": 0.7867132867132867,
2134
+ "eval_loss": 0.7151694297790527,
2135
+ "eval_runtime": 5.1963,
2136
+ "eval_samples_per_second": 55.039,
2137
+ "eval_steps_per_second": 3.464,
2138
+ "step": 4171
2139
+ },
2140
+ {
2141
+ "epoch": 172.99,
2142
+ "eval_accuracy": 0.7657342657342657,
2143
+ "eval_loss": 0.7254167795181274,
2144
+ "eval_runtime": 4.4466,
2145
+ "eval_samples_per_second": 64.319,
2146
+ "eval_steps_per_second": 4.048,
2147
+ "step": 4195
2148
+ },
2149
+ {
2150
+ "epoch": 173.2,
2151
+ "grad_norm": 2.769357681274414,
2152
+ "learning_rate": 1.0011111111111112e-05,
2153
+ "loss": 0.2341,
2154
+ "step": 4200
2155
+ },
2156
+ {
2157
+ "epoch": 173.98,
2158
+ "eval_accuracy": 0.7832167832167832,
2159
+ "eval_loss": 0.7290962338447571,
2160
+ "eval_runtime": 6.2221,
2161
+ "eval_samples_per_second": 45.965,
2162
+ "eval_steps_per_second": 2.893,
2163
+ "step": 4219
2164
+ },
2165
+ {
2166
+ "epoch": 174.97,
2167
+ "eval_accuracy": 0.7867132867132867,
2168
+ "eval_loss": 0.7088623046875,
2169
+ "eval_runtime": 4.3709,
2170
+ "eval_samples_per_second": 65.433,
2171
+ "eval_steps_per_second": 4.118,
2172
+ "step": 4243
2173
+ },
2174
+ {
2175
+ "epoch": 175.26,
2176
+ "grad_norm": 2.044703483581543,
2177
+ "learning_rate": 9.733333333333332e-06,
2178
+ "loss": 0.2317,
2179
+ "step": 4250
2180
+ },
2181
+ {
2182
+ "epoch": 176.0,
2183
+ "eval_accuracy": 0.7902097902097902,
2184
+ "eval_loss": 0.7185826897621155,
2185
+ "eval_runtime": 5.4095,
2186
+ "eval_samples_per_second": 52.87,
2187
+ "eval_steps_per_second": 3.327,
2188
+ "step": 4268
2189
+ },
2190
+ {
2191
+ "epoch": 176.99,
2192
+ "eval_accuracy": 0.7797202797202797,
2193
+ "eval_loss": 0.7167823314666748,
2194
+ "eval_runtime": 4.9506,
2195
+ "eval_samples_per_second": 57.77,
2196
+ "eval_steps_per_second": 3.636,
2197
+ "step": 4292
2198
+ },
2199
+ {
2200
+ "epoch": 177.32,
2201
+ "grad_norm": 1.078834056854248,
2202
+ "learning_rate": 9.455555555555556e-06,
2203
+ "loss": 0.2269,
2204
+ "step": 4300
2205
+ },
2206
+ {
2207
+ "epoch": 177.98,
2208
+ "eval_accuracy": 0.7902097902097902,
2209
+ "eval_loss": 0.7237738966941833,
2210
+ "eval_runtime": 4.781,
2211
+ "eval_samples_per_second": 59.82,
2212
+ "eval_steps_per_second": 3.765,
2213
+ "step": 4316
2214
+ },
2215
+ {
2216
+ "epoch": 178.97,
2217
+ "eval_accuracy": 0.7867132867132867,
2218
+ "eval_loss": 0.7131801247596741,
2219
+ "eval_runtime": 4.6869,
2220
+ "eval_samples_per_second": 61.022,
2221
+ "eval_steps_per_second": 3.841,
2222
+ "step": 4340
2223
+ },
2224
+ {
2225
+ "epoch": 179.38,
2226
+ "grad_norm": 2.008120536804199,
2227
+ "learning_rate": 9.177777777777778e-06,
2228
+ "loss": 0.2283,
2229
+ "step": 4350
2230
+ },
2231
+ {
2232
+ "epoch": 180.0,
2233
+ "eval_accuracy": 0.7797202797202797,
2234
+ "eval_loss": 0.7384253144264221,
2235
+ "eval_runtime": 4.5879,
2236
+ "eval_samples_per_second": 62.338,
2237
+ "eval_steps_per_second": 3.923,
2238
+ "step": 4365
2239
+ },
2240
+ {
2241
+ "epoch": 180.99,
2242
+ "eval_accuracy": 0.7902097902097902,
2243
+ "eval_loss": 0.7002861499786377,
2244
+ "eval_runtime": 5.3238,
2245
+ "eval_samples_per_second": 53.721,
2246
+ "eval_steps_per_second": 3.381,
2247
+ "step": 4389
2248
+ },
2249
+ {
2250
+ "epoch": 181.44,
2251
+ "grad_norm": 1.9518792629241943,
2252
+ "learning_rate": 8.900000000000001e-06,
2253
+ "loss": 0.2303,
2254
+ "step": 4400
2255
+ },
2256
+ {
2257
+ "epoch": 181.98,
2258
+ "eval_accuracy": 0.7797202797202797,
2259
+ "eval_loss": 0.7278482913970947,
2260
+ "eval_runtime": 5.8358,
2261
+ "eval_samples_per_second": 49.008,
2262
+ "eval_steps_per_second": 3.084,
2263
+ "step": 4413
2264
+ },
2265
+ {
2266
+ "epoch": 182.97,
2267
+ "eval_accuracy": 0.7832167832167832,
2268
+ "eval_loss": 0.7143127918243408,
2269
+ "eval_runtime": 6.1229,
2270
+ "eval_samples_per_second": 46.71,
2271
+ "eval_steps_per_second": 2.94,
2272
+ "step": 4437
2273
+ },
2274
+ {
2275
+ "epoch": 183.51,
2276
+ "grad_norm": 1.0936890840530396,
2277
+ "learning_rate": 8.622222222222221e-06,
2278
+ "loss": 0.2109,
2279
+ "step": 4450
2280
+ },
2281
+ {
2282
+ "epoch": 184.0,
2283
+ "eval_accuracy": 0.7797202797202797,
2284
+ "eval_loss": 0.7406834363937378,
2285
+ "eval_runtime": 5.0467,
2286
+ "eval_samples_per_second": 56.671,
2287
+ "eval_steps_per_second": 3.567,
2288
+ "step": 4462
2289
+ },
2290
+ {
2291
+ "epoch": 184.99,
2292
+ "eval_accuracy": 0.7797202797202797,
2293
+ "eval_loss": 0.7053534388542175,
2294
+ "eval_runtime": 5.279,
2295
+ "eval_samples_per_second": 54.177,
2296
+ "eval_steps_per_second": 3.41,
2297
+ "step": 4486
2298
+ },
2299
+ {
2300
+ "epoch": 185.57,
2301
+ "grad_norm": 2.9350059032440186,
2302
+ "learning_rate": 8.344444444444445e-06,
2303
+ "loss": 0.2261,
2304
+ "step": 4500
2305
+ },
2306
+ {
2307
+ "epoch": 185.98,
2308
+ "eval_accuracy": 0.7727272727272727,
2309
+ "eval_loss": 0.7260809540748596,
2310
+ "eval_runtime": 5.4165,
2311
+ "eval_samples_per_second": 52.802,
2312
+ "eval_steps_per_second": 3.323,
2313
+ "step": 4510
2314
+ },
2315
+ {
2316
+ "epoch": 186.97,
2317
+ "eval_accuracy": 0.7902097902097902,
2318
+ "eval_loss": 0.7240064144134521,
2319
+ "eval_runtime": 5.4866,
2320
+ "eval_samples_per_second": 52.127,
2321
+ "eval_steps_per_second": 3.281,
2322
+ "step": 4534
2323
+ },
2324
+ {
2325
+ "epoch": 187.63,
2326
+ "grad_norm": 1.8322782516479492,
2327
+ "learning_rate": 8.066666666666667e-06,
2328
+ "loss": 0.2282,
2329
+ "step": 4550
2330
+ },
2331
+ {
2332
+ "epoch": 188.0,
2333
+ "eval_accuracy": 0.7867132867132867,
2334
+ "eval_loss": 0.7199599146842957,
2335
+ "eval_runtime": 4.6736,
2336
+ "eval_samples_per_second": 61.195,
2337
+ "eval_steps_per_second": 3.851,
2338
+ "step": 4559
2339
+ },
2340
+ {
2341
+ "epoch": 188.99,
2342
+ "eval_accuracy": 0.7797202797202797,
2343
+ "eval_loss": 0.7102844715118408,
2344
+ "eval_runtime": 5.4219,
2345
+ "eval_samples_per_second": 52.749,
2346
+ "eval_steps_per_second": 3.32,
2347
+ "step": 4583
2348
+ },
2349
+ {
2350
+ "epoch": 189.69,
2351
+ "grad_norm": 1.8777916431427002,
2352
+ "learning_rate": 7.78888888888889e-06,
2353
+ "loss": 0.2321,
2354
+ "step": 4600
2355
+ },
2356
+ {
2357
+ "epoch": 189.98,
2358
+ "eval_accuracy": 0.7797202797202797,
2359
+ "eval_loss": 0.7083376049995422,
2360
+ "eval_runtime": 5.9634,
2361
+ "eval_samples_per_second": 47.959,
2362
+ "eval_steps_per_second": 3.018,
2363
+ "step": 4607
2364
+ },
2365
+ {
2366
+ "epoch": 190.97,
2367
+ "eval_accuracy": 0.7832167832167832,
2368
+ "eval_loss": 0.7244677543640137,
2369
+ "eval_runtime": 5.2078,
2370
+ "eval_samples_per_second": 54.918,
2371
+ "eval_steps_per_second": 3.456,
2372
+ "step": 4631
2373
+ },
2374
+ {
2375
+ "epoch": 191.75,
2376
+ "grad_norm": 1.5277408361434937,
2377
+ "learning_rate": 7.5111111111111105e-06,
2378
+ "loss": 0.2261,
2379
+ "step": 4650
2380
+ },
2381
+ {
2382
+ "epoch": 192.0,
2383
+ "eval_accuracy": 0.7867132867132867,
2384
+ "eval_loss": 0.7124583721160889,
2385
+ "eval_runtime": 5.7079,
2386
+ "eval_samples_per_second": 50.106,
2387
+ "eval_steps_per_second": 3.154,
2388
+ "step": 4656
2389
+ },
2390
+ {
2391
+ "epoch": 192.99,
2392
+ "eval_accuracy": 0.7867132867132867,
2393
+ "eval_loss": 0.7308976054191589,
2394
+ "eval_runtime": 5.3404,
2395
+ "eval_samples_per_second": 53.554,
2396
+ "eval_steps_per_second": 3.371,
2397
+ "step": 4680
2398
+ },
2399
+ {
2400
+ "epoch": 193.81,
2401
+ "grad_norm": 2.095749616622925,
2402
+ "learning_rate": 7.233333333333333e-06,
2403
+ "loss": 0.2231,
2404
+ "step": 4700
2405
+ },
2406
+ {
2407
+ "epoch": 193.98,
2408
+ "eval_accuracy": 0.7832167832167832,
2409
+ "eval_loss": 0.7237818837165833,
2410
+ "eval_runtime": 4.6666,
2411
+ "eval_samples_per_second": 61.286,
2412
+ "eval_steps_per_second": 3.857,
2413
+ "step": 4704
2414
+ },
2415
+ {
2416
+ "epoch": 194.97,
2417
+ "eval_accuracy": 0.7832167832167832,
2418
+ "eval_loss": 0.7253320217132568,
2419
+ "eval_runtime": 5.8059,
2420
+ "eval_samples_per_second": 49.261,
2421
+ "eval_steps_per_second": 3.1,
2422
+ "step": 4728
2423
+ },
2424
+ {
2425
+ "epoch": 195.88,
2426
+ "grad_norm": 1.6955636739730835,
2427
+ "learning_rate": 6.955555555555556e-06,
2428
+ "loss": 0.2083,
2429
+ "step": 4750
2430
+ },
2431
+ {
2432
+ "epoch": 196.0,
2433
+ "eval_accuracy": 0.7832167832167832,
2434
+ "eval_loss": 0.7240011692047119,
2435
+ "eval_runtime": 6.0767,
2436
+ "eval_samples_per_second": 47.065,
2437
+ "eval_steps_per_second": 2.962,
2438
+ "step": 4753
2439
+ },
2440
+ {
2441
+ "epoch": 196.99,
2442
+ "eval_accuracy": 0.7832167832167832,
2443
+ "eval_loss": 0.7131750583648682,
2444
+ "eval_runtime": 5.3063,
2445
+ "eval_samples_per_second": 53.898,
2446
+ "eval_steps_per_second": 3.392,
2447
+ "step": 4777
2448
+ },
2449
+ {
2450
+ "epoch": 197.94,
2451
+ "grad_norm": 0.8933289051055908,
2452
+ "learning_rate": 6.677777777777778e-06,
2453
+ "loss": 0.2116,
2454
+ "step": 4800
2455
+ },
2456
+ {
2457
+ "epoch": 197.98,
2458
+ "eval_accuracy": 0.7867132867132867,
2459
+ "eval_loss": 0.7169559597969055,
2460
+ "eval_runtime": 5.5713,
2461
+ "eval_samples_per_second": 51.335,
2462
+ "eval_steps_per_second": 3.231,
2463
+ "step": 4801
2464
+ },
2465
+ {
2466
+ "epoch": 198.97,
2467
+ "eval_accuracy": 0.7832167832167832,
2468
+ "eval_loss": 0.7265609502792358,
2469
+ "eval_runtime": 4.1397,
2470
+ "eval_samples_per_second": 69.087,
2471
+ "eval_steps_per_second": 4.348,
2472
+ "step": 4825
2473
+ },
2474
+ {
2475
+ "epoch": 200.0,
2476
+ "grad_norm": 2.175414562225342,
2477
+ "learning_rate": 6.4000000000000006e-06,
2478
+ "loss": 0.2219,
2479
+ "step": 4850
2480
+ },
2481
+ {
2482
+ "epoch": 200.0,
2483
+ "eval_accuracy": 0.7832167832167832,
2484
+ "eval_loss": 0.7162622213363647,
2485
+ "eval_runtime": 5.2016,
2486
+ "eval_samples_per_second": 54.984,
2487
+ "eval_steps_per_second": 3.461,
2488
+ "step": 4850
2489
+ },
2490
+ {
2491
+ "epoch": 200.99,
2492
+ "eval_accuracy": 0.7797202797202797,
2493
+ "eval_loss": 0.7302048802375793,
2494
+ "eval_runtime": 4.9222,
2495
+ "eval_samples_per_second": 58.104,
2496
+ "eval_steps_per_second": 3.657,
2497
+ "step": 4874
2498
+ },
2499
+ {
2500
+ "epoch": 201.98,
2501
+ "eval_accuracy": 0.7832167832167832,
2502
+ "eval_loss": 0.7223746180534363,
2503
+ "eval_runtime": 4.6884,
2504
+ "eval_samples_per_second": 61.002,
2505
+ "eval_steps_per_second": 3.839,
2506
+ "step": 4898
2507
+ },
2508
+ {
2509
+ "epoch": 202.06,
2510
+ "grad_norm": 2.053739309310913,
2511
+ "learning_rate": 6.1222222222222224e-06,
2512
+ "loss": 0.2183,
2513
+ "step": 4900
2514
+ },
2515
+ {
2516
+ "epoch": 202.97,
2517
+ "eval_accuracy": 0.7797202797202797,
2518
+ "eval_loss": 0.7179226279258728,
2519
+ "eval_runtime": 4.5556,
2520
+ "eval_samples_per_second": 62.78,
2521
+ "eval_steps_per_second": 3.951,
2522
+ "step": 4922
2523
+ },
2524
+ {
2525
+ "epoch": 204.0,
2526
+ "eval_accuracy": 0.7797202797202797,
2527
+ "eval_loss": 0.7245286107063293,
2528
+ "eval_runtime": 5.7474,
2529
+ "eval_samples_per_second": 49.762,
2530
+ "eval_steps_per_second": 3.132,
2531
+ "step": 4947
2532
+ },
2533
+ {
2534
+ "epoch": 204.12,
2535
+ "grad_norm": 1.1081063747406006,
2536
+ "learning_rate": 5.844444444444444e-06,
2537
+ "loss": 0.2053,
2538
+ "step": 4950
2539
+ },
2540
+ {
2541
+ "epoch": 204.99,
2542
+ "eval_accuracy": 0.7832167832167832,
2543
+ "eval_loss": 0.7344977259635925,
2544
+ "eval_runtime": 5.4178,
2545
+ "eval_samples_per_second": 52.789,
2546
+ "eval_steps_per_second": 3.322,
2547
+ "step": 4971
2548
+ },
2549
+ {
2550
+ "epoch": 205.98,
2551
+ "eval_accuracy": 0.7832167832167832,
2552
+ "eval_loss": 0.7249557971954346,
2553
+ "eval_runtime": 5.6352,
2554
+ "eval_samples_per_second": 50.753,
2555
+ "eval_steps_per_second": 3.194,
2556
+ "step": 4995
2557
+ },
2558
+ {
2559
+ "epoch": 206.19,
2560
+ "grad_norm": 1.09213125705719,
2561
+ "learning_rate": 5.566666666666667e-06,
2562
+ "loss": 0.2113,
2563
+ "step": 5000
2564
+ },
2565
+ {
2566
+ "epoch": 206.97,
2567
+ "eval_accuracy": 0.7832167832167832,
2568
+ "eval_loss": 0.7246001958847046,
2569
+ "eval_runtime": 4.9071,
2570
+ "eval_samples_per_second": 58.283,
2571
+ "eval_steps_per_second": 3.668,
2572
+ "step": 5019
2573
+ },
2574
+ {
2575
+ "epoch": 208.0,
2576
+ "eval_accuracy": 0.7867132867132867,
2577
+ "eval_loss": 0.7270117998123169,
2578
+ "eval_runtime": 5.8385,
2579
+ "eval_samples_per_second": 48.985,
2580
+ "eval_steps_per_second": 3.083,
2581
+ "step": 5044
2582
+ },
2583
+ {
2584
+ "epoch": 208.25,
2585
+ "grad_norm": 1.6693130731582642,
2586
+ "learning_rate": 5.288888888888889e-06,
2587
+ "loss": 0.2152,
2588
+ "step": 5050
2589
+ },
2590
+ {
2591
+ "epoch": 208.99,
2592
+ "eval_accuracy": 0.7867132867132867,
2593
+ "eval_loss": 0.7285901308059692,
2594
+ "eval_runtime": 5.489,
2595
+ "eval_samples_per_second": 52.104,
2596
+ "eval_steps_per_second": 3.279,
2597
+ "step": 5068
2598
+ },
2599
+ {
2600
+ "epoch": 209.98,
2601
+ "eval_accuracy": 0.7797202797202797,
2602
+ "eval_loss": 0.7332947254180908,
2603
+ "eval_runtime": 5.3017,
2604
+ "eval_samples_per_second": 53.945,
2605
+ "eval_steps_per_second": 3.395,
2606
+ "step": 5092
2607
+ },
2608
+ {
2609
+ "epoch": 210.31,
2610
+ "grad_norm": 2.0511515140533447,
2611
+ "learning_rate": 5.011111111111112e-06,
2612
+ "loss": 0.2129,
2613
+ "step": 5100
2614
+ },
2615
+ {
2616
+ "epoch": 210.97,
2617
+ "eval_accuracy": 0.7797202797202797,
2618
+ "eval_loss": 0.7307863831520081,
2619
+ "eval_runtime": 5.2991,
2620
+ "eval_samples_per_second": 53.971,
2621
+ "eval_steps_per_second": 3.397,
2622
+ "step": 5116
2623
+ },
2624
+ {
2625
+ "epoch": 212.0,
2626
+ "eval_accuracy": 0.7797202797202797,
2627
+ "eval_loss": 0.7176437973976135,
2628
+ "eval_runtime": 4.9452,
2629
+ "eval_samples_per_second": 57.834,
2630
+ "eval_steps_per_second": 3.64,
2631
+ "step": 5141
2632
+ },
2633
+ {
2634
+ "epoch": 212.37,
2635
+ "grad_norm": 1.8491023778915405,
2636
+ "learning_rate": 4.7333333333333335e-06,
2637
+ "loss": 0.2173,
2638
+ "step": 5150
2639
+ },
2640
+ {
2641
+ "epoch": 212.99,
2642
+ "eval_accuracy": 0.7832167832167832,
2643
+ "eval_loss": 0.7334882020950317,
2644
+ "eval_runtime": 4.9602,
2645
+ "eval_samples_per_second": 57.659,
2646
+ "eval_steps_per_second": 3.629,
2647
+ "step": 5165
2648
+ },
2649
+ {
2650
+ "epoch": 213.98,
2651
+ "eval_accuracy": 0.7797202797202797,
2652
+ "eval_loss": 0.7268483638763428,
2653
+ "eval_runtime": 5.885,
2654
+ "eval_samples_per_second": 48.598,
2655
+ "eval_steps_per_second": 3.059,
2656
+ "step": 5189
2657
+ },
2658
+ {
2659
+ "epoch": 214.43,
2660
+ "grad_norm": 1.2067769765853882,
2661
+ "learning_rate": 4.455555555555556e-06,
2662
+ "loss": 0.2042,
2663
+ "step": 5200
2664
+ },
2665
+ {
2666
+ "epoch": 214.97,
2667
+ "eval_accuracy": 0.7902097902097902,
2668
+ "eval_loss": 0.7299237847328186,
2669
+ "eval_runtime": 5.7645,
2670
+ "eval_samples_per_second": 49.614,
2671
+ "eval_steps_per_second": 3.123,
2672
+ "step": 5213
2673
+ },
2674
+ {
2675
+ "epoch": 216.0,
2676
+ "eval_accuracy": 0.7902097902097902,
2677
+ "eval_loss": 0.7360625863075256,
2678
+ "eval_runtime": 4.7143,
2679
+ "eval_samples_per_second": 60.667,
2680
+ "eval_steps_per_second": 3.818,
2681
+ "step": 5238
2682
+ },
2683
+ {
2684
+ "epoch": 216.49,
2685
+ "grad_norm": 1.3863427639007568,
2686
+ "learning_rate": 4.177777777777777e-06,
2687
+ "loss": 0.2112,
2688
+ "step": 5250
2689
+ },
2690
+ {
2691
+ "epoch": 216.99,
2692
+ "eval_accuracy": 0.7902097902097902,
2693
+ "eval_loss": 0.723866879940033,
2694
+ "eval_runtime": 5.3445,
2695
+ "eval_samples_per_second": 53.513,
2696
+ "eval_steps_per_second": 3.368,
2697
+ "step": 5262
2698
+ },
2699
+ {
2700
+ "epoch": 217.98,
2701
+ "eval_accuracy": 0.7832167832167832,
2702
+ "eval_loss": 0.7252445220947266,
2703
+ "eval_runtime": 4.6314,
2704
+ "eval_samples_per_second": 61.753,
2705
+ "eval_steps_per_second": 3.887,
2706
+ "step": 5286
2707
+ },
2708
+ {
2709
+ "epoch": 218.56,
2710
+ "grad_norm": 1.1177924871444702,
2711
+ "learning_rate": 3.9e-06,
2712
+ "loss": 0.2007,
2713
+ "step": 5300
2714
+ },
2715
+ {
2716
+ "epoch": 218.97,
2717
+ "eval_accuracy": 0.7867132867132867,
2718
+ "eval_loss": 0.719983696937561,
2719
+ "eval_runtime": 4.865,
2720
+ "eval_samples_per_second": 58.787,
2721
+ "eval_steps_per_second": 3.7,
2722
+ "step": 5310
2723
+ },
2724
+ {
2725
+ "epoch": 220.0,
2726
+ "eval_accuracy": 0.7867132867132867,
2727
+ "eval_loss": 0.7195786237716675,
2728
+ "eval_runtime": 5.5422,
2729
+ "eval_samples_per_second": 51.604,
2730
+ "eval_steps_per_second": 3.248,
2731
+ "step": 5335
2732
+ },
2733
+ {
2734
+ "epoch": 220.62,
2735
+ "grad_norm": 1.413304090499878,
2736
+ "learning_rate": 3.6222222222222226e-06,
2737
+ "loss": 0.2163,
2738
+ "step": 5350
2739
+ },
2740
+ {
2741
+ "epoch": 220.99,
2742
+ "eval_accuracy": 0.7902097902097902,
2743
+ "eval_loss": 0.7309580445289612,
2744
+ "eval_runtime": 5.2512,
2745
+ "eval_samples_per_second": 54.463,
2746
+ "eval_steps_per_second": 3.428,
2747
+ "step": 5359
2748
+ },
2749
+ {
2750
+ "epoch": 221.98,
2751
+ "eval_accuracy": 0.7867132867132867,
2752
+ "eval_loss": 0.7313971519470215,
2753
+ "eval_runtime": 5.1151,
2754
+ "eval_samples_per_second": 55.913,
2755
+ "eval_steps_per_second": 3.519,
2756
+ "step": 5383
2757
+ },
2758
+ {
2759
+ "epoch": 222.68,
2760
+ "grad_norm": 3.0471901893615723,
2761
+ "learning_rate": 3.3444444444444445e-06,
2762
+ "loss": 0.2141,
2763
+ "step": 5400
2764
+ },
2765
+ {
2766
+ "epoch": 222.97,
2767
+ "eval_accuracy": 0.7832167832167832,
2768
+ "eval_loss": 0.727938175201416,
2769
+ "eval_runtime": 4.6405,
2770
+ "eval_samples_per_second": 61.631,
2771
+ "eval_steps_per_second": 3.879,
2772
+ "step": 5407
2773
+ },
2774
+ {
2775
+ "epoch": 224.0,
2776
+ "eval_accuracy": 0.7902097902097902,
2777
+ "eval_loss": 0.725923478603363,
2778
+ "eval_runtime": 5.0906,
2779
+ "eval_samples_per_second": 56.182,
2780
+ "eval_steps_per_second": 3.536,
2781
+ "step": 5432
2782
+ }
2783
+ ],
2784
+ "logging_steps": 50,
2785
+ "max_steps": 6000,
2786
+ "num_input_tokens_seen": 0,
2787
+ "num_train_epochs": 250,
2788
+ "save_steps": 500,
2789
+ "total_flos": 3.037085846065152e+18,
2790
+ "train_batch_size": 16,
2791
+ "trial_name": null,
2792
+ "trial_params": null
2793
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3218c866c7b253f5d3295edcd44b4197864747f68600a16bb0f3d6f506131fb
3
+ size 4984