DuongTrongChi commited on
Commit
aede69f
·
verified ·
1 Parent(s): c1a340a

Training in progress, step 189, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": "unsloth",
22
  "target_modules": [
23
- "q_proj",
24
  "k_proj",
25
- "o_proj",
26
- "gate_proj",
27
  "v_proj",
 
 
28
  "down_proj",
29
- "up_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": "unsloth",
22
  "target_modules": [
 
23
  "k_proj",
24
+ "q_proj",
 
25
  "v_proj",
26
+ "o_proj",
27
+ "up_proj",
28
  "down_proj",
29
+ "gate_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c2c77e1faef5360edb86fdd675b75189f7974b272c230d2370205fe561086d7
3
  size 60010048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e625bcbda58a2077a2f53862e647a4496991959b8e386df7a79f732236f18e9f
3
  size 60010048
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a19147b8c5bee718d325cc69e052a9d7b1c02ff81e02f720da3c271048875c83
3
  size 30427860
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03e4632d18d4e91d695ae7e07e8db236469a783bace61d269def1460bacaf75a
3
  size 30427860
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:584ad9e5089f3d6aa8e9e8b89c55f8d862ea0d388ff3164ca7b9ce670f8a69af
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96200d4ca8dc27a2feac013eb62dd34ec99b5598bcdd66554ba3bfa0cfc0128f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:975a394e3f3cbbd67dfc86dc023c0bf94ad366845e30e55347ecdc023963d2d7
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5aa8b796ae61fd6c2a8cd520f49549ae764a41fd16083782a810533cbf63c112
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,142 +1,1339 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.03950617283950617,
5
  "eval_steps": 500,
6
- "global_step": 18,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0021947873799725653,
13
- "grad_norm": 0.5619140863418579,
14
- "learning_rate": 5.000000000000001e-07,
15
- "loss": 2.0945,
16
  "step": 1
17
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  {
19
  "epoch": 0.0043895747599451305,
20
- "grad_norm": 0.5982444286346436,
 
 
 
 
 
 
 
21
  "learning_rate": 1.0000000000000002e-06,
22
- "loss": 2.1405,
23
- "step": 2
24
  },
25
  {
26
  "epoch": 0.006584362139917695,
27
- "grad_norm": 0.5753356218338013,
28
- "learning_rate": 1.5e-06,
29
- "loss": 2.0972,
30
- "step": 3
 
 
 
 
 
 
 
31
  },
32
  {
33
  "epoch": 0.008779149519890261,
34
- "grad_norm": 0.6162204146385193,
35
- "learning_rate": 2.0000000000000003e-06,
36
- "loss": 2.1892,
37
- "step": 4
 
 
 
 
 
 
 
38
  },
39
  {
40
  "epoch": 0.010973936899862825,
41
- "grad_norm": 0.6353598833084106,
42
- "learning_rate": 2.5e-06,
43
- "loss": 2.1669,
44
- "step": 5
 
 
 
 
 
 
 
45
  },
46
  {
47
  "epoch": 0.01316872427983539,
48
- "grad_norm": 0.5805812478065491,
49
- "learning_rate": 3e-06,
50
- "loss": 2.1406,
51
- "step": 6
 
 
 
 
 
 
 
52
  },
53
  {
54
  "epoch": 0.015363511659807956,
55
- "grad_norm": 0.6394542455673218,
56
- "learning_rate": 3.5000000000000004e-06,
57
- "loss": 2.2199,
58
- "step": 7
 
 
 
 
 
 
 
59
  },
60
  {
61
  "epoch": 0.017558299039780522,
62
- "grad_norm": 0.566260039806366,
63
- "learning_rate": 4.000000000000001e-06,
64
- "loss": 2.0736,
65
- "step": 8
 
 
 
 
 
 
 
66
  },
67
  {
68
  "epoch": 0.019753086419753086,
69
- "grad_norm": 0.5819919109344482,
70
- "learning_rate": 4.5e-06,
71
- "loss": 2.1244,
72
- "step": 9
 
 
 
 
 
 
 
73
  },
74
  {
75
  "epoch": 0.02194787379972565,
76
- "grad_norm": 0.6622738242149353,
77
- "learning_rate": 5e-06,
78
- "loss": 2.165,
79
- "step": 10
 
 
 
 
 
 
 
80
  },
81
  {
82
  "epoch": 0.024142661179698217,
83
- "grad_norm": 0.5929241180419922,
84
- "learning_rate": 5.500000000000001e-06,
85
- "loss": 2.0738,
86
- "step": 11
 
 
 
 
 
 
 
87
  },
88
  {
89
  "epoch": 0.02633744855967078,
90
- "grad_norm": 0.6357274651527405,
91
- "learning_rate": 6e-06,
92
- "loss": 2.1277,
93
- "step": 12
 
 
 
 
 
 
 
94
  },
95
  {
96
  "epoch": 0.02853223593964335,
97
- "grad_norm": 0.612406313419342,
98
- "learning_rate": 6.5000000000000004e-06,
99
- "loss": 2.1812,
100
- "step": 13
 
 
 
 
 
 
 
101
  },
102
  {
103
  "epoch": 0.030727023319615913,
104
- "grad_norm": 0.5351918339729309,
105
- "learning_rate": 7.000000000000001e-06,
106
- "loss": 2.0624,
107
- "step": 14
 
 
 
 
 
 
 
108
  },
109
  {
110
  "epoch": 0.03292181069958848,
111
- "grad_norm": 0.5575801730155945,
112
- "learning_rate": 7.5e-06,
113
- "loss": 2.0872,
114
- "step": 15
 
 
 
 
 
 
 
115
  },
116
  {
117
  "epoch": 0.035116598079561044,
118
- "grad_norm": 0.5277993679046631,
119
- "learning_rate": 8.000000000000001e-06,
120
- "loss": 2.0443,
121
- "step": 16
 
 
 
 
 
 
 
122
  },
123
  {
124
  "epoch": 0.03731138545953361,
125
- "grad_norm": 0.5522028207778931,
126
- "learning_rate": 8.500000000000002e-06,
127
- "loss": 2.1187,
128
- "step": 17
 
 
 
 
 
 
 
129
  },
130
  {
131
  "epoch": 0.03950617283950617,
132
- "grad_norm": 0.5474849343299866,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  "learning_rate": 9e-06,
134
- "loss": 2.1184,
135
- "step": 18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  }
137
  ],
138
  "logging_steps": 1,
139
- "max_steps": 455,
140
  "num_input_tokens_seen": 0,
141
  "num_train_epochs": 1,
142
  "save_steps": 1,
@@ -152,7 +1349,7 @@
152
  "attributes": {}
153
  }
154
  },
155
- "total_flos": 2.186071270264013e+16,
156
  "train_batch_size": 4,
157
  "trial_name": null,
158
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.2074074074074074,
5
  "eval_steps": 500,
6
+ "global_step": 189,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0010973936899862826,
13
+ "grad_norm": 0.5590636134147644,
14
+ "learning_rate": 2.0000000000000002e-07,
15
+ "loss": 2.0815,
16
  "step": 1
17
  },
18
+ {
19
+ "epoch": 0.0021947873799725653,
20
+ "grad_norm": 0.5765010714530945,
21
+ "learning_rate": 4.0000000000000003e-07,
22
+ "loss": 2.1074,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.0032921810699588477,
27
+ "grad_norm": 0.584500253200531,
28
+ "learning_rate": 6.000000000000001e-07,
29
+ "loss": 2.1012,
30
+ "step": 3
31
+ },
32
  {
33
  "epoch": 0.0043895747599451305,
34
+ "grad_norm": 0.6246315836906433,
35
+ "learning_rate": 8.000000000000001e-07,
36
+ "loss": 2.1795,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.0054869684499314125,
41
+ "grad_norm": 0.5558974146842957,
42
  "learning_rate": 1.0000000000000002e-06,
43
+ "loss": 2.0729,
44
+ "step": 5
45
  },
46
  {
47
  "epoch": 0.006584362139917695,
48
+ "grad_norm": 0.6098957657814026,
49
+ "learning_rate": 1.2000000000000002e-06,
50
+ "loss": 2.1216,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.007681755829903978,
55
+ "grad_norm": 0.5948249697685242,
56
+ "learning_rate": 1.4000000000000001e-06,
57
+ "loss": 2.1381,
58
+ "step": 7
59
  },
60
  {
61
  "epoch": 0.008779149519890261,
62
+ "grad_norm": 0.6547859311103821,
63
+ "learning_rate": 1.6000000000000001e-06,
64
+ "loss": 2.2398,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.009876543209876543,
69
+ "grad_norm": 0.6762146353721619,
70
+ "learning_rate": 1.8000000000000001e-06,
71
+ "loss": 2.1872,
72
+ "step": 9
73
  },
74
  {
75
  "epoch": 0.010973936899862825,
76
+ "grad_norm": 0.6071422100067139,
77
+ "learning_rate": 2.0000000000000003e-06,
78
+ "loss": 2.1451,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.012071330589849109,
83
+ "grad_norm": 0.5874962210655212,
84
+ "learning_rate": 2.2e-06,
85
+ "loss": 2.1632,
86
+ "step": 11
87
  },
88
  {
89
  "epoch": 0.01316872427983539,
90
+ "grad_norm": 0.5862544775009155,
91
+ "learning_rate": 2.4000000000000003e-06,
92
+ "loss": 2.1152,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 0.014266117969821674,
97
+ "grad_norm": 0.614499032497406,
98
+ "learning_rate": 2.6e-06,
99
+ "loss": 2.1906,
100
+ "step": 13
101
  },
102
  {
103
  "epoch": 0.015363511659807956,
104
+ "grad_norm": 0.6741944551467896,
105
+ "learning_rate": 2.8000000000000003e-06,
106
+ "loss": 2.2436,
107
+ "step": 14
108
+ },
109
+ {
110
+ "epoch": 0.01646090534979424,
111
+ "grad_norm": 0.5827784538269043,
112
+ "learning_rate": 3e-06,
113
+ "loss": 2.0787,
114
+ "step": 15
115
  },
116
  {
117
  "epoch": 0.017558299039780522,
118
+ "grad_norm": 0.5573399066925049,
119
+ "learning_rate": 3.2000000000000003e-06,
120
+ "loss": 2.0608,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.018655692729766804,
125
+ "grad_norm": 0.5720248818397522,
126
+ "learning_rate": 3.4000000000000005e-06,
127
+ "loss": 2.1152,
128
+ "step": 17
129
  },
130
  {
131
  "epoch": 0.019753086419753086,
132
+ "grad_norm": 0.5938870310783386,
133
+ "learning_rate": 3.6000000000000003e-06,
134
+ "loss": 2.1217,
135
+ "step": 18
136
+ },
137
+ {
138
+ "epoch": 0.020850480109739368,
139
+ "grad_norm": 0.6471643447875977,
140
+ "learning_rate": 3.8000000000000005e-06,
141
+ "loss": 2.1649,
142
+ "step": 19
143
  },
144
  {
145
  "epoch": 0.02194787379972565,
146
+ "grad_norm": 0.6734815835952759,
147
+ "learning_rate": 4.000000000000001e-06,
148
+ "loss": 2.1458,
149
+ "step": 20
150
+ },
151
+ {
152
+ "epoch": 0.023045267489711935,
153
+ "grad_norm": 0.5911048054695129,
154
+ "learning_rate": 4.2000000000000004e-06,
155
+ "loss": 2.0573,
156
+ "step": 21
157
  },
158
  {
159
  "epoch": 0.024142661179698217,
160
+ "grad_norm": 0.5842518210411072,
161
+ "learning_rate": 4.4e-06,
162
+ "loss": 2.0675,
163
+ "step": 22
164
+ },
165
+ {
166
+ "epoch": 0.0252400548696845,
167
+ "grad_norm": 0.6548774242401123,
168
+ "learning_rate": 4.600000000000001e-06,
169
+ "loss": 2.1,
170
+ "step": 23
171
  },
172
  {
173
  "epoch": 0.02633744855967078,
174
+ "grad_norm": 0.5978567600250244,
175
+ "learning_rate": 4.800000000000001e-06,
176
+ "loss": 2.1234,
177
+ "step": 24
178
+ },
179
+ {
180
+ "epoch": 0.027434842249657063,
181
+ "grad_norm": 0.6166532635688782,
182
+ "learning_rate": 5e-06,
183
+ "loss": 2.1784,
184
+ "step": 25
185
  },
186
  {
187
  "epoch": 0.02853223593964335,
188
+ "grad_norm": 0.5846672058105469,
189
+ "learning_rate": 5.2e-06,
190
+ "loss": 2.1453,
191
+ "step": 26
192
+ },
193
+ {
194
+ "epoch": 0.02962962962962963,
195
+ "grad_norm": 0.5325105786323547,
196
+ "learning_rate": 5.400000000000001e-06,
197
+ "loss": 2.0348,
198
+ "step": 27
199
  },
200
  {
201
  "epoch": 0.030727023319615913,
202
+ "grad_norm": 0.5196258425712585,
203
+ "learning_rate": 5.600000000000001e-06,
204
+ "loss": 2.0484,
205
+ "step": 28
206
+ },
207
+ {
208
+ "epoch": 0.031824417009602195,
209
+ "grad_norm": 0.5424355268478394,
210
+ "learning_rate": 5.8e-06,
211
+ "loss": 2.0728,
212
+ "step": 29
213
  },
214
  {
215
  "epoch": 0.03292181069958848,
216
+ "grad_norm": 0.5482433438301086,
217
+ "learning_rate": 6e-06,
218
+ "loss": 2.051,
219
+ "step": 30
220
+ },
221
+ {
222
+ "epoch": 0.03401920438957476,
223
+ "grad_norm": 0.5127721428871155,
224
+ "learning_rate": 6.200000000000001e-06,
225
+ "loss": 2.0508,
226
+ "step": 31
227
  },
228
  {
229
  "epoch": 0.035116598079561044,
230
+ "grad_norm": 0.5115760564804077,
231
+ "learning_rate": 6.4000000000000006e-06,
232
+ "loss": 1.9822,
233
+ "step": 32
234
+ },
235
+ {
236
+ "epoch": 0.03621399176954732,
237
+ "grad_norm": 0.5571993589401245,
238
+ "learning_rate": 6.600000000000001e-06,
239
+ "loss": 2.0929,
240
+ "step": 33
241
  },
242
  {
243
  "epoch": 0.03731138545953361,
244
+ "grad_norm": 0.5175538063049316,
245
+ "learning_rate": 6.800000000000001e-06,
246
+ "loss": 2.0776,
247
+ "step": 34
248
+ },
249
+ {
250
+ "epoch": 0.038408779149519894,
251
+ "grad_norm": 0.5197016596794128,
252
+ "learning_rate": 7e-06,
253
+ "loss": 2.0747,
254
+ "step": 35
255
  },
256
  {
257
  "epoch": 0.03950617283950617,
258
+ "grad_norm": 0.5387422442436218,
259
+ "learning_rate": 7.2000000000000005e-06,
260
+ "loss": 2.0864,
261
+ "step": 36
262
+ },
263
+ {
264
+ "epoch": 0.04060356652949246,
265
+ "grad_norm": 0.5122085809707642,
266
+ "learning_rate": 7.4e-06,
267
+ "loss": 2.0737,
268
+ "step": 37
269
+ },
270
+ {
271
+ "epoch": 0.041700960219478736,
272
+ "grad_norm": 0.6023052334785461,
273
+ "learning_rate": 7.600000000000001e-06,
274
+ "loss": 2.0519,
275
+ "step": 38
276
+ },
277
+ {
278
+ "epoch": 0.04279835390946502,
279
+ "grad_norm": 0.5678107738494873,
280
+ "learning_rate": 7.800000000000002e-06,
281
+ "loss": 2.1175,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 0.0438957475994513,
286
+ "grad_norm": 0.5373347401618958,
287
+ "learning_rate": 8.000000000000001e-06,
288
+ "loss": 2.0667,
289
+ "step": 40
290
+ },
291
+ {
292
+ "epoch": 0.044993141289437585,
293
+ "grad_norm": 0.5342459082603455,
294
+ "learning_rate": 8.2e-06,
295
+ "loss": 2.081,
296
+ "step": 41
297
+ },
298
+ {
299
+ "epoch": 0.04609053497942387,
300
+ "grad_norm": 0.6377962231636047,
301
+ "learning_rate": 8.400000000000001e-06,
302
+ "loss": 2.1576,
303
+ "step": 42
304
+ },
305
+ {
306
+ "epoch": 0.04718792866941015,
307
+ "grad_norm": 0.46719589829444885,
308
+ "learning_rate": 8.6e-06,
309
+ "loss": 1.9755,
310
+ "step": 43
311
+ },
312
+ {
313
+ "epoch": 0.048285322359396435,
314
+ "grad_norm": 0.4903099834918976,
315
+ "learning_rate": 8.8e-06,
316
+ "loss": 2.0195,
317
+ "step": 44
318
+ },
319
+ {
320
+ "epoch": 0.04938271604938271,
321
+ "grad_norm": 0.460610955953598,
322
  "learning_rate": 9e-06,
323
+ "loss": 1.9456,
324
+ "step": 45
325
+ },
326
+ {
327
+ "epoch": 0.050480109739369,
328
+ "grad_norm": 0.45545804500579834,
329
+ "learning_rate": 9.200000000000002e-06,
330
+ "loss": 2.0255,
331
+ "step": 46
332
+ },
333
+ {
334
+ "epoch": 0.051577503429355284,
335
+ "grad_norm": 0.4546717405319214,
336
+ "learning_rate": 9.4e-06,
337
+ "loss": 1.9741,
338
+ "step": 47
339
+ },
340
+ {
341
+ "epoch": 0.05267489711934156,
342
+ "grad_norm": 0.46261245012283325,
343
+ "learning_rate": 9.600000000000001e-06,
344
+ "loss": 2.0104,
345
+ "step": 48
346
+ },
347
+ {
348
+ "epoch": 0.05377229080932785,
349
+ "grad_norm": 0.43465015292167664,
350
+ "learning_rate": 9.800000000000001e-06,
351
+ "loss": 1.9356,
352
+ "step": 49
353
+ },
354
+ {
355
+ "epoch": 0.05486968449931413,
356
+ "grad_norm": 0.42450013756752014,
357
+ "learning_rate": 1e-05,
358
+ "loss": 1.9312,
359
+ "step": 50
360
+ },
361
+ {
362
+ "epoch": 0.05596707818930041,
363
+ "grad_norm": 0.4223531484603882,
364
+ "learning_rate": 1.02e-05,
365
+ "loss": 1.9022,
366
+ "step": 51
367
+ },
368
+ {
369
+ "epoch": 0.0570644718792867,
370
+ "grad_norm": 0.39140722155570984,
371
+ "learning_rate": 1.04e-05,
372
+ "loss": 1.9784,
373
+ "step": 52
374
+ },
375
+ {
376
+ "epoch": 0.058161865569272976,
377
+ "grad_norm": 0.4256257712841034,
378
+ "learning_rate": 1.0600000000000002e-05,
379
+ "loss": 1.9278,
380
+ "step": 53
381
+ },
382
+ {
383
+ "epoch": 0.05925925925925926,
384
+ "grad_norm": 0.45769235491752625,
385
+ "learning_rate": 1.0800000000000002e-05,
386
+ "loss": 2.031,
387
+ "step": 54
388
+ },
389
+ {
390
+ "epoch": 0.06035665294924554,
391
+ "grad_norm": 0.4826626181602478,
392
+ "learning_rate": 1.1000000000000001e-05,
393
+ "loss": 1.9647,
394
+ "step": 55
395
+ },
396
+ {
397
+ "epoch": 0.061454046639231825,
398
+ "grad_norm": 0.4414077699184418,
399
+ "learning_rate": 1.1200000000000001e-05,
400
+ "loss": 2.0007,
401
+ "step": 56
402
+ },
403
+ {
404
+ "epoch": 0.06255144032921811,
405
+ "grad_norm": 0.39386674761772156,
406
+ "learning_rate": 1.14e-05,
407
+ "loss": 1.8676,
408
+ "step": 57
409
+ },
410
+ {
411
+ "epoch": 0.06364883401920439,
412
+ "grad_norm": 0.386751264333725,
413
+ "learning_rate": 1.16e-05,
414
+ "loss": 1.9471,
415
+ "step": 58
416
+ },
417
+ {
418
+ "epoch": 0.06474622770919067,
419
+ "grad_norm": 0.3908196985721588,
420
+ "learning_rate": 1.18e-05,
421
+ "loss": 1.9722,
422
+ "step": 59
423
+ },
424
+ {
425
+ "epoch": 0.06584362139917696,
426
+ "grad_norm": 0.39014488458633423,
427
+ "learning_rate": 1.2e-05,
428
+ "loss": 1.9328,
429
+ "step": 60
430
+ },
431
+ {
432
+ "epoch": 0.06694101508916324,
433
+ "grad_norm": 0.3620125651359558,
434
+ "learning_rate": 1.22e-05,
435
+ "loss": 1.9918,
436
+ "step": 61
437
+ },
438
+ {
439
+ "epoch": 0.06803840877914952,
440
+ "grad_norm": 0.37926608324050903,
441
+ "learning_rate": 1.2400000000000002e-05,
442
+ "loss": 1.8233,
443
+ "step": 62
444
+ },
445
+ {
446
+ "epoch": 0.0691358024691358,
447
+ "grad_norm": 0.44235774874687195,
448
+ "learning_rate": 1.2600000000000001e-05,
449
+ "loss": 1.9558,
450
+ "step": 63
451
+ },
452
+ {
453
+ "epoch": 0.07023319615912209,
454
+ "grad_norm": 0.3922639489173889,
455
+ "learning_rate": 1.2800000000000001e-05,
456
+ "loss": 1.993,
457
+ "step": 64
458
+ },
459
+ {
460
+ "epoch": 0.07133058984910837,
461
+ "grad_norm": 0.4201815128326416,
462
+ "learning_rate": 1.3000000000000001e-05,
463
+ "loss": 1.873,
464
+ "step": 65
465
+ },
466
+ {
467
+ "epoch": 0.07242798353909465,
468
+ "grad_norm": 0.3698742389678955,
469
+ "learning_rate": 1.3200000000000002e-05,
470
+ "loss": 1.8728,
471
+ "step": 66
472
+ },
473
+ {
474
+ "epoch": 0.07352537722908094,
475
+ "grad_norm": 0.38322025537490845,
476
+ "learning_rate": 1.3400000000000002e-05,
477
+ "loss": 1.8322,
478
+ "step": 67
479
+ },
480
+ {
481
+ "epoch": 0.07462277091906722,
482
+ "grad_norm": 0.37198516726493835,
483
+ "learning_rate": 1.3600000000000002e-05,
484
+ "loss": 1.932,
485
+ "step": 68
486
+ },
487
+ {
488
+ "epoch": 0.0757201646090535,
489
+ "grad_norm": 0.36407792568206787,
490
+ "learning_rate": 1.38e-05,
491
+ "loss": 1.826,
492
+ "step": 69
493
+ },
494
+ {
495
+ "epoch": 0.07681755829903979,
496
+ "grad_norm": 0.3603726625442505,
497
+ "learning_rate": 1.4e-05,
498
+ "loss": 1.8001,
499
+ "step": 70
500
+ },
501
+ {
502
+ "epoch": 0.07791495198902607,
503
+ "grad_norm": 0.37135443091392517,
504
+ "learning_rate": 1.4200000000000001e-05,
505
+ "loss": 1.8165,
506
+ "step": 71
507
+ },
508
+ {
509
+ "epoch": 0.07901234567901234,
510
+ "grad_norm": 0.37606292963027954,
511
+ "learning_rate": 1.4400000000000001e-05,
512
+ "loss": 1.7207,
513
+ "step": 72
514
+ },
515
+ {
516
+ "epoch": 0.08010973936899862,
517
+ "grad_norm": 0.3831545114517212,
518
+ "learning_rate": 1.46e-05,
519
+ "loss": 1.8692,
520
+ "step": 73
521
+ },
522
+ {
523
+ "epoch": 0.08120713305898491,
524
+ "grad_norm": 0.3911626935005188,
525
+ "learning_rate": 1.48e-05,
526
+ "loss": 1.8687,
527
+ "step": 74
528
+ },
529
+ {
530
+ "epoch": 0.0823045267489712,
531
+ "grad_norm": 0.39615172147750854,
532
+ "learning_rate": 1.5000000000000002e-05,
533
+ "loss": 1.7774,
534
+ "step": 75
535
+ },
536
+ {
537
+ "epoch": 0.08340192043895747,
538
+ "grad_norm": 0.37161362171173096,
539
+ "learning_rate": 1.5200000000000002e-05,
540
+ "loss": 1.723,
541
+ "step": 76
542
+ },
543
+ {
544
+ "epoch": 0.08449931412894376,
545
+ "grad_norm": 0.3870552182197571,
546
+ "learning_rate": 1.54e-05,
547
+ "loss": 1.7314,
548
+ "step": 77
549
+ },
550
+ {
551
+ "epoch": 0.08559670781893004,
552
+ "grad_norm": 0.4143535792827606,
553
+ "learning_rate": 1.5600000000000003e-05,
554
+ "loss": 1.7912,
555
+ "step": 78
556
+ },
557
+ {
558
+ "epoch": 0.08669410150891632,
559
+ "grad_norm": 0.4270377457141876,
560
+ "learning_rate": 1.58e-05,
561
+ "loss": 1.78,
562
+ "step": 79
563
+ },
564
+ {
565
+ "epoch": 0.0877914951989026,
566
+ "grad_norm": 0.4058167636394501,
567
+ "learning_rate": 1.6000000000000003e-05,
568
+ "loss": 1.7228,
569
+ "step": 80
570
+ },
571
+ {
572
+ "epoch": 0.08888888888888889,
573
+ "grad_norm": 0.42461684346199036,
574
+ "learning_rate": 1.62e-05,
575
+ "loss": 1.8259,
576
+ "step": 81
577
+ },
578
+ {
579
+ "epoch": 0.08998628257887517,
580
+ "grad_norm": 0.37542393803596497,
581
+ "learning_rate": 1.64e-05,
582
+ "loss": 1.7579,
583
+ "step": 82
584
+ },
585
+ {
586
+ "epoch": 0.09108367626886145,
587
+ "grad_norm": 0.4218761622905731,
588
+ "learning_rate": 1.66e-05,
589
+ "loss": 1.8108,
590
+ "step": 83
591
+ },
592
+ {
593
+ "epoch": 0.09218106995884774,
594
+ "grad_norm": 0.4039926528930664,
595
+ "learning_rate": 1.6800000000000002e-05,
596
+ "loss": 1.7625,
597
+ "step": 84
598
+ },
599
+ {
600
+ "epoch": 0.09327846364883402,
601
+ "grad_norm": 0.3840247690677643,
602
+ "learning_rate": 1.7e-05,
603
+ "loss": 1.7455,
604
+ "step": 85
605
+ },
606
+ {
607
+ "epoch": 0.0943758573388203,
608
+ "grad_norm": 0.41403114795684814,
609
+ "learning_rate": 1.72e-05,
610
+ "loss": 1.6384,
611
+ "step": 86
612
+ },
613
+ {
614
+ "epoch": 0.09547325102880659,
615
+ "grad_norm": 0.3860199451446533,
616
+ "learning_rate": 1.7400000000000003e-05,
617
+ "loss": 1.6876,
618
+ "step": 87
619
+ },
620
+ {
621
+ "epoch": 0.09657064471879287,
622
+ "grad_norm": 0.39576929807662964,
623
+ "learning_rate": 1.76e-05,
624
+ "loss": 1.7044,
625
+ "step": 88
626
+ },
627
+ {
628
+ "epoch": 0.09766803840877915,
629
+ "grad_norm": 0.4149666130542755,
630
+ "learning_rate": 1.7800000000000002e-05,
631
+ "loss": 1.657,
632
+ "step": 89
633
+ },
634
+ {
635
+ "epoch": 0.09876543209876543,
636
+ "grad_norm": 0.46741801500320435,
637
+ "learning_rate": 1.8e-05,
638
+ "loss": 1.6955,
639
+ "step": 90
640
+ },
641
+ {
642
+ "epoch": 0.09986282578875172,
643
+ "grad_norm": 0.41203179955482483,
644
+ "learning_rate": 1.8200000000000002e-05,
645
+ "loss": 1.6259,
646
+ "step": 91
647
+ },
648
+ {
649
+ "epoch": 0.100960219478738,
650
+ "grad_norm": 0.4069835841655731,
651
+ "learning_rate": 1.8400000000000003e-05,
652
+ "loss": 1.6547,
653
+ "step": 92
654
+ },
655
+ {
656
+ "epoch": 0.10205761316872428,
657
+ "grad_norm": 0.4733927845954895,
658
+ "learning_rate": 1.86e-05,
659
+ "loss": 1.6419,
660
+ "step": 93
661
+ },
662
+ {
663
+ "epoch": 0.10315500685871057,
664
+ "grad_norm": 0.4243912100791931,
665
+ "learning_rate": 1.88e-05,
666
+ "loss": 1.5943,
667
+ "step": 94
668
+ },
669
+ {
670
+ "epoch": 0.10425240054869685,
671
+ "grad_norm": 0.48016536235809326,
672
+ "learning_rate": 1.9e-05,
673
+ "loss": 1.5303,
674
+ "step": 95
675
+ },
676
+ {
677
+ "epoch": 0.10534979423868313,
678
+ "grad_norm": 0.44309377670288086,
679
+ "learning_rate": 1.9200000000000003e-05,
680
+ "loss": 1.5682,
681
+ "step": 96
682
+ },
683
+ {
684
+ "epoch": 0.1064471879286694,
685
+ "grad_norm": 0.45906150341033936,
686
+ "learning_rate": 1.94e-05,
687
+ "loss": 1.5747,
688
+ "step": 97
689
+ },
690
+ {
691
+ "epoch": 0.1075445816186557,
692
+ "grad_norm": 0.4476592540740967,
693
+ "learning_rate": 1.9600000000000002e-05,
694
+ "loss": 1.5563,
695
+ "step": 98
696
+ },
697
+ {
698
+ "epoch": 0.10864197530864197,
699
+ "grad_norm": 0.432974249124527,
700
+ "learning_rate": 1.98e-05,
701
+ "loss": 1.5782,
702
+ "step": 99
703
+ },
704
+ {
705
+ "epoch": 0.10973936899862825,
706
+ "grad_norm": 0.4596545398235321,
707
+ "learning_rate": 2e-05,
708
+ "loss": 1.5268,
709
+ "step": 100
710
+ },
711
+ {
712
+ "epoch": 0.11083676268861455,
713
+ "grad_norm": 0.49297234416007996,
714
+ "learning_rate": 1.997533908754624e-05,
715
+ "loss": 1.4867,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 0.11193415637860082,
720
+ "grad_norm": 0.4385370910167694,
721
+ "learning_rate": 1.995067817509248e-05,
722
+ "loss": 1.5288,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 0.1130315500685871,
727
+ "grad_norm": 0.4075908660888672,
728
+ "learning_rate": 1.992601726263872e-05,
729
+ "loss": 1.5342,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 0.1141289437585734,
734
+ "grad_norm": 0.4361695647239685,
735
+ "learning_rate": 1.990135635018496e-05,
736
+ "loss": 1.4855,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 0.11522633744855967,
741
+ "grad_norm": 0.3582554757595062,
742
+ "learning_rate": 1.9876695437731196e-05,
743
+ "loss": 1.5212,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 0.11632373113854595,
748
+ "grad_norm": 0.39850226044654846,
749
+ "learning_rate": 1.985203452527744e-05,
750
+ "loss": 1.4019,
751
+ "step": 106
752
+ },
753
+ {
754
+ "epoch": 0.11742112482853223,
755
+ "grad_norm": 0.35860705375671387,
756
+ "learning_rate": 1.9827373612823677e-05,
757
+ "loss": 1.401,
758
+ "step": 107
759
+ },
760
+ {
761
+ "epoch": 0.11851851851851852,
762
+ "grad_norm": 0.34796416759490967,
763
+ "learning_rate": 1.9802712700369916e-05,
764
+ "loss": 1.3828,
765
+ "step": 108
766
+ },
767
+ {
768
+ "epoch": 0.1196159122085048,
769
+ "grad_norm": 0.3401014506816864,
770
+ "learning_rate": 1.9778051787916155e-05,
771
+ "loss": 1.3946,
772
+ "step": 109
773
+ },
774
+ {
775
+ "epoch": 0.12071330589849108,
776
+ "grad_norm": 0.36761683225631714,
777
+ "learning_rate": 1.9753390875462394e-05,
778
+ "loss": 1.3895,
779
+ "step": 110
780
+ },
781
+ {
782
+ "epoch": 0.12181069958847737,
783
+ "grad_norm": 0.3479093313217163,
784
+ "learning_rate": 1.9728729963008633e-05,
785
+ "loss": 1.3704,
786
+ "step": 111
787
+ },
788
+ {
789
+ "epoch": 0.12290809327846365,
790
+ "grad_norm": 0.3511699438095093,
791
+ "learning_rate": 1.9704069050554872e-05,
792
+ "loss": 1.4554,
793
+ "step": 112
794
+ },
795
+ {
796
+ "epoch": 0.12400548696844993,
797
+ "grad_norm": 0.3108881413936615,
798
+ "learning_rate": 1.967940813810111e-05,
799
+ "loss": 1.4446,
800
+ "step": 113
801
+ },
802
+ {
803
+ "epoch": 0.12510288065843622,
804
+ "grad_norm": 0.35024315118789673,
805
+ "learning_rate": 1.965474722564735e-05,
806
+ "loss": 1.3689,
807
+ "step": 114
808
+ },
809
+ {
810
+ "epoch": 0.1262002743484225,
811
+ "grad_norm": 0.3867047429084778,
812
+ "learning_rate": 1.9630086313193592e-05,
813
+ "loss": 1.323,
814
+ "step": 115
815
+ },
816
+ {
817
+ "epoch": 0.12729766803840878,
818
+ "grad_norm": 0.3047441244125366,
819
+ "learning_rate": 1.9605425400739828e-05,
820
+ "loss": 1.3858,
821
+ "step": 116
822
+ },
823
+ {
824
+ "epoch": 0.12839506172839507,
825
+ "grad_norm": 0.34278514981269836,
826
+ "learning_rate": 1.9580764488286066e-05,
827
+ "loss": 1.4294,
828
+ "step": 117
829
+ },
830
+ {
831
+ "epoch": 0.12949245541838134,
832
+ "grad_norm": 0.3425813317298889,
833
+ "learning_rate": 1.955610357583231e-05,
834
+ "loss": 1.3426,
835
+ "step": 118
836
+ },
837
+ {
838
+ "epoch": 0.13058984910836763,
839
+ "grad_norm": 0.38938474655151367,
840
+ "learning_rate": 1.9531442663378544e-05,
841
+ "loss": 1.3857,
842
+ "step": 119
843
+ },
844
+ {
845
+ "epoch": 0.13168724279835392,
846
+ "grad_norm": 0.323210746049881,
847
+ "learning_rate": 1.9506781750924787e-05,
848
+ "loss": 1.3922,
849
+ "step": 120
850
+ },
851
+ {
852
+ "epoch": 0.13278463648834019,
853
+ "grad_norm": 0.33866652846336365,
854
+ "learning_rate": 1.9482120838471025e-05,
855
+ "loss": 1.3969,
856
+ "step": 121
857
+ },
858
+ {
859
+ "epoch": 0.13388203017832648,
860
+ "grad_norm": 0.3292064368724823,
861
+ "learning_rate": 1.9457459926017264e-05,
862
+ "loss": 1.417,
863
+ "step": 122
864
+ },
865
+ {
866
+ "epoch": 0.13497942386831277,
867
+ "grad_norm": 0.3402523100376129,
868
+ "learning_rate": 1.9432799013563503e-05,
869
+ "loss": 1.3128,
870
+ "step": 123
871
+ },
872
+ {
873
+ "epoch": 0.13607681755829903,
874
+ "grad_norm": 0.3458561599254608,
875
+ "learning_rate": 1.9408138101109742e-05,
876
+ "loss": 1.3259,
877
+ "step": 124
878
+ },
879
+ {
880
+ "epoch": 0.13717421124828533,
881
+ "grad_norm": 0.3515847325325012,
882
+ "learning_rate": 1.938347718865598e-05,
883
+ "loss": 1.3399,
884
+ "step": 125
885
+ },
886
+ {
887
+ "epoch": 0.1382716049382716,
888
+ "grad_norm": 0.3250204622745514,
889
+ "learning_rate": 1.935881627620222e-05,
890
+ "loss": 1.325,
891
+ "step": 126
892
+ },
893
+ {
894
+ "epoch": 0.13936899862825788,
895
+ "grad_norm": 0.37784844636917114,
896
+ "learning_rate": 1.9334155363748462e-05,
897
+ "loss": 1.3475,
898
+ "step": 127
899
+ },
900
+ {
901
+ "epoch": 0.14046639231824418,
902
+ "grad_norm": 0.39530113339424133,
903
+ "learning_rate": 1.9309494451294698e-05,
904
+ "loss": 1.3755,
905
+ "step": 128
906
+ },
907
+ {
908
+ "epoch": 0.14156378600823044,
909
+ "grad_norm": 0.37158292531967163,
910
+ "learning_rate": 1.928483353884094e-05,
911
+ "loss": 1.3443,
912
+ "step": 129
913
+ },
914
+ {
915
+ "epoch": 0.14266117969821673,
916
+ "grad_norm": 0.43860819935798645,
917
+ "learning_rate": 1.9260172626387176e-05,
918
+ "loss": 1.3849,
919
+ "step": 130
920
+ },
921
+ {
922
+ "epoch": 0.14375857338820303,
923
+ "grad_norm": 0.43812137842178345,
924
+ "learning_rate": 1.9235511713933418e-05,
925
+ "loss": 1.335,
926
+ "step": 131
927
+ },
928
+ {
929
+ "epoch": 0.1448559670781893,
930
+ "grad_norm": 0.4332147538661957,
931
+ "learning_rate": 1.9210850801479657e-05,
932
+ "loss": 1.286,
933
+ "step": 132
934
+ },
935
+ {
936
+ "epoch": 0.14595336076817558,
937
+ "grad_norm": 0.3903179168701172,
938
+ "learning_rate": 1.9186189889025896e-05,
939
+ "loss": 1.3319,
940
+ "step": 133
941
+ },
942
+ {
943
+ "epoch": 0.14705075445816188,
944
+ "grad_norm": 0.4002543091773987,
945
+ "learning_rate": 1.9161528976572135e-05,
946
+ "loss": 1.384,
947
+ "step": 134
948
+ },
949
+ {
950
+ "epoch": 0.14814814814814814,
951
+ "grad_norm": 0.3899374008178711,
952
+ "learning_rate": 1.9136868064118374e-05,
953
+ "loss": 1.4108,
954
+ "step": 135
955
+ },
956
+ {
957
+ "epoch": 0.14924554183813443,
958
+ "grad_norm": 0.4650733172893524,
959
+ "learning_rate": 1.9112207151664612e-05,
960
+ "loss": 1.3531,
961
+ "step": 136
962
+ },
963
+ {
964
+ "epoch": 0.15034293552812072,
965
+ "grad_norm": 0.4140302538871765,
966
+ "learning_rate": 1.908754623921085e-05,
967
+ "loss": 1.3683,
968
+ "step": 137
969
+ },
970
+ {
971
+ "epoch": 0.151440329218107,
972
+ "grad_norm": 0.46601012349128723,
973
+ "learning_rate": 1.9062885326757094e-05,
974
+ "loss": 1.3669,
975
+ "step": 138
976
+ },
977
+ {
978
+ "epoch": 0.15253772290809328,
979
+ "grad_norm": 0.4634631276130676,
980
+ "learning_rate": 1.903822441430333e-05,
981
+ "loss": 1.3653,
982
+ "step": 139
983
+ },
984
+ {
985
+ "epoch": 0.15363511659807957,
986
+ "grad_norm": 0.49095258116722107,
987
+ "learning_rate": 1.901356350184957e-05,
988
+ "loss": 1.2959,
989
+ "step": 140
990
+ },
991
+ {
992
+ "epoch": 0.15473251028806584,
993
+ "grad_norm": 0.504693865776062,
994
+ "learning_rate": 1.898890258939581e-05,
995
+ "loss": 1.3398,
996
+ "step": 141
997
+ },
998
+ {
999
+ "epoch": 0.15582990397805213,
1000
+ "grad_norm": 0.4257521629333496,
1001
+ "learning_rate": 1.896424167694205e-05,
1002
+ "loss": 1.4279,
1003
+ "step": 142
1004
+ },
1005
+ {
1006
+ "epoch": 0.1569272976680384,
1007
+ "grad_norm": 0.4088304936885834,
1008
+ "learning_rate": 1.8939580764488288e-05,
1009
+ "loss": 1.349,
1010
+ "step": 143
1011
+ },
1012
+ {
1013
+ "epoch": 0.1580246913580247,
1014
+ "grad_norm": 0.46413642168045044,
1015
+ "learning_rate": 1.8914919852034527e-05,
1016
+ "loss": 1.225,
1017
+ "step": 144
1018
+ },
1019
+ {
1020
+ "epoch": 0.15912208504801098,
1021
+ "grad_norm": 0.42032694816589355,
1022
+ "learning_rate": 1.8890258939580766e-05,
1023
+ "loss": 1.3018,
1024
+ "step": 145
1025
+ },
1026
+ {
1027
+ "epoch": 0.16021947873799725,
1028
+ "grad_norm": 0.3075321614742279,
1029
+ "learning_rate": 1.8865598027127005e-05,
1030
+ "loss": 1.3478,
1031
+ "step": 146
1032
+ },
1033
+ {
1034
+ "epoch": 0.16131687242798354,
1035
+ "grad_norm": 0.2494765967130661,
1036
+ "learning_rate": 1.8840937114673244e-05,
1037
+ "loss": 1.2855,
1038
+ "step": 147
1039
+ },
1040
+ {
1041
+ "epoch": 0.16241426611796983,
1042
+ "grad_norm": 0.22226963937282562,
1043
+ "learning_rate": 1.8816276202219483e-05,
1044
+ "loss": 1.2765,
1045
+ "step": 148
1046
+ },
1047
+ {
1048
+ "epoch": 0.1635116598079561,
1049
+ "grad_norm": 0.19540823996067047,
1050
+ "learning_rate": 1.879161528976572e-05,
1051
+ "loss": 1.3324,
1052
+ "step": 149
1053
+ },
1054
+ {
1055
+ "epoch": 0.1646090534979424,
1056
+ "grad_norm": 0.1837640106678009,
1057
+ "learning_rate": 1.876695437731196e-05,
1058
+ "loss": 1.357,
1059
+ "step": 150
1060
+ },
1061
+ {
1062
+ "epoch": 0.16570644718792868,
1063
+ "grad_norm": 0.21854494512081146,
1064
+ "learning_rate": 1.87422934648582e-05,
1065
+ "loss": 1.3023,
1066
+ "step": 151
1067
+ },
1068
+ {
1069
+ "epoch": 0.16680384087791494,
1070
+ "grad_norm": 0.1981099396944046,
1071
+ "learning_rate": 1.8717632552404442e-05,
1072
+ "loss": 1.2727,
1073
+ "step": 152
1074
+ },
1075
+ {
1076
+ "epoch": 0.16790123456790124,
1077
+ "grad_norm": 0.1930709034204483,
1078
+ "learning_rate": 1.8692971639950677e-05,
1079
+ "loss": 1.309,
1080
+ "step": 153
1081
+ },
1082
+ {
1083
+ "epoch": 0.16899862825788753,
1084
+ "grad_norm": 0.196756511926651,
1085
+ "learning_rate": 1.866831072749692e-05,
1086
+ "loss": 1.3215,
1087
+ "step": 154
1088
+ },
1089
+ {
1090
+ "epoch": 0.1700960219478738,
1091
+ "grad_norm": 0.18768474459648132,
1092
+ "learning_rate": 1.864364981504316e-05,
1093
+ "loss": 1.3032,
1094
+ "step": 155
1095
+ },
1096
+ {
1097
+ "epoch": 0.17119341563786009,
1098
+ "grad_norm": 0.22273507714271545,
1099
+ "learning_rate": 1.8618988902589397e-05,
1100
+ "loss": 1.312,
1101
+ "step": 156
1102
+ },
1103
+ {
1104
+ "epoch": 0.17229080932784638,
1105
+ "grad_norm": 0.17846502363681793,
1106
+ "learning_rate": 1.8594327990135636e-05,
1107
+ "loss": 1.3079,
1108
+ "step": 157
1109
+ },
1110
+ {
1111
+ "epoch": 0.17338820301783264,
1112
+ "grad_norm": 0.18322448432445526,
1113
+ "learning_rate": 1.8569667077681875e-05,
1114
+ "loss": 1.3537,
1115
+ "step": 158
1116
+ },
1117
+ {
1118
+ "epoch": 0.17448559670781894,
1119
+ "grad_norm": 0.180659681558609,
1120
+ "learning_rate": 1.8545006165228114e-05,
1121
+ "loss": 1.32,
1122
+ "step": 159
1123
+ },
1124
+ {
1125
+ "epoch": 0.1755829903978052,
1126
+ "grad_norm": 0.23424433171749115,
1127
+ "learning_rate": 1.8520345252774353e-05,
1128
+ "loss": 1.3525,
1129
+ "step": 160
1130
+ },
1131
+ {
1132
+ "epoch": 0.1766803840877915,
1133
+ "grad_norm": 0.19839544594287872,
1134
+ "learning_rate": 1.8495684340320595e-05,
1135
+ "loss": 1.2567,
1136
+ "step": 161
1137
+ },
1138
+ {
1139
+ "epoch": 0.17777777777777778,
1140
+ "grad_norm": 0.1809428632259369,
1141
+ "learning_rate": 1.847102342786683e-05,
1142
+ "loss": 1.4336,
1143
+ "step": 162
1144
+ },
1145
+ {
1146
+ "epoch": 0.17887517146776405,
1147
+ "grad_norm": 0.18282510340213776,
1148
+ "learning_rate": 1.8446362515413073e-05,
1149
+ "loss": 1.3281,
1150
+ "step": 163
1151
+ },
1152
+ {
1153
+ "epoch": 0.17997256515775034,
1154
+ "grad_norm": 0.18329757452011108,
1155
+ "learning_rate": 1.8421701602959312e-05,
1156
+ "loss": 1.2928,
1157
+ "step": 164
1158
+ },
1159
+ {
1160
+ "epoch": 0.18106995884773663,
1161
+ "grad_norm": 0.1925593912601471,
1162
+ "learning_rate": 1.839704069050555e-05,
1163
+ "loss": 1.3299,
1164
+ "step": 165
1165
+ },
1166
+ {
1167
+ "epoch": 0.1821673525377229,
1168
+ "grad_norm": 0.19288820028305054,
1169
+ "learning_rate": 1.837237977805179e-05,
1170
+ "loss": 1.3251,
1171
+ "step": 166
1172
+ },
1173
+ {
1174
+ "epoch": 0.1832647462277092,
1175
+ "grad_norm": 0.18203546106815338,
1176
+ "learning_rate": 1.834771886559803e-05,
1177
+ "loss": 1.336,
1178
+ "step": 167
1179
+ },
1180
+ {
1181
+ "epoch": 0.18436213991769548,
1182
+ "grad_norm": 0.20783405005931854,
1183
+ "learning_rate": 1.8323057953144268e-05,
1184
+ "loss": 1.3015,
1185
+ "step": 168
1186
+ },
1187
+ {
1188
+ "epoch": 0.18545953360768175,
1189
+ "grad_norm": 0.16650988161563873,
1190
+ "learning_rate": 1.8298397040690507e-05,
1191
+ "loss": 1.3584,
1192
+ "step": 169
1193
+ },
1194
+ {
1195
+ "epoch": 0.18655692729766804,
1196
+ "grad_norm": 0.19607752561569214,
1197
+ "learning_rate": 1.8273736128236746e-05,
1198
+ "loss": 1.3404,
1199
+ "step": 170
1200
+ },
1201
+ {
1202
+ "epoch": 0.18765432098765433,
1203
+ "grad_norm": 0.1942966729402542,
1204
+ "learning_rate": 1.8249075215782984e-05,
1205
+ "loss": 1.3222,
1206
+ "step": 171
1207
+ },
1208
+ {
1209
+ "epoch": 0.1887517146776406,
1210
+ "grad_norm": 0.23658064007759094,
1211
+ "learning_rate": 1.8224414303329227e-05,
1212
+ "loss": 1.3226,
1213
+ "step": 172
1214
+ },
1215
+ {
1216
+ "epoch": 0.1898491083676269,
1217
+ "grad_norm": 0.1818239539861679,
1218
+ "learning_rate": 1.8199753390875462e-05,
1219
+ "loss": 1.378,
1220
+ "step": 173
1221
+ },
1222
+ {
1223
+ "epoch": 0.19094650205761318,
1224
+ "grad_norm": 0.1911785900592804,
1225
+ "learning_rate": 1.8175092478421705e-05,
1226
+ "loss": 1.4018,
1227
+ "step": 174
1228
+ },
1229
+ {
1230
+ "epoch": 0.19204389574759945,
1231
+ "grad_norm": 0.20029093325138092,
1232
+ "learning_rate": 1.8150431565967943e-05,
1233
+ "loss": 1.3754,
1234
+ "step": 175
1235
+ },
1236
+ {
1237
+ "epoch": 0.19314128943758574,
1238
+ "grad_norm": 0.17715874314308167,
1239
+ "learning_rate": 1.8125770653514182e-05,
1240
+ "loss": 1.2913,
1241
+ "step": 176
1242
+ },
1243
+ {
1244
+ "epoch": 0.194238683127572,
1245
+ "grad_norm": 0.19813261926174164,
1246
+ "learning_rate": 1.810110974106042e-05,
1247
+ "loss": 1.4297,
1248
+ "step": 177
1249
+ },
1250
+ {
1251
+ "epoch": 0.1953360768175583,
1252
+ "grad_norm": 0.18769969046115875,
1253
+ "learning_rate": 1.807644882860666e-05,
1254
+ "loss": 1.3742,
1255
+ "step": 178
1256
+ },
1257
+ {
1258
+ "epoch": 0.1964334705075446,
1259
+ "grad_norm": 0.19279271364212036,
1260
+ "learning_rate": 1.80517879161529e-05,
1261
+ "loss": 1.2959,
1262
+ "step": 179
1263
+ },
1264
+ {
1265
+ "epoch": 0.19753086419753085,
1266
+ "grad_norm": 0.18196497857570648,
1267
+ "learning_rate": 1.8027127003699138e-05,
1268
+ "loss": 1.257,
1269
+ "step": 180
1270
+ },
1271
+ {
1272
+ "epoch": 0.19862825788751715,
1273
+ "grad_norm": 0.18429674208164215,
1274
+ "learning_rate": 1.8002466091245377e-05,
1275
+ "loss": 1.3139,
1276
+ "step": 181
1277
+ },
1278
+ {
1279
+ "epoch": 0.19972565157750344,
1280
+ "grad_norm": 0.17658454179763794,
1281
+ "learning_rate": 1.7977805178791616e-05,
1282
+ "loss": 1.2467,
1283
+ "step": 182
1284
+ },
1285
+ {
1286
+ "epoch": 0.2008230452674897,
1287
+ "grad_norm": 0.18059629201889038,
1288
+ "learning_rate": 1.7953144266337855e-05,
1289
+ "loss": 1.3461,
1290
+ "step": 183
1291
+ },
1292
+ {
1293
+ "epoch": 0.201920438957476,
1294
+ "grad_norm": 0.17834338545799255,
1295
+ "learning_rate": 1.7928483353884094e-05,
1296
+ "loss": 1.3542,
1297
+ "step": 184
1298
+ },
1299
+ {
1300
+ "epoch": 0.2030178326474623,
1301
+ "grad_norm": 0.19244344532489777,
1302
+ "learning_rate": 1.7903822441430333e-05,
1303
+ "loss": 1.3086,
1304
+ "step": 185
1305
+ },
1306
+ {
1307
+ "epoch": 0.20411522633744855,
1308
+ "grad_norm": 0.19486865401268005,
1309
+ "learning_rate": 1.7879161528976575e-05,
1310
+ "loss": 1.2946,
1311
+ "step": 186
1312
+ },
1313
+ {
1314
+ "epoch": 0.20521262002743484,
1315
+ "grad_norm": 0.19326968491077423,
1316
+ "learning_rate": 1.785450061652281e-05,
1317
+ "loss": 1.2794,
1318
+ "step": 187
1319
+ },
1320
+ {
1321
+ "epoch": 0.20631001371742114,
1322
+ "grad_norm": 0.2171463817358017,
1323
+ "learning_rate": 1.7829839704069053e-05,
1324
+ "loss": 1.3009,
1325
+ "step": 188
1326
+ },
1327
+ {
1328
+ "epoch": 0.2074074074074074,
1329
+ "grad_norm": 0.2024574726819992,
1330
+ "learning_rate": 1.780517879161529e-05,
1331
+ "loss": 1.3694,
1332
+ "step": 189
1333
  }
1334
  ],
1335
  "logging_steps": 1,
1336
+ "max_steps": 911,
1337
  "num_input_tokens_seen": 0,
1338
  "num_train_epochs": 1,
1339
  "save_steps": 1,
 
1349
  "attributes": {}
1350
  }
1351
  },
1352
+ "total_flos": 1.1524363938073805e+17,
1353
  "train_batch_size": 4,
1354
  "trial_name": null,
1355
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13a31ab6ba72d5f444e77e773341db29fe2e54fcefb9ddcac5ddba8e0711e92e
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3925e6f4fb74ea74c296487c97f50eee65db504b312a1431f8d935775889ba02
3
  size 5240