{
  "best_metric": 1.5792005062103271,
  "best_model_checkpoint": "qlora_output/checkpoint-1200",
  "epoch": 1.4679393049437102,
  "eval_steps": 600,
  "global_step": 1500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03915810083210964,
      "grad_norm": 0.07319402694702148,
      "learning_rate": 3.555555555555556e-05,
      "loss": 2.4428,
      "step": 40
    },
    {
      "epoch": 0.07831620166421928,
      "grad_norm": 0.04330237954854965,
      "learning_rate": 7.111111111111112e-05,
      "loss": 2.268,
      "step": 80
    },
    {
      "epoch": 0.11747430249632893,
      "grad_norm": 0.05867455527186394,
      "learning_rate": 0.00010666666666666667,
      "loss": 2.1806,
      "step": 120
    },
    {
      "epoch": 0.15663240332843856,
      "grad_norm": 0.06936266273260117,
      "learning_rate": 0.00014222222222222224,
      "loss": 2.0778,
      "step": 160
    },
    {
      "epoch": 0.19579050416054822,
      "grad_norm": 0.08056484907865524,
      "learning_rate": 0.00017777777777777779,
      "loss": 2.0382,
      "step": 200
    },
    {
      "epoch": 0.23494860499265785,
      "grad_norm": 0.0779654011130333,
      "learning_rate": 0.0001999317060143023,
      "loss": 1.9227,
      "step": 240
    },
    {
      "epoch": 0.2741067058247675,
      "grad_norm": 0.11802724003791809,
      "learning_rate": 0.00019908312530915603,
      "loss": 1.9139,
      "step": 280
    },
    {
      "epoch": 0.3132648066568771,
      "grad_norm": 0.0852489247918129,
      "learning_rate": 0.00019727282722446047,
      "loss": 1.9423,
      "step": 320
    },
    {
      "epoch": 0.3524229074889868,
      "grad_norm": 0.1409972459077835,
      "learning_rate": 0.00019451838281608197,
      "loss": 1.8484,
      "step": 360
    },
    {
      "epoch": 0.39158100832109644,
      "grad_norm": 0.11129080504179001,
      "learning_rate": 0.00019084652718195238,
      "loss": 1.7694,
      "step": 400
    },
    {
      "epoch": 0.43073910915320607,
      "grad_norm": 0.10179898887872696,
      "learning_rate": 0.00018629289996673897,
      "loss": 1.8026,
      "step": 440
    },
    {
      "epoch": 0.4698972099853157,
      "grad_norm": 0.14124783873558044,
      "learning_rate": 0.00018090169943749476,
      "loss": 1.8217,
      "step": 480
    },
    {
      "epoch": 0.5090553108174254,
      "grad_norm": 0.16184218227863312,
      "learning_rate": 0.0001747252534878891,
      "loss": 1.7847,
      "step": 520
    },
    {
      "epoch": 0.548213411649535,
      "grad_norm": 0.11349498480558395,
      "learning_rate": 0.00016782351173492342,
      "loss": 1.6622,
      "step": 560
    },
    {
      "epoch": 0.5873715124816447,
      "grad_norm": 0.08884529024362564,
      "learning_rate": 0.00016026346363792567,
      "loss": 1.7633,
      "step": 600
    },
    {
      "epoch": 0.5873715124816447,
      "eval_loss": 1.6572695970535278,
      "eval_runtime": 1912.2507,
      "eval_samples_per_second": 1.425,
      "eval_steps_per_second": 0.713,
      "step": 600
    },
    {
      "epoch": 0.6265296133137542,
      "grad_norm": 0.09996389597654343,
      "learning_rate": 0.0001521184882876585,
      "loss": 1.6764,
      "step": 640
    },
    {
      "epoch": 0.6656877141458639,
      "grad_norm": 0.12769252061843872,
      "learning_rate": 0.00014346764217659653,
      "loss": 1.7871,
      "step": 680
    },
    {
      "epoch": 0.7048458149779736,
      "grad_norm": 0.13380451500415802,
      "learning_rate": 0.00013439489186339282,
      "loss": 1.7167,
      "step": 720
    },
    {
      "epoch": 0.7440039158100832,
      "grad_norm": 0.11822285503149033,
      "learning_rate": 0.0001249882989794231,
      "loss": 1.6789,
      "step": 760
    },
    {
      "epoch": 0.7831620166421929,
      "grad_norm": 0.12109290808439255,
      "learning_rate": 0.00011533916548786857,
      "loss": 1.583,
      "step": 800
    },
    {
      "epoch": 0.8223201174743024,
      "grad_norm": 0.12838001549243927,
      "learning_rate": 0.000105541147491597,
      "loss": 1.7412,
      "step": 840
    },
    {
      "epoch": 0.8614782183064121,
      "grad_norm": 0.16042716801166534,
      "learning_rate": 9.568934619137046e-05,
      "loss": 1.6519,
      "step": 880
    },
    {
      "epoch": 0.9006363191385218,
      "grad_norm": 0.1427149474620819,
      "learning_rate": 8.587938481769089e-05,
      "loss": 1.6598,
      "step": 920
    },
    {
      "epoch": 0.9397944199706314,
      "grad_norm": 0.118178591132164,
      "learning_rate": 7.620648049573815e-05,
      "loss": 1.7378,
      "step": 960
    },
    {
      "epoch": 0.9789525208027411,
      "grad_norm": 0.1253277212381363,
      "learning_rate": 6.676452005203406e-05,
      "loss": 1.6451,
      "step": 1000
    },
    {
      "epoch": 1.0176211453744493,
      "grad_norm": 0.15462452173233032,
      "learning_rate": 5.764514873320761e-05,
      "loss": 1.6475,
      "step": 1040
    },
    {
      "epoch": 1.056779246206559,
      "grad_norm": 0.106235072016716,
      "learning_rate": 4.893688068190932e-05,
      "loss": 1.6686,
      "step": 1080
    },
    {
      "epoch": 1.0959373470386686,
      "grad_norm": 0.09717393666505814,
      "learning_rate": 4.072423980374452e-05,
      "loss": 1.6824,
      "step": 1120
    },
    {
      "epoch": 1.1350954478707782,
      "grad_norm": 0.13711334764957428,
      "learning_rate": 3.308693936411421e-05,
      "loss": 1.6147,
      "step": 1160
    },
    {
      "epoch": 1.174253548702888,
      "grad_norm": 0.1265803724527359,
      "learning_rate": 2.6099108277934103e-05,
      "loss": 1.6174,
      "step": 1200
    },
    {
      "epoch": 1.174253548702888,
      "eval_loss": 1.5792005062103271,
      "eval_runtime": 1903.0333,
      "eval_samples_per_second": 1.432,
      "eval_steps_per_second": 0.716,
      "step": 1200
    },
    {
      "epoch": 1.2134116495349976,
      "grad_norm": 0.09578167647123337,
      "learning_rate": 1.982857160199334e-05,
      "loss": 1.6246,
      "step": 1240
    },
    {
      "epoch": 1.2525697503671072,
      "grad_norm": 0.14227357506752014,
      "learning_rate": 1.4336192213613742e-05,
      "loss": 1.5548,
      "step": 1280
    },
    {
      "epoch": 1.2917278511992167,
      "grad_norm": 0.1526080220937729,
      "learning_rate": 9.675280065387116e-06,
      "loss": 1.5454,
      "step": 1320
    },
    {
      "epoch": 1.3308859520313265,
      "grad_norm": 0.17356757819652557,
      "learning_rate": 5.891074749862857e-06,
      "loss": 1.5555,
      "step": 1360
    },
    {
      "epoch": 1.3700440528634361,
      "grad_norm": 0.1258653849363327,
      "learning_rate": 3.0203063964990617e-06,
      "loss": 1.5775,
      "step": 1400
    },
    {
      "epoch": 1.4092021536955457,
      "grad_norm": 0.12249883264303207,
      "learning_rate": 1.0908391628854041e-06,
      "loss": 1.5619,
      "step": 1440
    },
    {
      "epoch": 1.4483602545276555,
      "grad_norm": 0.1455027014017105,
      "learning_rate": 1.2140078057101266e-07,
      "loss": 1.5342,
      "step": 1480
    },
    {
      "epoch": 1.4679393049437102,
      "step": 1500,
      "total_flos": 1.105565365842985e+17,
      "train_loss": 1.763797264099121,
      "train_runtime": 30390.5778,
      "train_samples_per_second": 0.395,
      "train_steps_per_second": 0.049
    }
  ],
  "logging_steps": 40,
  "max_steps": 1500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 600,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.05
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.105565365842985e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}