{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 4390,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.002277904328018223, "grad_norm": 386.0, "learning_rate": 4.5558086560364467e-07, "loss": 47.0249, "step": 1 },
    { "epoch": 0.011389521640091117, "grad_norm": 430.0, "learning_rate": 2.2779043280182233e-06, "loss": 48.5743, "step": 5 },
    { "epoch": 0.022779043280182234, "grad_norm": 324.0, "learning_rate": 4.555808656036447e-06, "loss": 47.0262, "step": 10 },
    { "epoch": 0.03416856492027335, "grad_norm": 178.0, "learning_rate": 6.83371298405467e-06, "loss": 40.673, "step": 15 },
    { "epoch": 0.04555808656036447, "grad_norm": 125.5, "learning_rate": 9.111617312072893e-06, "loss": 35.8229, "step": 20 },
    { "epoch": 0.05694760820045558, "grad_norm": 76.5, "learning_rate": 1.1389521640091117e-05, "loss": 29.5881, "step": 25 },
    { "epoch": 0.0683371298405467, "grad_norm": 24.75, "learning_rate": 1.366742596810934e-05, "loss": 26.1961, "step": 30 },
    { "epoch": 0.07972665148063782, "grad_norm": 20.25, "learning_rate": 1.5945330296127563e-05, "loss": 24.1324, "step": 35 },
    { "epoch": 0.09111617312072894, "grad_norm": 14.75, "learning_rate": 1.8223234624145787e-05, "loss": 22.6848, "step": 40 },
    { "epoch": 0.10250569476082004, "grad_norm": 9.375, "learning_rate": 2.050113895216401e-05, "loss": 21.496, "step": 45 },
    { "epoch": 0.11389521640091116, "grad_norm": 6.09375, "learning_rate": 2.2779043280182233e-05, "loss": 20.4174, "step": 50 },
    { "epoch": 0.1252847380410023, "grad_norm": 4.6875, "learning_rate": 2.505694760820046e-05, "loss": 19.6892, "step": 55 },
    { "epoch": 0.1366742596810934, "grad_norm": 4.25, "learning_rate": 2.733485193621868e-05, "loss": 18.9022, "step": 60 },
    { "epoch": 0.1480637813211845, "grad_norm": 4.71875, "learning_rate": 2.96127562642369e-05, "loss": 18.4344, "step": 65 },
    { "epoch": 0.15945330296127563, "grad_norm": 6.53125, "learning_rate": 3.189066059225513e-05, "loss": 18.1733, "step": 70 },
    { "epoch": 0.17084282460136674, "grad_norm": 7.34375, "learning_rate": 3.416856492027335e-05, "loss": 17.6679, "step": 75 },
    { "epoch": 0.18223234624145787, "grad_norm": 9.875, "learning_rate": 3.6446469248291574e-05, "loss": 16.8998, "step": 80 },
    { "epoch": 0.19362186788154898, "grad_norm": 14.125, "learning_rate": 3.87243735763098e-05, "loss": 15.9599, "step": 85 },
    { "epoch": 0.20501138952164008, "grad_norm": 23.625, "learning_rate": 4.100227790432802e-05, "loss": 14.2835, "step": 90 },
    { "epoch": 0.2164009111617312, "grad_norm": 29.875, "learning_rate": 4.3280182232346244e-05, "loss": 11.33, "step": 95 },
    { "epoch": 0.22779043280182232, "grad_norm": 29.0, "learning_rate": 4.555808656036447e-05, "loss": 7.3568, "step": 100 },
    { "epoch": 0.23917995444191345, "grad_norm": 15.375, "learning_rate": 4.783599088838269e-05, "loss": 3.988, "step": 105 },
    { "epoch": 0.2505694760820046, "grad_norm": 3.171875, "learning_rate": 5.011389521640092e-05, "loss": 2.5546, "step": 110 },
    { "epoch": 0.2619589977220957, "grad_norm": 2.265625, "learning_rate": 5.239179954441914e-05, "loss": 2.0898, "step": 115 },
    { "epoch": 0.2733485193621868, "grad_norm": 1.5390625, "learning_rate": 5.466970387243736e-05, "loss": 1.8634, "step": 120 },
    { "epoch": 0.2847380410022779, "grad_norm": 1.375, "learning_rate": 5.6947608200455584e-05, "loss": 1.7283, "step": 125 },
    { "epoch": 0.296127562642369, "grad_norm": 1.0625, "learning_rate": 5.92255125284738e-05, "loss": 1.652, "step": 130 },
    { "epoch": 0.30751708428246016, "grad_norm": 0.609375, "learning_rate": 6.150341685649203e-05, "loss": 1.5754, "step": 135 },
    { "epoch": 0.31890660592255127, "grad_norm": 0.796875, "learning_rate": 6.378132118451025e-05, "loss": 1.5234, "step": 140 },
    { "epoch": 0.33029612756264237, "grad_norm": 1.546875, "learning_rate": 6.605922551252848e-05, "loss": 1.461, "step": 145 },
    { "epoch": 0.3416856492027335, "grad_norm": 0.96484375, "learning_rate": 6.83371298405467e-05, "loss": 1.4367, "step": 150 },
    { "epoch": 0.3530751708428246, "grad_norm": 1.5078125, "learning_rate": 7.061503416856492e-05, "loss": 1.4151, "step": 155 },
    { "epoch": 0.36446469248291574, "grad_norm": 2.4375, "learning_rate": 7.289293849658315e-05, "loss": 1.3879, "step": 160 },
    { "epoch": 0.37585421412300685, "grad_norm": 1.1328125, "learning_rate": 7.517084282460137e-05, "loss": 1.3668, "step": 165 },
    { "epoch": 0.38724373576309795, "grad_norm": 2.109375, "learning_rate": 7.74487471526196e-05, "loss": 1.3422, "step": 170 },
    { "epoch": 0.39863325740318906, "grad_norm": 2.09375, "learning_rate": 7.972665148063782e-05, "loss": 1.3269, "step": 175 },
    { "epoch": 0.41002277904328016, "grad_norm": 0.57421875, "learning_rate": 8.200455580865604e-05, "loss": 1.3143, "step": 180 },
    { "epoch": 0.4214123006833713, "grad_norm": 3.1875, "learning_rate": 8.428246013667426e-05, "loss": 1.2994, "step": 185 },
    { "epoch": 0.4328018223234624, "grad_norm": 0.91796875, "learning_rate": 8.656036446469249e-05, "loss": 1.2815, "step": 190 },
    { "epoch": 0.44419134396355353, "grad_norm": 0.99609375, "learning_rate": 8.883826879271071e-05, "loss": 1.2721, "step": 195 },
    { "epoch": 0.45558086560364464, "grad_norm": 0.67578125, "learning_rate": 9.111617312072893e-05, "loss": 1.2607, "step": 200 },
    { "epoch": 0.46697038724373574, "grad_norm": 1.828125, "learning_rate": 9.339407744874716e-05, "loss": 1.2512, "step": 205 },
    { "epoch": 0.4783599088838269, "grad_norm": 3.0, "learning_rate": 9.567198177676538e-05, "loss": 1.2626, "step": 210 },
    { "epoch": 0.489749430523918, "grad_norm": 4.21875, "learning_rate": 9.79498861047836e-05, "loss": 1.2506, "step": 215 },
    { "epoch": 0.5011389521640092, "grad_norm": 2.125, "learning_rate": 0.00010022779043280184, "loss": 1.2473, "step": 220 },
    { "epoch": 0.5125284738041003, "grad_norm": 1.59375, "learning_rate": 0.00010250569476082006, "loss": 1.2319, "step": 225 },
    { "epoch": 0.5239179954441914, "grad_norm": 2.609375, "learning_rate": 0.00010478359908883827, "loss": 1.2246, "step": 230 },
    { "epoch": 0.5353075170842825, "grad_norm": 2.78125, "learning_rate": 0.0001070615034168565, "loss": 1.2092, "step": 235 },
    { "epoch": 0.5466970387243736, "grad_norm": 1.1015625, "learning_rate": 0.00010933940774487472, "loss": 1.1988, "step": 240 },
    { "epoch": 0.5580865603644647, "grad_norm": 2.875, "learning_rate": 0.00011161731207289294, "loss": 1.1988, "step": 245 },
    { "epoch": 0.5694760820045558, "grad_norm": 0.86328125, "learning_rate": 0.00011389521640091117, "loss": 1.1985, "step": 250 },
    { "epoch": 0.5808656036446469, "grad_norm": 2.078125, "learning_rate": 0.00011617312072892939, "loss": 1.197, "step": 255 },
    { "epoch": 0.592255125284738, "grad_norm": 3.921875, "learning_rate": 0.0001184510250569476, "loss": 1.1889, "step": 260 },
    { "epoch": 0.6036446469248291, "grad_norm": 1.578125, "learning_rate": 0.00012072892938496582, "loss": 1.1738, "step": 265 },
    { "epoch": 0.6150341685649203, "grad_norm": 1.234375, "learning_rate": 0.00012300683371298406, "loss": 1.1693, "step": 270 },
    { "epoch": 0.6264236902050114, "grad_norm": 2.578125, "learning_rate": 0.00012528473804100228, "loss": 1.1681, "step": 275 },
    { "epoch": 0.6378132118451025, "grad_norm": 7.84375, "learning_rate": 0.0001275626423690205, "loss": 1.1611, "step": 280 },
    { "epoch": 0.6492027334851936, "grad_norm": 0.9765625, "learning_rate": 0.00012984054669703873, "loss": 1.1557, "step": 285 },
    { "epoch": 0.6605922551252847, "grad_norm": 4.84375, "learning_rate": 0.00013211845102505695, "loss": 1.151, "step": 290 },
    { "epoch": 0.6719817767653758, "grad_norm": 2.171875, "learning_rate": 0.00013439635535307518, "loss": 1.1527, "step": 295 },
    { "epoch": 0.683371298405467, "grad_norm": 2.6875, "learning_rate": 0.0001366742596810934, "loss": 1.143, "step": 300 },
    { "epoch": 0.6947608200455581, "grad_norm": 2.421875, "learning_rate": 0.00013895216400911162, "loss": 1.1439, "step": 305 },
    { "epoch": 0.7061503416856492, "grad_norm": 0.9375, "learning_rate": 0.00014123006833712985, "loss": 1.1345, "step": 310 },
    { "epoch": 0.7175398633257403, "grad_norm": 7.6875, "learning_rate": 0.00014350797266514807, "loss": 1.125, "step": 315 },
    { "epoch": 0.7289293849658315, "grad_norm": 8.1875, "learning_rate": 0.0001457858769931663, "loss": 1.1364, "step": 320 },
    { "epoch": 0.7403189066059226, "grad_norm": 2.828125, "learning_rate": 0.00014806378132118452, "loss": 1.1339, "step": 325 },
    { "epoch": 0.7517084282460137, "grad_norm": 21.25, "learning_rate": 0.00015034168564920274, "loss": 1.1417, "step": 330 },
    { "epoch": 0.7630979498861048, "grad_norm": 4.375, "learning_rate": 0.00015261958997722096, "loss": 1.1475, "step": 335 },
    { "epoch": 0.7744874715261959, "grad_norm": 0.9609375, "learning_rate": 0.0001548974943052392, "loss": 1.1267, "step": 340 },
    { "epoch": 0.785876993166287, "grad_norm": 2.078125, "learning_rate": 0.0001571753986332574, "loss": 1.1184, "step": 345 },
    { "epoch": 0.7972665148063781, "grad_norm": 5.6875, "learning_rate": 0.00015945330296127563, "loss": 1.123, "step": 350 },
    { "epoch": 0.8086560364464692, "grad_norm": 0.7421875, "learning_rate": 0.00016173120728929386, "loss": 1.1052, "step": 355 },
    { "epoch": 0.8200455580865603, "grad_norm": 1.8125, "learning_rate": 0.00016400911161731208, "loss": 1.1108, "step": 360 },
    { "epoch": 0.8314350797266514, "grad_norm": 3.5625, "learning_rate": 0.0001662870159453303, "loss": 1.1204, "step": 365 },
    { "epoch": 0.8428246013667426, "grad_norm": 1.734375, "learning_rate": 0.00016856492027334853, "loss": 1.1326, "step": 370 },
    { "epoch": 0.8542141230068337, "grad_norm": 2.53125, "learning_rate": 0.00017084282460136675, "loss": 1.1137, "step": 375 },
    { "epoch": 0.8656036446469249, "grad_norm": 3.75, "learning_rate": 0.00017312072892938497, "loss": 1.1272, "step": 380 },
    { "epoch": 0.876993166287016, "grad_norm": 1.15625, "learning_rate": 0.0001753986332574032, "loss": 1.108, "step": 385 },
    { "epoch": 0.8883826879271071, "grad_norm": 3.65625, "learning_rate": 0.00017767653758542142, "loss": 1.1221, "step": 390 },
    { "epoch": 0.8997722095671982, "grad_norm": 1.09375, "learning_rate": 0.00017995444191343964, "loss": 1.0942, "step": 395 },
    { "epoch": 0.9111617312072893, "grad_norm": 0.79296875, "learning_rate": 0.00018223234624145787, "loss": 1.0881, "step": 400 },
    { "epoch": 0.9225512528473804, "grad_norm": 1.3671875, "learning_rate": 0.0001845102505694761, "loss": 1.0857, "step": 405 },
    { "epoch": 0.9339407744874715, "grad_norm": 1.875, "learning_rate": 0.00018678815489749431, "loss": 1.0843, "step": 410 },
    { "epoch": 0.9453302961275627, "grad_norm": 2.34375, "learning_rate": 0.00018906605922551254, "loss": 1.0782, "step": 415 },
    { "epoch": 0.9567198177676538, "grad_norm": 2.296875, "learning_rate": 0.00019134396355353076, "loss": 1.0762, "step": 420 },
    { "epoch": 0.9681093394077449, "grad_norm": 1.4296875, "learning_rate": 0.00019362186788154898, "loss": 1.0681, "step": 425 },
    { "epoch": 0.979498861047836, "grad_norm": 1.46875, "learning_rate": 0.0001958997722095672, "loss": 1.0742, "step": 430 },
    { "epoch": 0.9908883826879271, "grad_norm": 18.625, "learning_rate": 0.00019817767653758543, "loss": 1.0805, "step": 435 },
    { "epoch": 1.0, "eval_loss": 2.521019220352173, "eval_runtime": 0.2677, "eval_samples_per_second": 37.357, "eval_steps_per_second": 3.736, "step": 439 },
    { "epoch": 1.0022779043280183, "grad_norm": 6.6875, "learning_rate": 0.0001999999683877311, "loss": 1.1284, "step": 440 },
    { "epoch": 1.0136674259681093, "grad_norm": 1.2578125, "learning_rate": 0.0001999988619604182, "loss": 1.0864, "step": 445 },
    { "epoch": 1.0250569476082005, "grad_norm": 3.21875, "learning_rate": 0.00019999617493964692, "loss": 1.0719, "step": 450 },
    { "epoch": 1.0364464692482915, "grad_norm": 2.921875, "learning_rate": 0.00019999190736788865, "loss": 1.1239, "step": 455 },
    { "epoch": 1.0478359908883828, "grad_norm": 1.421875, "learning_rate": 0.0001999860593125971, "loss": 1.1026, "step": 460 },
    { "epoch": 1.0592255125284737, "grad_norm": 1.75, "learning_rate": 0.00019997863086620727, "loss": 1.0776, "step": 465 },
    { "epoch": 1.070615034168565, "grad_norm": 1.734375, "learning_rate": 0.0001999696221461341, "loss": 1.072, "step": 470 },
    { "epoch": 1.082004555808656, "grad_norm": 2.921875, "learning_rate": 0.0001999590332947704, "loss": 1.073, "step": 475 },
    { "epoch": 1.0933940774487472, "grad_norm": 4.46875, "learning_rate": 0.0001999468644794848, "loss": 1.0755, "step": 480 },
    { "epoch": 1.1047835990888384, "grad_norm": 14.125, "learning_rate": 0.00019993311589261897, "loss": 1.0603, "step": 485 },
    { "epoch": 1.1161731207289294, "grad_norm": 2.4375, "learning_rate": 0.00019991778775148465, "loss": 1.0802, "step": 490 },
    { "epoch": 1.1275626423690206, "grad_norm": 3.25, "learning_rate": 0.00019990088029836017, "loss": 1.0647, "step": 495 },
    { "epoch": 1.1389521640091116, "grad_norm": 3.9375, "learning_rate": 0.00019988239380048674, "loss": 1.062, "step": 500 },
    { "epoch": 1.1503416856492028, "grad_norm": 1.65625, "learning_rate": 0.000199862328550064, "loss": 1.0499, "step": 505 },
    { "epoch": 1.1617312072892938, "grad_norm": 1.875, "learning_rate": 0.00019984068486424557, "loss": 1.0475, "step": 510 },
    { "epoch": 1.173120728929385, "grad_norm": 3.859375, "learning_rate": 0.0001998174630851341, "loss": 1.0381, "step": 515 },
    { "epoch": 1.184510250569476, "grad_norm": 2.15625, "learning_rate": 0.00019979266357977564, "loss": 1.0617, "step": 520 },
    { "epoch": 1.1958997722095672, "grad_norm": 1.1171875, "learning_rate": 0.000199766286740154, "loss": 1.051, "step": 525 },
    { "epoch": 1.2072892938496582, "grad_norm": 3.09375, "learning_rate": 0.0001997383329831846, "loss": 1.0657, "step": 530 },
    { "epoch": 1.2186788154897494, "grad_norm": 1.796875, "learning_rate": 0.00019970880275070762, "loss": 1.0574, "step": 535 },
    { "epoch": 1.2300683371298406, "grad_norm": 1.4921875, "learning_rate": 0.00019967769650948135, "loss": 1.0469, "step": 540 },
    { "epoch": 1.2414578587699316, "grad_norm": 0.85546875, "learning_rate": 0.00019964501475117462, "loss": 1.0483, "step": 545 },
    { "epoch": 1.2528473804100229, "grad_norm": 8.3125, "learning_rate": 0.00019961075799235903, "loss": 1.0401, "step": 550 },
    { "epoch": 1.2642369020501139, "grad_norm": 1.3984375, "learning_rate": 0.0001995749267745008, "loss": 1.0436, "step": 555 },
    { "epoch": 1.275626423690205, "grad_norm": 1.140625, "learning_rate": 0.00019953752166395228, "loss": 1.0291, "step": 560 },
    { "epoch": 1.287015945330296, "grad_norm": 2.625, "learning_rate": 0.00019949854325194294, "loss": 1.0726, "step": 565 },
    { "epoch": 1.2984054669703873, "grad_norm": 1.640625, "learning_rate": 0.00019945799215456998, "loss": 1.0269, "step": 570 },
    { "epoch": 1.3097949886104785, "grad_norm": 2.59375, "learning_rate": 0.00019941586901278875, "loss": 1.0222, "step": 575 },
    { "epoch": 1.3211845102505695, "grad_norm": 1.6171875, "learning_rate": 0.0001993721744924024, "loss": 1.0212, "step": 580 },
    { "epoch": 1.3325740318906605, "grad_norm": 1.9609375, "learning_rate": 0.00019932690928405153, "loss": 1.0125, "step": 585 },
    { "epoch": 1.3439635535307517, "grad_norm": 1.3671875, "learning_rate": 0.00019928007410320323, "loss": 1.0043, "step": 590 },
    { "epoch": 1.355353075170843, "grad_norm": 2.078125, "learning_rate": 0.0001992316696901397, "loss": 1.0229, "step": 595 },
    { "epoch": 1.366742596810934, "grad_norm": 4.625, "learning_rate": 0.00019918169680994667, "loss": 1.0179, "step": 600 },
    { "epoch": 1.3781321184510251, "grad_norm": 2.6875, "learning_rate": 0.00019913015625250114, "loss": 1.0123, "step": 605 },
    { "epoch": 1.3895216400911161, "grad_norm": 0.8359375, "learning_rate": 0.00019907704883245916, "loss": 1.0104, "step": 610 },
    { "epoch": 1.4009111617312073, "grad_norm": 1.3984375, "learning_rate": 0.00019902237538924256, "loss": 1.0195, "step": 615 },
    { "epoch": 1.4123006833712983, "grad_norm": 2.078125, "learning_rate": 0.00019896613678702617, "loss": 1.0101, "step": 620 },
    { "epoch": 1.4236902050113895, "grad_norm": 1.796875, "learning_rate": 0.0001989083339147237, "loss": 1.0192, "step": 625 },
    { "epoch": 1.4350797266514808, "grad_norm": 0.90625, "learning_rate": 0.000198848967685974, "loss": 1.0109, "step": 630 },
    { "epoch": 1.4464692482915718, "grad_norm": 1.6640625, "learning_rate": 0.0001987880390391264, "loss": 1.0048, "step": 635 },
    { "epoch": 1.4578587699316627, "grad_norm": 2.125, "learning_rate": 0.00019872554893722618, "loss": 0.9957, "step": 640 },
    { "epoch": 1.469248291571754, "grad_norm": 2.875, "learning_rate": 0.00019866149836799896, "loss": 1.0112, "step": 645 },
    { "epoch": 1.4806378132118452, "grad_norm": 2.078125, "learning_rate": 0.0001985958883438354, "loss": 1.0237, "step": 650 },
    { "epoch": 1.4920273348519362, "grad_norm": 1.1875, "learning_rate": 0.00019852871990177503, "loss": 1.0246, "step": 655 },
    { "epoch": 1.5034168564920274, "grad_norm": 11.625, "learning_rate": 0.00019845999410349002, "loss": 1.0251, "step": 660 },
    { "epoch": 1.5148063781321186, "grad_norm": 2.5, "learning_rate": 0.00019838971203526808, "loss": 1.0168, "step": 665 },
    { "epoch": 1.5261958997722096, "grad_norm": 13.4375, "learning_rate": 0.00019831787480799568, "loss": 1.0081, "step": 670 },
    { "epoch": 1.5375854214123006, "grad_norm": 10.25, "learning_rate": 0.0001982444835571403, "loss": 1.0271, "step": 675 },
    { "epoch": 1.5489749430523918, "grad_norm": 2.515625, "learning_rate": 0.00019816953944273237, "loss": 1.0059, "step": 680 },
    { "epoch": 1.560364464692483, "grad_norm": 4.34375, "learning_rate": 0.0001980930436493472, "loss": 1.0287, "step": 685 },
    { "epoch": 1.571753986332574, "grad_norm": 4.875, "learning_rate": 0.00019801499738608604, "loss": 1.0337, "step": 690 },
    { "epoch": 1.583143507972665, "grad_norm": 0.95703125, "learning_rate": 0.00019793540188655704, "loss": 1.0124, "step": 695 },
    { "epoch": 1.5945330296127562, "grad_norm": 10.375, "learning_rate": 0.0001978542584088558, "loss": 1.0009, "step": 700 },
    { "epoch": 1.6059225512528474, "grad_norm": 0.8984375, "learning_rate": 0.00019777156823554544, "loss": 0.9936, "step": 705 },
    { "epoch": 1.6173120728929384, "grad_norm": 12.3125, "learning_rate": 0.00019768733267363624, "loss": 1.0044, "step": 710 },
    { "epoch": 1.6287015945330297, "grad_norm": 1.578125, "learning_rate": 0.0001976015530545652, "loss": 0.9959, "step": 715 },
    { "epoch": 1.6400911161731209, "grad_norm": 2.40625, "learning_rate": 0.00019751423073417475, "loss": 0.9893, "step": 720 },
    { "epoch": 1.6514806378132119, "grad_norm": 8.3125, "learning_rate": 0.0001974253670926915, "loss": 0.9896, "step": 725 },
    { "epoch": 1.6628701594533029, "grad_norm": 2.25, "learning_rate": 0.00019733496353470433, "loss": 0.9965, "step": 730 },
    { "epoch": 1.674259681093394, "grad_norm": 4.4375, "learning_rate": 0.00019724302148914222, "loss": 0.9817, "step": 735 },
    { "epoch": 1.6856492027334853, "grad_norm": 3.859375, "learning_rate": 0.00019714954240925172, "loss": 0.9811, "step": 740 },
    { "epoch": 1.6970387243735763, "grad_norm": 2.65625, "learning_rate": 0.00019705452777257377, "loss": 0.9798, "step": 745 },
    { "epoch": 1.7084282460136673, "grad_norm": 2.125, "learning_rate": 0.0001969579790809207, "loss": 0.994, "step": 750 },
    { "epoch": 1.7198177676537585, "grad_norm": 2.625, "learning_rate": 0.00019685989786035211, "loss": 0.9838, "step": 755 },
    { "epoch": 1.7312072892938497, "grad_norm": 2.4375, "learning_rate": 0.00019676028566115102, "loss": 0.9868, "step": 760 },
    { "epoch": 1.7425968109339407, "grad_norm": 0.890625, "learning_rate": 0.00019665914405779923, "loss": 0.9933, "step": 765 },
    { "epoch": 1.753986332574032, "grad_norm": 2.0, "learning_rate": 0.00019655647464895254, "loss": 0.9949, "step": 770 },
    { "epoch": 1.7653758542141231, "grad_norm": 1.5, "learning_rate": 0.00019645227905741534, "loss": 0.9871, "step": 775 },
    { "epoch": 1.7767653758542141, "grad_norm": 3.03125, "learning_rate": 0.00019634655893011513, "loss": 0.9855, "step": 780 },
    { "epoch": 1.7881548974943051, "grad_norm": 1.4453125, "learning_rate": 0.0001962393159380763, "loss": 1.0098, "step": 785 },
    { "epoch": 1.7995444191343963, "grad_norm": 2.359375, "learning_rate": 0.00019613055177639384, "loss": 0.9833, "step": 790 },
    { "epoch": 1.8109339407744875, "grad_norm": 1.140625, "learning_rate": 0.0001960202681642066, "loss": 0.9712, "step": 795 },
    { "epoch": 1.8223234624145785, "grad_norm": 8.9375, "learning_rate": 0.00019590846684466992, "loss": 0.9792, "step": 800 },
    { "epoch": 1.8337129840546698, "grad_norm": 12.25, "learning_rate": 0.00019579514958492826, "loss": 0.978, "step": 805 },
    { "epoch": 1.845102505694761, "grad_norm": 18.125, "learning_rate": 0.00019568031817608725, "loss": 0.9891, "step": 810 },
    { "epoch": 1.856492027334852, "grad_norm": 5.34375, "learning_rate": 0.00019556397443318523, "loss": 0.9708, "step": 815 },
    { "epoch": 1.867881548974943, "grad_norm": 5.46875, "learning_rate": 0.00019544612019516472, "loss": 0.9601, "step": 820 },
    { "epoch": 1.8792710706150342, "grad_norm": 2.28125, "learning_rate": 0.00019532675732484333, "loss": 1.0184, "step": 825 },
    { "epoch": 1.8906605922551254, "grad_norm": 1.203125, "learning_rate": 0.00019520588770888424, "loss": 0.9742, "step": 830 },
    { "epoch": 1.9020501138952164, "grad_norm": 9.75, "learning_rate": 0.00019508351325776642, "loss": 0.9652, "step": 835 },
    { "epoch": 1.9134396355353074, "grad_norm": 5.8125, "learning_rate": 0.00019495963590575443, "loss": 1.003, "step": 840 },
    { "epoch": 1.9248291571753986, "grad_norm": 8.6875, "learning_rate": 0.00019483425761086793, "loss": 0.9781, "step": 845 },
    { "epoch": 1.9362186788154898, "grad_norm": 7.34375, "learning_rate": 0.00019470738035485058, "loss": 0.9943, "step": 850 },
    { "epoch": 1.9476082004555808, "grad_norm": 1.4140625, "learning_rate": 0.0001945790061431388, "loss": 0.9864, "step": 855 },
    { "epoch": 1.958997722095672, "grad_norm": 1.9609375, "learning_rate": 0.00019444913700483008, "loss": 0.9491, "step": 860 },
    { "epoch": 1.9703872437357632, "grad_norm": 0.96484375, "learning_rate": 0.00019431777499265087, "loss": 0.9748, "step": 865 },
    { "epoch": 1.9817767653758542, "grad_norm": 8.0, "learning_rate": 0.0001941849221829242, "loss": 0.939, "step": 870 },
    { "epoch": 1.9931662870159452, "grad_norm": 1.1484375, "learning_rate": 0.00019405058067553676, "loss": 0.9397, "step": 875 },
    { "epoch": 2.0, "eval_loss": 2.4361355304718018, "eval_runtime": 0.2364, "eval_samples_per_second": 42.306, "eval_steps_per_second": 4.231, "step": 878 },
    { "epoch": 2.0045558086560367, "grad_norm": 12.625, "learning_rate": 0.00019391475259390584, "loss": 0.9262, "step": 880 },
    { "epoch": 2.0159453302961277, "grad_norm": 1.234375, "learning_rate": 0.00019377744008494555, "loss": 0.922, "step": 885 },
    { "epoch": 2.0273348519362187, "grad_norm": 1.3125, "learning_rate": 0.00019363864531903323, "loss": 0.9265, "step": 890 },
    { "epoch": 2.0387243735763096, "grad_norm": 1.0859375, "learning_rate": 0.00019349837048997478, "loss": 0.9572, "step": 895 },
    { "epoch": 2.050113895216401, "grad_norm": 1.484375, "learning_rate": 0.00019335661781497024, "loss": 0.9381, "step": 900 },
    { "epoch": 2.061503416856492, "grad_norm": 3.359375, "learning_rate": 0.00019321338953457858, "loss": 0.9678, "step": 905 },
    { "epoch": 2.072892938496583, "grad_norm": 1.2734375, "learning_rate": 0.0001930686879126824, "loss": 0.9557, "step": 910 },
    { "epoch": 2.084282460136674, "grad_norm": 8.0625, "learning_rate": 0.00019292251523645208, "loss": 0.9384, "step": 915 },
    { "epoch": 2.0956719817767655, "grad_norm": 1.8984375, "learning_rate": 0.00019277487381630975, "loss": 0.9409, "step": 920 },
    { "epoch": 2.1070615034168565, "grad_norm": 1.265625, "learning_rate": 0.0001926257659858925, "loss": 0.9319, "step": 925 },
    { "epoch": 2.1184510250569475, "grad_norm": 0.80859375, "learning_rate": 0.00019247519410201585, "loss": 0.9264, "step": 930 },
    { "epoch": 2.129840546697039, "grad_norm": 2.40625, "learning_rate": 0.00019232316054463617, "loss": 0.9069, "step": 935 },
    { "epoch": 2.14123006833713, "grad_norm": 0.8203125, "learning_rate": 0.0001921696677168133, "loss": 0.9139, "step": 940 },
    { "epoch": 2.152619589977221, "grad_norm": 0.8515625, "learning_rate": 0.00019201471804467245, "loss": 0.9152, "step": 945 },
    { "epoch": 2.164009111617312, "grad_norm": 0.62109375, "learning_rate": 0.00019185831397736583, "loss": 0.9038, "step": 950 },
    { "epoch": 2.1753986332574033, "grad_norm": 0.796875, "learning_rate": 0.00019170045798703406, "loss": 0.9073, "step": 955 },
    { "epoch": 2.1867881548974943, "grad_norm": 0.5625, "learning_rate": 0.00019154115256876702, "loss": 0.9026, "step": 960 },
    { "epoch": 2.1981776765375853, "grad_norm": 1.3203125, "learning_rate": 0.00019138040024056435, "loss": 0.8967, "step": 965 },
    { "epoch": 2.2095671981776768, "grad_norm": 0.8046875, "learning_rate": 0.00019121820354329577, "loss": 0.8996, "step": 970 },
    { "epoch": 2.2209567198177678, "grad_norm": 0.61328125, "learning_rate": 0.00019105456504066082, "loss": 0.9049, "step": 975 },
    { "epoch": 2.2323462414578588, "grad_norm": 2.0, "learning_rate": 0.0001908894873191484, "loss": 0.913, "step": 980 },
    { "epoch": 2.2437357630979498, "grad_norm": 0.5703125, "learning_rate": 0.00019072297298799589, "loss": 0.9099, "step": 985 },
    { "epoch": 2.255125284738041, "grad_norm": 0.5234375, "learning_rate": 0.00019055502467914788, "loss": 0.9016, "step": 990 },
    { "epoch": 2.266514806378132, "grad_norm": 4.5625, "learning_rate": 0.00019038564504721454, "loss": 0.89, "step": 995 },
    { "epoch": 2.277904328018223, "grad_norm": 0.53125, "learning_rate": 0.00019021483676942973, "loss": 0.9021, "step": 1000 },
    { "epoch": 2.289293849658314, "grad_norm": 0.73828125, "learning_rate": 0.00019004260254560867, "loss": 0.9142, "step": 1005 },
    { "epoch": 2.3006833712984056, "grad_norm": 0.7421875, "learning_rate": 0.00018986894509810513, "loss": 0.9014, "step": 1010 },
    { "epoch": 2.3120728929384966, "grad_norm": 1.5234375, "learning_rate": 0.0001896938671717687, "loss": 0.8982, "step": 1015 },
    { "epoch": 2.3234624145785876, "grad_norm": 0.546875, "learning_rate": 0.00018951737153390105, "loss": 0.9056, "step": 1020 },
    { "epoch": 2.334851936218679, "grad_norm": 1.0859375, "learning_rate": 0.00018933946097421248, "loss": 0.8869, "step": 1025 },
    { "epoch": 2.34624145785877, "grad_norm": 0.6015625, "learning_rate": 0.00018916013830477766, "loss": 0.8821, "step": 1030 },
    { "epoch": 2.357630979498861, "grad_norm": 0.51171875, "learning_rate": 0.00018897940635999118, "loss": 0.8865, "step": 1035 },
    { "epoch": 2.369020501138952, "grad_norm": 0.9765625, "learning_rate": 0.0001887972679965229, "loss": 0.8943, "step": 1040 },
    { "epoch": 2.3804100227790435, "grad_norm": 1.5859375, "learning_rate": 0.00018861372609327263, "loss": 0.8912, "step": 1045 },
    { "epoch": 2.3917995444191344, "grad_norm": 0.73828125, "learning_rate": 0.00018842878355132471, "loss": 0.8847, "step": 1050 },
    { "epoch": 2.4031890660592254, "grad_norm": 1.8203125, "learning_rate": 0.0001882424432939021, "loss": 0.8833, "step": 1055 },
    { "epoch": 2.4145785876993164, "grad_norm": 1.1015625, "learning_rate": 0.00018805470826632024, "loss": 0.8905, "step": 1060 },
    { "epoch": 2.425968109339408, "grad_norm": 0.515625, "learning_rate": 0.00018786558143594047, "loss": 0.8828, "step": 1065 },
    { "epoch": 2.437357630979499, "grad_norm": 0.7109375, "learning_rate": 0.00018767506579212313, "loss": 0.882, "step": 1070 },
    { "epoch": 2.44874715261959, "grad_norm": 0.482421875, "learning_rate": 0.0001874831643461803, "loss": 0.8863, "step": 1075 },
    { "epoch": 2.4601366742596813, "grad_norm": 0.4921875, "learning_rate": 0.00018728988013132819, "loss": 0.8787, "step": 1080 },
    { "epoch": 2.4715261958997723, "grad_norm": 0.54296875, "learning_rate": 0.0001870952162026392, "loss": 0.8755, "step": 1085 },
    { "epoch": 2.4829157175398633, "grad_norm": 0.71484375, "learning_rate": 0.0001868991756369937, "loss": 0.8839, "step": 1090 },
    { "epoch": 2.4943052391799543, "grad_norm": 1.0078125, "learning_rate": 0.00018670176153303127, "loss": 0.8818, "step": 1095 },
    { "epoch": 2.5056947608200457, "grad_norm": 0.4375, "learning_rate": 0.0001865029770111019, "loss": 0.8875, "step": 1100 },
    { "epoch": 2.5170842824601367, "grad_norm": 0.65234375, "learning_rate": 0.00018630282521321645, "loss": 0.8769, "step": 1105 },
    { "epoch": 2.5284738041002277, "grad_norm": 0.7421875, "learning_rate": 0.00018610130930299715, "loss": 0.8812, "step": 1110 },
    { "epoch": 2.5398633257403187, "grad_norm": 1.046875, "learning_rate": 0.00018589843246562756, "loss": 0.8837, "step": 1115 },
    { "epoch": 2.55125284738041, "grad_norm": 0.80078125, "learning_rate": 0.00018569419790780218, "loss": 0.8769, "step": 1120 },
    { "epoch": 2.562642369020501, "grad_norm": 0.55078125, "learning_rate": 0.00018548860885767582, "loss": 0.8782, "step": 1125 },
    { "epoch": 2.574031890660592, "grad_norm": 7.65625, "learning_rate": 0.00018528166856481254, "loss": 0.8822, "step": 1130 },
    { "epoch": 2.5854214123006836, "grad_norm": 0.75390625, "learning_rate": 0.00018507338030013427, "loss": 0.8677, "step": 1135 },
    { "epoch": 2.5968109339407746, "grad_norm": 0.6640625, "learning_rate": 0.0001848637473558692, "loss": 0.8775, "step": 1140 },
    { "epoch": 2.6082004555808656, "grad_norm": 1.578125, "learning_rate": 0.00018465277304549962, "loss": 0.8699, "step": 1145 },
    { "epoch": 2.619589977220957, "grad_norm": 0.578125, "learning_rate": 0.00018444046070370963, "loss": 0.8709, "step": 1150 },
    { "epoch": 2.630979498861048, "grad_norm": 0.53125, "learning_rate": 0.00018422681368633238, "loss": 0.8821, "step": 1155 },
    { "epoch": 2.642369020501139, "grad_norm": 1.3203125, "learning_rate": 0.00018401183537029714, "loss": 0.8807, "step": 1160 },
    { "epoch": 2.65375854214123, "grad_norm": 0.4296875, "learning_rate": 0.00018379552915357575, "loss": 0.8786, "step": 1165 },
    { "epoch": 2.665148063781321, "grad_norm": 0.404296875, "learning_rate": 0.00018357789845512901, "loss": 0.8744, "step": 1170 },
    { "epoch": 2.6765375854214124, "grad_norm": 0.72265625, "learning_rate": 0.0001833589467148527, "loss": 0.8649, "step": 1175 },
    { "epoch": 2.6879271070615034, "grad_norm": 0.76953125, "learning_rate": 0.00018313867739352304, "loss": 0.8716, "step": 1180 },
    { "epoch": 2.6993166287015944, "grad_norm": 0.59375, "learning_rate": 0.00018291709397274218, "loss": 0.8658, "step": 1185 },
    { "epoch": 2.710706150341686, "grad_norm": 0.55078125, "learning_rate": 0.00018269419995488298, "loss": 0.874, "step": 1190 },
    { "epoch": 2.722095671981777, "grad_norm": 1.0703125, "learning_rate": 0.00018246999886303383, "loss": 0.8752, "step": 1195 },
    { "epoch": 2.733485193621868, "grad_norm": 0.5625, "learning_rate": 0.00018224449424094288, "loss": 0.8665, "step": 1200 },
    { "epoch": 2.7448747152619593, "grad_norm": 1.78125, "learning_rate": 0.00018201768965296194, "loss": 0.866, "step": 1205 },
    { "epoch": 2.7562642369020502, "grad_norm": 0.7578125, "learning_rate": 0.00018178958868399033, "loss": 0.8602, "step": 1210 },
    { "epoch": 2.7676537585421412, "grad_norm": 0.423828125, "learning_rate": 0.00018156019493941803, "loss": 0.8618, "step": 1215 },
    { "epoch": 2.7790432801822322, "grad_norm": 0.458984375, "learning_rate": 0.00018132951204506887, "loss": 0.8658, "step": 1220 },
    { "epoch": 2.7904328018223232, "grad_norm": 0.59765625, "learning_rate": 0.00018109754364714305, "loss": 0.8646, "step": 1225 },
    { "epoch": 2.8018223234624147, "grad_norm": 0.376953125, "learning_rate": 0.0001808642934121597, "loss": 0.8627, "step": 1230 },
    { "epoch": 2.8132118451025057, "grad_norm": 0.66015625, "learning_rate": 0.00018062976502689862, "loss": 0.8639, "step": 1235 },
    { "epoch": 2.8246013667425967, "grad_norm": 0.52734375, "learning_rate": 0.00018039396219834237, "loss": 0.8592, "step": 1240 },
    { "epoch": 2.835990888382688, "grad_norm": 0.671875, "learning_rate": 0.0001801568886536174, "loss": 0.8575, "step": 1245 },
    { "epoch": 2.847380410022779, "grad_norm": 0.455078125, "learning_rate": 0.0001799185481399354, "loss": 0.8581, "step": 1250 },
    { "epoch": 2.85876993166287, "grad_norm": 2.40625, "learning_rate": 0.0001796789444245337, "loss": 0.8682, "step": 1255 },
    { "epoch": 2.8701594533029615, "grad_norm": 0.8125, "learning_rate": 0.0001794380812946161, "loss": 0.8758, "step": 1260 },
    { "epoch": 2.8815489749430525, "grad_norm": 10.5625, "learning_rate": 0.00017919596255729285, "loss": 0.8691, "step": 1265 },
    { "epoch": 2.8929384965831435, "grad_norm": 1.3671875, "learning_rate": 0.00017895259203952032, "loss": 0.8629, "step": 1270 },
    { "epoch": 2.9043280182232345, "grad_norm": 0.65234375, "learning_rate": 0.00017870797358804084, "loss": 0.8665, "step": 1275 },
    { "epoch": 2.9157175398633255, "grad_norm": 0.67578125, "learning_rate": 0.00017846211106932165, "loss": 0.8631, "step": 1280 },
    { "epoch": 2.927107061503417, "grad_norm": 0.61328125, "learning_rate": 0.00017821500836949386, "loss": 0.8555, "step": 1285 },
    { "epoch": 2.938496583143508, "grad_norm": 0.7109375, "learning_rate": 0.000177966669394291, "loss": 0.8555, "step": 1290 },
    { "epoch": 2.949886104783599, "grad_norm": 0.59375, "learning_rate": 0.00017771709806898732, "loss": 0.8577, "step": 1295 },
    { "epoch": 2.9612756264236904, "grad_norm": 0.6328125, "learning_rate": 0.00017746629833833585, "loss": 0.8506, "step": 1300 },
    { "epoch": 2.9726651480637813, "grad_norm": 0.53125, "learning_rate": 0.00017721427416650577, "loss": 0.8503, "step": 1305 },
    { "epoch": 2.9840546697038723, "grad_norm": 1.3671875, "learning_rate": 0.00017696102953702, "loss": 0.8513, "step": 1310 },
    { "epoch": 2.995444191343964, "grad_norm": 0.4609375, "learning_rate": 0.00017670656845269214, "loss": 0.8628, "step": 1315 },
    { "epoch": 3.0, "eval_loss": 2.405621290206909, "eval_runtime": 0.2428, "eval_samples_per_second": 41.192, "eval_steps_per_second": 4.119, "step": 1317 },
    { "epoch": 3.0068337129840548, "grad_norm": 1.203125, "learning_rate": 0.00017645089493556322, "loss": 0.8368, "step": 1320 },
    { "epoch": 3.0182232346241458, "grad_norm": 2.765625, "learning_rate": 0.0001761940130268381, "loss": 0.8331, "step": 1325 },
    { "epoch": 3.0296127562642368, "grad_norm": 0.61328125, "learning_rate": 0.00017593592678682166, "loss": 0.8446, "step": 1330 },
    { "epoch": 3.041002277904328, "grad_norm": 0.72265625, "learning_rate": 0.0001756766402948545, "loss": 0.8303, "step": 1335 },
    { "epoch": 3.052391799544419, "grad_norm": 0.9140625, "learning_rate": 0.00017541615764924868, "loss": 0.8381, "step": 1340 },
    { "epoch": 3.06378132118451, "grad_norm": 0.7109375, "learning_rate": 0.00017515448296722262, "loss": 0.8353, "step": 1345 },
    { "epoch": 3.075170842824601, "grad_norm": 0.703125, "learning_rate": 0.00017489162038483637, "loss": 0.836, "step": 1350 },
    { "epoch": 3.0865603644646926, "grad_norm": 0.71484375, "learning_rate": 0.00017462757405692597, "loss": 0.8187, "step": 1355 },
    { "epoch": 3.0979498861047836, "grad_norm": 0.41796875, "learning_rate": 0.00017436234815703788, "loss": 0.8301, "step": 1360 },
    { "epoch": 3.1093394077448746, "grad_norm": 0.625, "learning_rate": 0.000174095946877363, "loss": 0.8356, "step": 1365 },
    { "epoch": 3.120728929384966, "grad_norm": 0.427734375, "learning_rate": 0.00017382837442867055, "loss": 0.824, "step": 1370 },
    { "epoch": 3.132118451025057, "grad_norm": 1.3984375, "learning_rate": 0.00017355963504024123, "loss": 0.8378, "step": 1375 },
    { "epoch": 3.143507972665148, "grad_norm": 2.6875, "learning_rate": 0.00017328973295980052, "loss": 0.8334, "step": 1380 },
    { "epoch": 3.154897494305239, "grad_norm": 0.439453125, "learning_rate": 0.00017301867245345172, "loss": 0.8412, "step": 1385 },
    { "epoch": 3.1662870159453305, "grad_norm": 0.54296875, "learning_rate": 0.0001727464578056081, "loss": 0.8302, "step": 1390 },
    { "epoch": 3.1776765375854215, "grad_norm": 1.3046875, "learning_rate": 0.0001724730933189256, "loss": 0.8294, "step": 1395 },
    { "epoch": 3.1890660592255125, "grad_norm": 0.69140625, "learning_rate": 0.0001721985833142346, "loss": 0.8354, "step": 1400 },
    { "epoch": 3.2004555808656034, "grad_norm": 0.9140625, "learning_rate": 0.0001719229321304716, "loss": 0.8379, "step": 1405 },
    { "epoch": 3.211845102505695, "grad_norm": 0.5234375, "learning_rate": 0.00017164614412461084, "loss": 0.8245, "step": 1410 },
    { "epoch": 3.223234624145786, "grad_norm": 0.4921875, "learning_rate": 0.00017136822367159516, "loss": 0.8202, "step": 1415 },
    { "epoch": 3.234624145785877, "grad_norm": 0.458984375, "learning_rate": 0.00017108917516426704, "loss": 0.822, "step": 1420 },
    { "epoch": 3.2460136674259683, "grad_norm": 0.486328125, "learning_rate": 0.0001708090030132992, "loss": 0.8299, "step": 1425 },
    { "epoch": 3.2574031890660593, "grad_norm": 0.66015625, "learning_rate": 0.00017052771164712465, "loss": 0.8365, "step": 1430 },
    { "epoch": 3.2687927107061503, "grad_norm": 0.75390625, "learning_rate": 0.00017024530551186702, "loss": 0.8254, "step": 1435 },
    { "epoch": 3.2801822323462413, "grad_norm": 1.1328125, "learning_rate": 0.0001699617890712699, "loss": 0.8257, "step": 1440 },
    { "epoch": 3.2915717539863327, "grad_norm": 0.458984375, "learning_rate": 0.00016967716680662667, "loss": 0.8315, "step": 1445 },
    { "epoch": 3.3029612756264237, "grad_norm": 0.484375, "learning_rate": 0.0001693914432167094, "loss": 0.8265, "step": 1450 },
    { "epoch": 3.3143507972665147, "grad_norm": 0.447265625, "learning_rate": 0.00016910462281769783, "loss": 0.8228, "step": 1455 },
    { "epoch": 3.3257403189066057, "grad_norm": 0.453125, "learning_rate": 0.0001688167101431081, "loss": 0.8262, "step": 1460 },
    { "epoch": 3.337129840546697, "grad_norm": 1.9375, "learning_rate": 0.0001685277097437208, "loss": 0.8284, "step": 1465 },
    { "epoch": 3.348519362186788, "grad_norm": 0.7578125, "learning_rate": 0.00016823762618750938, "loss": 0.8327, "step": 1470 },
    { "epoch": 3.359908883826879, "grad_norm": 0.6015625, "learning_rate": 0.00016794646405956774, "loss": 0.8211, "step": 1475 },
    { "epoch": 3.3712984054669706, "grad_norm": 0.5625, "learning_rate": 0.0001676542279620378, "loss": 0.8266, "step": 1480 },
    { "epoch": 3.3826879271070616, "grad_norm": 0.546875, "learning_rate": 0.00016736092251403673, "loss": 0.8247, "step": 1485 },
    { "epoch": 3.3940774487471526, "grad_norm": 0.40234375, "learning_rate": 0.00016706655235158407, "loss": 0.8243, "step": 1490 },
    { "epoch": 3.4054669703872436, "grad_norm": 1.015625, "learning_rate": 0.00016677112212752824, "loss": 0.8186, "step": 1495 },
    { "epoch": 3.416856492027335, "grad_norm": 0.490234375, "learning_rate": 0.0001664746365114732, "loss": 0.8274, "step": 1500 },
    { "epoch": 3.428246013667426, "grad_norm": 0.88671875, "learning_rate": 0.00016617710018970453, "loss": 0.8175, "step": 1505 },
    { "epoch": 3.439635535307517, "grad_norm": 0.36328125, "learning_rate": 0.00016587851786511543, "loss": 0.8212, "step": 1510 },
    { "epoch": 3.451025056947608, "grad_norm": 0.5234375, "learning_rate": 0.00016557889425713226, "loss": 0.8185, "step": 1515 },
    { "epoch": 3.4624145785876994, "grad_norm": 0.515625, "learning_rate": 0.0001652782341016401, "loss": 0.8175, "step": 1520 },
    { "epoch": 3.4738041002277904, "grad_norm": 0.46875, "learning_rate": 0.00016497654215090772, "loss": 0.8192, "step": 1525 },
    { "epoch": 3.4851936218678814, "grad_norm": 0.40625, "learning_rate": 0.00016467382317351267, "loss": 0.8139, "step": 1530 },
    { "epoch": 3.496583143507973, "grad_norm": 0.453125, "learning_rate": 0.00016437008195426578, "loss": 0.8217, "step": 1535 },
    { "epoch": 3.507972665148064, "grad_norm": 0.6640625, "learning_rate": 0.00016406532329413546, "loss": 0.8182, "step": 1540 },
    { "epoch": 3.519362186788155, "grad_norm": 0.7265625, "learning_rate": 0.000163759552010172, "loss": 0.8187, "step": 1545 },
    { "epoch": 3.5307517084282463, "grad_norm": 0.97265625, "learning_rate": 0.00016345277293543136, "loss": 0.8114, "step": 1550 },
    { "epoch": 3.5421412300683373, "grad_norm": 0.4921875, "learning_rate": 0.0001631449909188987, "loss": 0.8195, "step": 1555 },
    { "epoch": 3.5535307517084282, "grad_norm": 0.51953125, "learning_rate": 0.00016283621082541173, "loss": 0.8122, "step": 1560 },
    { "epoch": 3.5649202733485192, "grad_norm": 0.4375, "learning_rate": 0.000162526437535584, "loss": 0.8166, "step": 1565 },
    { "epoch": 3.5763097949886102, "grad_norm": 0.494140625, "learning_rate": 0.00016221567594572762, "loss": 0.8194, "step": 1570 },
    { "epoch": 3.5876993166287017, "grad_norm": 0.53125, "learning_rate": 0.0001619039309677758, "loss": 0.82, "step": 1575 },
    { "epoch": 3.5990888382687927, "grad_norm": 0.4921875, "learning_rate": 0.0001615912075292054, "loss": 0.8184, "step": 1580 },
    { "epoch": 3.6104783599088837, "grad_norm": 0.7578125, "learning_rate": 0.0001612775105729588, "loss": 0.8138, "step": 1585 },
    { "epoch": 3.621867881548975, "grad_norm": 0.380859375, "learning_rate": 0.0001609628450573661, "loss": 0.8121, "step": 1590 },
    { "epoch": 3.633257403189066, "grad_norm": 0.5859375, "learning_rate": 0.00016064721595606635, "loss": 0.8157, "step": 1595 },
    { "epoch": 3.644646924829157, "grad_norm": 0.66015625, "learning_rate": 0.00016033062825792935, "loss": 0.8063, "step": 1600 },
    { "epoch": 3.6560364464692485, "grad_norm": 0.388671875, "learning_rate": 0.00016001308696697643, "loss": 0.8207, "step": 1605 },
    { "epoch": 3.6674259681093395, "grad_norm": 0.48046875, "learning_rate": 0.00015969459710230162, "loss": 0.8067, "step": 1610 },
    { "epoch": 3.6788154897494305, "grad_norm": 0.5234375, "learning_rate": 0.00015937516369799216, "loss": 0.8105, "step": 1615 },
    { "epoch": 3.6902050113895215, "grad_norm": 0.58984375, "learning_rate": 0.00015905479180304896, "loss": 0.8212, "step": 1620 },
    { "epoch": 3.7015945330296125, "grad_norm": 0.71875, "learning_rate": 0.00015873348648130694, "loss": 0.8089, "step": 1625 },
    { "epoch": 3.712984054669704, "grad_norm": 0.55078125, "learning_rate": 0.00015841125281135473, "loss": 0.8173, "step": 1630 },
    { "epoch": 3.724373576309795, "grad_norm": 1.46875, "learning_rate": 0.00015808809588645467, "loss": 0.8156, "step": 1635 },
    { "epoch": 3.735763097949886, "grad_norm": 1.203125, "learning_rate": 0.00015776402081446204, "loss": 0.8083, "step": 1640 },
    { "epoch": 3.7471526195899774, "grad_norm": 0.392578125, "learning_rate": 0.00015743903271774455, "loss": 0.8165, "step": 1645 },
    { "epoch": 3.7585421412300684, "grad_norm": 0.66015625, "learning_rate": 0.00015711313673310125, "loss": 0.8237, "step": 1650 },
    { "epoch": 3.7699316628701594, "grad_norm": 0.390625, "learning_rate": 0.00015678633801168137, "loss": 0.8177, "step": 1655 },
    { "epoch": 3.781321184510251, "grad_norm": 0.482421875, "learning_rate": 0.00015645864171890295, "loss": 0.8086, "step": 1660 },
    { "epoch": 3.792710706150342, "grad_norm": 0.51953125, "learning_rate": 0.00015613005303437104, "loss": 0.8118, "step": 1665 },
    { "epoch": 3.8041002277904328, "grad_norm": 0.390625, "learning_rate": 0.00015580057715179605, "loss": 0.8208, "step": 1670 },
    { "epoch": 3.8154897494305238, "grad_norm": 0.48046875, "learning_rate": 0.00015547021927891144, "loss": 0.8076, "step": 1675 },
    { "epoch": 3.8268792710706148, "grad_norm": 0.546875, "learning_rate": 0.0001551389846373916, "loss": 0.8137, "step": 1680 },
    { "epoch": 3.838268792710706, "grad_norm": 0.62890625, "learning_rate": 0.00015480687846276917, "loss": 0.8117, "step": 1685 },
    { "epoch": 3.849658314350797, "grad_norm": 0.44140625, "learning_rate": 0.00015447390600435238, "loss": 0.8035, "step": 1690 },
    { "epoch": 3.861047835990888, "grad_norm": 0.75390625, "learning_rate": 0.00015414007252514202, "loss": 0.8105, "step": 1695 },
    { "epoch": 3.8724373576309796, "grad_norm": 0.4453125, "learning_rate": 0.00015380538330174827, "loss": 0.8043, "step": 1700 },
    { "epoch": 3.8838268792710706, "grad_norm": 0.5546875, "learning_rate": 0.0001534698436243073, "loss": 0.8124, "step": 1705 },
    { "epoch": 3.8952164009111616, "grad_norm": 0.53125, "learning_rate": 0.00015313345879639764, "loss": 0.8198, "step": 1710 },
    { "epoch": 3.906605922551253, "grad_norm": 0.625, "learning_rate": 0.00015279623413495642, "loss": 0.8057, "step": 1715 },
    { "epoch": 3.917995444191344, "grad_norm": 0.462890625, "learning_rate": 0.00015245817497019524, "loss": 0.806, "step": 1720 },
    { "epoch": 3.929384965831435, "grad_norm": 0.55078125, "learning_rate": 0.00015211928664551593, "loss": 0.8033, "step": 1725 },
    { "epoch": 3.940774487471526, "grad_norm": 0.62890625, "learning_rate": 0.00015177957451742612, "loss": 0.8137, "step": 1730 },
    { "epoch": 3.9521640091116175, "grad_norm": 0.376953125, "learning_rate": 0.00015143904395545466, "loss": 0.8075, "step": 1735 },
    { "epoch": 3.9635535307517085, "grad_norm": 0.62109375, "learning_rate": 0.0001510977003420665, "loss": 0.8065, "step": 1740 },
    { "epoch": 3.9749430523917995, "grad_norm": 0.37109375, "learning_rate": 0.00015075554907257796, "loss": 0.8129, "step": 1745 },
    { "epoch": 3.9863325740318905, "grad_norm": 0.7578125, "learning_rate": 0.00015041259555507108, "loss": 0.8064, "step": 1750 },
    { "epoch": 3.997722095671982, "grad_norm": 0.46484375, "learning_rate": 0.00015006884521030848, "loss": 0.8131, "step": 1755 },
    { "epoch": 4.0, "eval_loss": 2.417691946029663, "eval_runtime": 0.2354, "eval_samples_per_second": 42.48, "eval_steps_per_second": 4.248, "step": 1756 },
    { "epoch": 4.009111617312073, "grad_norm": 0.7734375, "learning_rate": 0.00014972430347164742, "loss": 0.7909, "step": 1760 },
    { "epoch": 4.020501138952164, "grad_norm": 0.7265625, "learning_rate": 0.0001493789757849541, "loss": 0.7848, "step": 1765 },
    { "epoch": 4.031890660592255, "grad_norm": 0.5859375, "learning_rate": 0.00014903286760851737, "loss": 0.7893, "step": 1770 },
    { "epoch": 4.043280182232346, "grad_norm": 0.43359375, "learning_rate": 0.0001486859844129628, "loss": 0.7902, "step": 1775 },
    { "epoch": 4.054669703872437, "grad_norm": 0.384765625, "learning_rate": 0.00014833833168116582, "loss": 0.7859, "step": 1780 },
    { "epoch": 4.066059225512529, "grad_norm": 0.48046875, "learning_rate": 0.00014798991490816532, "loss": 0.782, "step": 1785 },
    { "epoch": 4.077448747152619, "grad_norm": 0.4921875, "learning_rate": 0.00014764073960107666, "loss": 0.793, "step": 1790 },
    { "epoch": 4.088838268792711, "grad_norm": 0.490234375, "learning_rate": 0.00014729081127900476, "loss": 0.783, "step": 1795 },
    { "epoch": 4.100227790432802, "grad_norm": 0.4765625, "learning_rate": 0.00014694013547295672, "loss": 0.7908, "step": 1800 },
    { "epoch": 4.111617312072893, "grad_norm": 0.89453125, "learning_rate": 0.0001465887177257545, "loss": 0.7923, "step": 1805 },
    { "epoch": 4.123006833712984, "grad_norm": 1.125, "learning_rate": 0.00014623656359194712, "loss": 0.7904, "step": 1810 },
    { "epoch": 4.134396355353076, "grad_norm": 1.1953125, "learning_rate": 0.00014588367863772325, "loss": 0.7881, "step": 1815 },
    { "epoch": 4.145785876993166, "grad_norm": 0.71875, "learning_rate": 0.00014553006844082283, "loss": 0.7831, "step": 1820 },
    { "epoch": 4.157175398633258, "grad_norm": 0.8125, "learning_rate": 0.00014517573859044907, "loss": 0.788, "step": 1825 },
    { "epoch": 4.168564920273348, "grad_norm": 1.0078125, "learning_rate": 0.00014482069468718022, "loss": 0.7853, "step": 1830 },
    { "epoch": 4.17995444191344, "grad_norm": 0.470703125, "learning_rate": 0.00014446494234288083, "loss": 0.7931, "step": 1835 },
    { "epoch": 4.191343963553531, "grad_norm": 0.46484375, "learning_rate": 0.00014410848718061312, "loss": 0.7942, "step": 1840 },
    { "epoch": 4.2027334851936216, "grad_norm": 0.453125, "learning_rate": 0.0001437513348345482, "loss": 0.7834, "step": 1845 },
    { "epoch": 4.214123006833713, "grad_norm": 0.625, "learning_rate": 0.00014339349094987699, "loss": 0.7797, "step": 1850 },
    { "epoch": 4.225512528473804, "grad_norm": 0.6796875, "learning_rate": 0.00014303496118272084, "loss": 0.7876, "step": 1855 },
    { "epoch": 4.236902050113895, "grad_norm": 0.43359375, "learning_rate": 0.00014267575120004231, "loss": 0.7943, "step": 1860 },
    { "epoch": 4.248291571753986, "grad_norm": 0.486328125, "learning_rate": 0.00014231586667955552, "loss": 0.7929, "step": 1865 },
    { "epoch": 4.259681093394078, "grad_norm": 0.451171875, "learning_rate": 0.00014195531330963635, "loss": 0.7842, "step": 1870 },
    { "epoch": 4.271070615034168, "grad_norm": 0.466796875, "learning_rate": 0.00014159409678923265, "loss": 0.7878, "step": 1875 },
    { "epoch": 4.28246013667426, "grad_norm": 0.53125, "learning_rate": 0.0001412322228277741, "loss": 0.7843, "step": 1880 },
    { "epoch": 4.29384965831435, "grad_norm": 0.63671875, "learning_rate": 0.00014086969714508196, "loss": 0.7829, "step": 1885 },
    { "epoch": 4.305239179954442, "grad_norm": 0.52734375, "learning_rate": 0.00014050652547127864, "loss": 0.784, "step": 1890 },
    { "epoch": 4.316628701594533, "grad_norm": 0.8046875, "learning_rate": 0.00014014271354669718, "loss": 0.7815, "step": 1895 },
    { "epoch": 4.328018223234624, "grad_norm": 0.54296875, "learning_rate": 0.00013977826712179058, "loss": 0.7865, "step": 1900 },
    { "epoch": 4.339407744874715, "grad_norm": 0.6015625, "learning_rate": 0.0001394131919570407, "loss": 0.7784, "step": 1905 },
    { "epoch": 4.350797266514807, "grad_norm": 0.416015625, "learning_rate": 0.00013904749382286734, "loss": 0.7846, "step": 1910 },
    { "epoch": 4.362186788154897, "grad_norm": 0.546875, "learning_rate": 0.0001386811784995371, "loss": 0.7905, "step": 1915 },
    { "epoch": 4.373576309794989, "grad_norm": 0.6953125, "learning_rate": 0.00013831425177707193, "loss": 0.7936, "step": 1920 },
    { "epoch": 4.38496583143508, "grad_norm": 0.58203125, "learning_rate": 0.00013794671945515757, "loss": 0.7828, "step": 1925 },
    { "epoch": 4.396355353075171, "grad_norm": 0.90625, "learning_rate": 0.00013757858734305203,
|
"loss": 0.7888, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 4.407744874715262, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.00013720986125949353, |
|
"loss": 0.7852, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 4.4191343963553535, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00013684054703260882, |
|
"loss": 0.7921, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 4.430523917995444, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.00013647065049982078, |
|
"loss": 0.7872, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 4.4419134396355355, |
|
"grad_norm": 2.25, |
|
"learning_rate": 0.00013610017750775643, |
|
"loss": 0.7883, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 4.453302961275626, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0001357291339121542, |
|
"loss": 0.7848, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 4.4646924829157175, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.0001353575255777717, |
|
"loss": 0.7894, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 4.476082004555809, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.00013498535837829276, |
|
"loss": 0.7911, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 4.4874715261958995, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.00013461263819623476, |
|
"loss": 0.7897, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 4.498861047835991, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00013423937092285555, |
|
"loss": 0.7833, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 4.510250569476082, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.00013386556245806034, |
|
"loss": 0.8, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 4.521640091116173, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.00013349121871030856, |
|
"loss": 0.7984, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 4.533029612756264, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00013311634559652036, |
|
"loss": 0.7938, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 4.544419134396355, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.000132740949041983, |
|
"loss": 0.7826, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 4.555808656036446, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00013236503498025747, |
|
"loss": 0.7922, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.567198177676538, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.00013198860935308444, |
|
"loss": 0.7796, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 4.578587699316628, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.0001316116781102904, |
|
"loss": 0.7926, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 4.58997722095672, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.0001312342472096938, |
|
"loss": 0.7877, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 4.601366742596811, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.00013085632261701063, |
|
"loss": 0.7903, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 4.612756264236902, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00013047791030576023, |
|
"loss": 0.7826, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 4.624145785876993, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00013009901625717093, |
|
"loss": 0.7823, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 4.635535307517085, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.00012971964646008542, |
|
"loss": 0.7884, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 4.646924829157175, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.0001293398069108662, |
|
"loss": 0.7846, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 4.658314350797267, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00012895950361330058, |
|
"loss": 0.7822, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 4.669703872437358, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.00012857874257850605, |
|
"loss": 0.7899, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 4.681093394077449, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.00012819752982483508, |
|
"loss": 0.7914, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 4.69248291571754, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00012781587137778013, |
|
"loss": 0.7859, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 4.703872437357631, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00012743377326987826, |
|
"loss": 0.7849, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 4.715261958997722, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.00012705124154061597, |
|
"loss": 0.7852, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 4.7266514806378135, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00012666828223633348, |
|
"loss": 0.7802, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 4.738041002277904, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00012628490141012937, |
|
"loss": 0.792, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 4.7494305239179955, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.00012590110512176498, |
|
"loss": 0.7915, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 4.760820045558087, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.0001255168994375683, |
|
"loss": 0.7859, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 4.7722095671981775, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.0001251322904303383, |
|
"loss": 0.7901, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 4.783599088838269, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.0001247472841792491, |
|
"loss": 0.7866, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.7949886104783594, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00012436188676975346, |
|
"loss": 0.7846, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 4.806378132118451, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.000123976104293487, |
|
"loss": 0.789, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 4.817767653758542, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00012358994284817167, |
|
"loss": 0.7765, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 4.829157175398633, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.00012320340853751952, |
|
"loss": 0.7877, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 4.840546697038724, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.00012281650747113612, |
|
"loss": 0.7862, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 4.851936218678816, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.00012242924576442388, |
|
"loss": 0.7897, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 4.863325740318906, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.00012204162953848581, |
|
"loss": 0.7782, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 4.874715261958998, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.00012165366492002832, |
|
"loss": 0.7796, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 4.886104783599089, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.00012126535804126451, |
|
"loss": 0.791, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 4.89749430523918, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00012087671503981741, |
|
"loss": 0.7875, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 4.908883826879271, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00012048774205862279, |
|
"loss": 0.7783, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 4.920273348519363, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00012009844524583203, |
|
"loss": 0.7865, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 4.931662870159453, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.00011970883075471522, |
|
"loss": 0.7899, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 4.943052391799545, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.00011931890474356358, |
|
"loss": 0.7838, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 4.954441913439636, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00011892867337559221, |
|
"loss": 0.7859, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 4.965831435079727, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00011853814281884283, |
|
"loss": 0.7794, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 4.977220956719818, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00011814731924608616, |
|
"loss": 0.7793, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 4.988610478359909, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00011775620883472424, |
|
"loss": 0.7818, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.7788, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 2.416613817214966, |
|
"eval_runtime": 0.2352, |
|
"eval_samples_per_second": 42.522, |
|
"eval_steps_per_second": 4.252, |
|
"step": 2195 |
|
}, |
    {
      "epoch": 5.011389521640091,
      "grad_norm": 0.43359375,
      "learning_rate": 0.00011697315222836458,
      "loss": 0.7695,
      "step": 2200
    },
    {
      "epoch": 5.022779043280182,
      "grad_norm": 0.37109375,
      "learning_rate": 0.00011658121841044922,
      "loss": 0.7684,
      "step": 2205
    },
    {
      "epoch": 5.034168564920273,
      "grad_norm": 0.380859375,
      "learning_rate": 0.0001161890225078977,
      "loss": 0.7578,
      "step": 2210
    },
    {
      "epoch": 5.045558086560365,
      "grad_norm": 0.392578125,
      "learning_rate": 0.0001157965707198034,
      "loss": 0.7611,
      "step": 2215
    },
    {
      "epoch": 5.056947608200455,
      "grad_norm": 0.40234375,
      "learning_rate": 0.00011540386924930413,
      "loss": 0.7611,
      "step": 2220
    },
    {
      "epoch": 5.068337129840547,
      "grad_norm": 0.5546875,
      "learning_rate": 0.00011501092430348435,
      "loss": 0.7644,
      "step": 2225
    },
    {
      "epoch": 5.079726651480637,
      "grad_norm": 0.4140625,
      "learning_rate": 0.0001146177420932768,
      "loss": 0.7635,
      "step": 2230
    },
    {
      "epoch": 5.091116173120729,
      "grad_norm": 0.3828125,
      "learning_rate": 0.00011422432883336456,
      "loss": 0.7639,
      "step": 2235
    },
    {
      "epoch": 5.10250569476082,
      "grad_norm": 0.474609375,
      "learning_rate": 0.00011383069074208259,
      "loss": 0.77,
      "step": 2240
    },
    {
      "epoch": 5.113895216400911,
      "grad_norm": 0.404296875,
      "learning_rate": 0.00011343683404131964,
      "loss": 0.7643,
      "step": 2245
    },
    {
      "epoch": 5.125284738041002,
      "grad_norm": 0.384765625,
      "learning_rate": 0.00011304276495641981,
      "loss": 0.7696,
      "step": 2250
    },
    {
      "epoch": 5.136674259681094,
      "grad_norm": 0.375,
      "learning_rate": 0.0001126484897160842,
      "loss": 0.7619,
      "step": 2255
    },
    {
      "epoch": 5.148063781321184,
      "grad_norm": 0.412109375,
      "learning_rate": 0.0001122540145522723,
      "loss": 0.7612,
      "step": 2260
    },
    {
      "epoch": 5.159453302961276,
      "grad_norm": 0.38671875,
      "learning_rate": 0.00011185934570010374,
      "loss": 0.7596,
      "step": 2265
    },
    {
      "epoch": 5.170842824601367,
      "grad_norm": 0.373046875,
      "learning_rate": 0.00011146448939775962,
      "loss": 0.7652,
      "step": 2270
    },
    {
      "epoch": 5.182232346241458,
      "grad_norm": 0.390625,
      "learning_rate": 0.00011106945188638378,
      "loss": 0.763,
      "step": 2275
    },
    {
      "epoch": 5.193621867881549,
      "grad_norm": 0.4609375,
      "learning_rate": 0.00011067423940998438,
      "loss": 0.7633,
      "step": 2280
    },
    {
      "epoch": 5.20501138952164,
      "grad_norm": 0.396484375,
      "learning_rate": 0.00011027885821533508,
      "loss": 0.7679,
      "step": 2285
    },
    {
      "epoch": 5.216400911161731,
      "grad_norm": 0.390625,
      "learning_rate": 0.00010988331455187628,
      "loss": 0.7661,
      "step": 2290
    },
    {
      "epoch": 5.2277904328018225,
      "grad_norm": 0.37890625,
      "learning_rate": 0.00010948761467161637,
      "loss": 0.7699,
      "step": 2295
    },
    {
      "epoch": 5.239179954441913,
      "grad_norm": 0.3671875,
      "learning_rate": 0.00010909176482903295,
      "loss": 0.7734,
      "step": 2300
    },
    {
      "epoch": 5.2505694760820045,
      "grad_norm": 0.40625,
      "learning_rate": 0.00010869577128097404,
      "loss": 0.7675,
      "step": 2305
    },
    {
      "epoch": 5.261958997722096,
      "grad_norm": 0.41015625,
      "learning_rate": 0.00010829964028655885,
      "loss": 0.7645,
      "step": 2310
    },
    {
      "epoch": 5.2733485193621865,
      "grad_norm": 0.4609375,
      "learning_rate": 0.00010790337810707931,
      "loss": 0.767,
      "step": 2315
    },
    {
      "epoch": 5.284738041002278,
      "grad_norm": 0.5703125,
      "learning_rate": 0.00010750699100590076,
      "loss": 0.7662,
      "step": 2320
    },
    {
      "epoch": 5.296127562642369,
      "grad_norm": 0.4765625,
      "learning_rate": 0.00010711048524836311,
      "loss": 0.7673,
      "step": 2325
    },
    {
      "epoch": 5.30751708428246,
      "grad_norm": 0.453125,
      "learning_rate": 0.0001067138671016817,
      "loss": 0.7666,
      "step": 2330
    },
    {
      "epoch": 5.318906605922551,
      "grad_norm": 0.37109375,
      "learning_rate": 0.00010631714283484842,
      "loss": 0.7687,
      "step": 2335
    },
    {
      "epoch": 5.330296127562642,
      "grad_norm": 0.37890625,
      "learning_rate": 0.00010592031871853239,
      "loss": 0.771,
      "step": 2340
    },
    {
      "epoch": 5.341685649202733,
      "grad_norm": 0.384765625,
      "learning_rate": 0.00010552340102498104,
      "loss": 0.7624,
      "step": 2345
    },
    {
      "epoch": 5.353075170842825,
      "grad_norm": 0.46875,
      "learning_rate": 0.00010512639602792088,
      "loss": 0.7654,
      "step": 2350
    },
    {
      "epoch": 5.364464692482915,
      "grad_norm": 0.5,
      "learning_rate": 0.0001047293100024583,
      "loss": 0.7585,
      "step": 2355
    },
    {
      "epoch": 5.375854214123007,
      "grad_norm": 0.43359375,
      "learning_rate": 0.00010433214922498047,
      "loss": 0.7622,
      "step": 2360
    },
    {
      "epoch": 5.387243735763098,
      "grad_norm": 0.41796875,
      "learning_rate": 0.00010393491997305613,
      "loss": 0.7711,
      "step": 2365
    },
    {
      "epoch": 5.398633257403189,
      "grad_norm": 0.421875,
      "learning_rate": 0.0001035376285253363,
      "loss": 0.7672,
      "step": 2370
    },
    {
      "epoch": 5.41002277904328,
      "grad_norm": 0.427734375,
      "learning_rate": 0.00010314028116145509,
      "loss": 0.7748,
      "step": 2375
    },
    {
      "epoch": 5.421412300683372,
      "grad_norm": 0.408203125,
      "learning_rate": 0.00010274288416193034,
      "loss": 0.7648,
      "step": 2380
    },
    {
      "epoch": 5.432801822323462,
      "grad_norm": 0.380859375,
      "learning_rate": 0.00010234544380806461,
      "loss": 0.7623,
      "step": 2385
    },
    {
      "epoch": 5.444191343963554,
      "grad_norm": 0.390625,
      "learning_rate": 0.00010194796638184558,
      "loss": 0.7707,
      "step": 2390
    },
    {
      "epoch": 5.455580865603645,
      "grad_norm": 0.439453125,
      "learning_rate": 0.00010155045816584691,
      "loss": 0.7629,
      "step": 2395
    },
    {
      "epoch": 5.466970387243736,
      "grad_norm": 0.92578125,
      "learning_rate": 0.00010115292544312904,
      "loss": 0.7728,
      "step": 2400
    },
    {
      "epoch": 5.478359908883827,
      "grad_norm": 0.421875,
      "learning_rate": 0.00010075537449713963,
      "loss": 0.7704,
      "step": 2405
    },
    {
      "epoch": 5.489749430523918,
      "grad_norm": 0.376953125,
      "learning_rate": 0.00010035781161161446,
      "loss": 0.7731,
      "step": 2410
    },
    {
      "epoch": 5.501138952164009,
      "grad_norm": 0.5,
      "learning_rate": 9.996024307047798e-05,
      "loss": 0.7712,
      "step": 2415
    },
    {
      "epoch": 5.5125284738041,
      "grad_norm": 0.451171875,
      "learning_rate": 9.956267515774412e-05,
      "loss": 0.7647,
      "step": 2420
    },
    {
      "epoch": 5.523917995444191,
      "grad_norm": 0.408203125,
      "learning_rate": 9.916511415741676e-05,
      "loss": 0.7712,
      "step": 2425
    },
    {
      "epoch": 5.5353075170842825,
      "grad_norm": 0.423828125,
      "learning_rate": 9.876756635339058e-05,
      "loss": 0.77,
      "step": 2430
    },
    {
      "epoch": 5.546697038724374,
      "grad_norm": 0.66015625,
      "learning_rate": 9.83700380293517e-05,
      "loss": 0.7637,
      "step": 2435
    },
    {
      "epoch": 5.5580865603644645,
      "grad_norm": 0.6484375,
      "learning_rate": 9.797253546867831e-05,
      "loss": 0.7676,
      "step": 2440
    },
    {
      "epoch": 5.569476082004556,
      "grad_norm": 0.37890625,
      "learning_rate": 9.757506495434133e-05,
      "loss": 0.7601,
      "step": 2445
    },
    {
      "epoch": 5.5808656036446465,
      "grad_norm": 0.466796875,
      "learning_rate": 9.71776327688053e-05,
      "loss": 0.7657,
      "step": 2450
    },
    {
      "epoch": 5.592255125284738,
      "grad_norm": 0.369140625,
      "learning_rate": 9.678024519392871e-05,
      "loss": 0.7737,
      "step": 2455
    },
    {
      "epoch": 5.603644646924829,
      "grad_norm": 0.455078125,
      "learning_rate": 9.638290851086518e-05,
      "loss": 0.7676,
      "step": 2460
    },
    {
      "epoch": 5.61503416856492,
      "grad_norm": 0.59765625,
      "learning_rate": 9.598562899996375e-05,
      "loss": 0.7704,
      "step": 2465
    },
    {
      "epoch": 5.626423690205011,
      "grad_norm": 0.474609375,
      "learning_rate": 9.558841294066985e-05,
      "loss": 0.7631,
      "step": 2470
    },
    {
      "epoch": 5.637813211845103,
      "grad_norm": 0.451171875,
      "learning_rate": 9.519126661142597e-05,
      "loss": 0.7657,
      "step": 2475
    },
    {
      "epoch": 5.649202733485193,
      "grad_norm": 0.380859375,
      "learning_rate": 9.479419628957246e-05,
      "loss": 0.7668,
      "step": 2480
    },
    {
      "epoch": 5.660592255125285,
      "grad_norm": 0.37109375,
      "learning_rate": 9.439720825124827e-05,
      "loss": 0.7604,
      "step": 2485
    },
    {
      "epoch": 5.671981776765376,
      "grad_norm": 0.380859375,
      "learning_rate": 9.400030877129176e-05,
      "loss": 0.7685,
      "step": 2490
    },
    {
      "epoch": 5.683371298405467,
      "grad_norm": 0.392578125,
      "learning_rate": 9.360350412314157e-05,
      "loss": 0.7715,
      "step": 2495
    },
    {
      "epoch": 5.694760820045558,
      "grad_norm": 0.349609375,
      "learning_rate": 9.320680057873735e-05,
      "loss": 0.7628,
      "step": 2500
    },
    {
      "epoch": 5.70615034168565,
      "grad_norm": 0.369140625,
      "learning_rate": 9.281020440842079e-05,
      "loss": 0.7629,
      "step": 2505
    },
    {
      "epoch": 5.71753986332574,
      "grad_norm": 0.38671875,
      "learning_rate": 9.241372188083631e-05,
      "loss": 0.7585,
      "step": 2510
    },
    {
      "epoch": 5.728929384965832,
      "grad_norm": 0.36328125,
      "learning_rate": 9.201735926283213e-05,
      "loss": 0.768,
      "step": 2515
    },
    {
      "epoch": 5.740318906605923,
      "grad_norm": 0.373046875,
      "learning_rate": 9.162112281936118e-05,
      "loss": 0.7658,
      "step": 2520
    },
    {
      "epoch": 5.751708428246014,
      "grad_norm": 0.388671875,
      "learning_rate": 9.122501881338199e-05,
      "loss": 0.7681,
      "step": 2525
    },
    {
      "epoch": 5.763097949886105,
      "grad_norm": 0.400390625,
      "learning_rate": 9.082905350575986e-05,
      "loss": 0.7653,
      "step": 2530
    },
    {
      "epoch": 5.774487471526196,
      "grad_norm": 0.375,
      "learning_rate": 9.043323315516775e-05,
      "loss": 0.7711,
      "step": 2535
    },
    {
      "epoch": 5.785876993166287,
      "grad_norm": 0.388671875,
      "learning_rate": 9.003756401798744e-05,
      "loss": 0.7596,
      "step": 2540
    },
    {
      "epoch": 5.7972665148063784,
      "grad_norm": 0.384765625,
      "learning_rate": 8.96420523482106e-05,
      "loss": 0.7649,
      "step": 2545
    },
    {
      "epoch": 5.808656036446469,
      "grad_norm": 0.375,
      "learning_rate": 8.924670439733997e-05,
      "loss": 0.7686,
      "step": 2550
    },
    {
      "epoch": 5.82004555808656,
      "grad_norm": 0.3828125,
      "learning_rate": 8.885152641429049e-05,
      "loss": 0.771,
      "step": 2555
    },
    {
      "epoch": 5.831435079726651,
      "grad_norm": 0.44140625,
      "learning_rate": 8.845652464529057e-05,
      "loss": 0.7638,
      "step": 2560
    },
    {
      "epoch": 5.842824601366742,
      "grad_norm": 0.404296875,
      "learning_rate": 8.806170533378345e-05,
      "loss": 0.7705,
      "step": 2565
    },
    {
      "epoch": 5.854214123006834,
      "grad_norm": 0.5234375,
      "learning_rate": 8.766707472032831e-05,
      "loss": 0.768,
      "step": 2570
    },
    {
      "epoch": 5.865603644646924,
      "grad_norm": 0.3828125,
      "learning_rate": 8.727263904250178e-05,
      "loss": 0.7626,
      "step": 2575
    },
    {
      "epoch": 5.876993166287016,
      "grad_norm": 0.63671875,
      "learning_rate": 8.687840453479938e-05,
      "loss": 0.7728,
      "step": 2580
    },
    {
      "epoch": 5.888382687927107,
      "grad_norm": 0.5859375,
      "learning_rate": 8.648437742853685e-05,
      "loss": 0.7665,
      "step": 2585
    },
    {
      "epoch": 5.899772209567198,
      "grad_norm": 0.37109375,
      "learning_rate": 8.609056395175175e-05,
      "loss": 0.7613,
      "step": 2590
    },
    {
      "epoch": 5.911161731207289,
      "grad_norm": 0.40625,
      "learning_rate": 8.569697032910492e-05,
      "loss": 0.7712,
      "step": 2595
    },
    {
      "epoch": 5.922551252847381,
      "grad_norm": 0.443359375,
      "learning_rate": 8.530360278178227e-05,
      "loss": 0.7704,
      "step": 2600
    },
    {
      "epoch": 5.933940774487471,
      "grad_norm": 0.388671875,
      "learning_rate": 8.491046752739624e-05,
      "loss": 0.7672,
      "step": 2605
    },
    {
      "epoch": 5.945330296127563,
      "grad_norm": 0.36328125,
      "learning_rate": 8.451757077988767e-05,
      "loss": 0.7701,
      "step": 2610
    },
    {
      "epoch": 5.956719817767654,
      "grad_norm": 0.40625,
      "learning_rate": 8.41249187494275e-05,
      "loss": 0.7666,
      "step": 2615
    },
    {
      "epoch": 5.968109339407745,
      "grad_norm": 0.58203125,
      "learning_rate": 8.373251764231872e-05,
      "loss": 0.7562,
      "step": 2620
    },
    {
      "epoch": 5.979498861047836,
      "grad_norm": 0.373046875,
      "learning_rate": 8.334037366089813e-05,
      "loss": 0.765,
      "step": 2625
    },
    {
      "epoch": 5.990888382687928,
      "grad_norm": 0.44921875,
      "learning_rate": 8.294849300343836e-05,
      "loss": 0.771,
      "step": 2630
    },
    {
      "epoch": 6.0,
      "eval_loss": 2.432857036590576,
      "eval_runtime": 0.2435,
      "eval_samples_per_second": 41.072,
      "eval_steps_per_second": 4.107,
      "step": 2634
    },
    {
      "epoch": 6.002277904328018,
      "grad_norm": 0.6328125,
      "learning_rate": 8.255688186404996e-05,
      "loss": 0.77,
      "step": 2635
    },
    {
      "epoch": 6.0136674259681095,
      "grad_norm": 0.56640625,
      "learning_rate": 8.216554643258342e-05,
      "loss": 0.748,
      "step": 2640
    },
    {
      "epoch": 6.0250569476082,
      "grad_norm": 0.58203125,
      "learning_rate": 8.177449289453134e-05,
      "loss": 0.7503,
      "step": 2645
    },
    {
      "epoch": 6.0364464692482915,
      "grad_norm": 0.5,
      "learning_rate": 8.138372743093076e-05,
      "loss": 0.7419,
      "step": 2650
    },
    {
      "epoch": 6.047835990888383,
      "grad_norm": 0.45703125,
      "learning_rate": 8.099325621826526e-05,
      "loss": 0.7518,
      "step": 2655
    },
    {
      "epoch": 6.0592255125284735,
      "grad_norm": 0.37890625,
      "learning_rate": 8.060308542836755e-05,
      "loss": 0.76,
      "step": 2660
    },
    {
      "epoch": 6.070615034168565,
      "grad_norm": 0.5546875,
      "learning_rate": 8.021322122832178e-05,
      "loss": 0.7567,
      "step": 2665
    },
    {
      "epoch": 6.082004555808656,
      "grad_norm": 0.5625,
      "learning_rate": 7.982366978036618e-05,
      "loss": 0.7548,
      "step": 2670
    },
    {
      "epoch": 6.093394077448747,
      "grad_norm": 0.3671875,
      "learning_rate": 7.943443724179548e-05,
      "loss": 0.7557,
      "step": 2675
    },
    {
      "epoch": 6.104783599088838,
      "grad_norm": 0.41796875,
      "learning_rate": 7.904552976486372e-05,
      "loss": 0.7571,
      "step": 2680
    },
    {
      "epoch": 6.116173120728929,
      "grad_norm": 0.376953125,
      "learning_rate": 7.865695349668703e-05,
      "loss": 0.7572,
      "step": 2685
    },
    {
      "epoch": 6.12756264236902,
      "grad_norm": 0.388671875,
      "learning_rate": 7.826871457914639e-05,
      "loss": 0.7415,
      "step": 2690
    },
    {
      "epoch": 6.138952164009112,
      "grad_norm": 0.494140625,
      "learning_rate": 7.788081914879051e-05,
      "loss": 0.7547,
      "step": 2695
    },
    {
      "epoch": 6.150341685649202,
      "grad_norm": 0.408203125,
      "learning_rate": 7.7493273336739e-05,
      "loss": 0.7431,
      "step": 2700
    },
    {
      "epoch": 6.161731207289294,
      "grad_norm": 0.50390625,
      "learning_rate": 7.710608326858535e-05,
      "loss": 0.7555,
      "step": 2705
    },
    {
      "epoch": 6.173120728929385,
      "grad_norm": 0.421875,
      "learning_rate": 7.67192550643001e-05,
      "loss": 0.7472,
      "step": 2710
    },
    {
      "epoch": 6.184510250569476,
      "grad_norm": 0.396484375,
      "learning_rate": 7.633279483813405e-05,
      "loss": 0.7569,
      "step": 2715
    },
    {
      "epoch": 6.195899772209567,
      "grad_norm": 0.3828125,
      "learning_rate": 7.594670869852185e-05,
      "loss": 0.7494,
      "step": 2720
    },
    {
      "epoch": 6.207289293849659,
      "grad_norm": 0.384765625,
      "learning_rate": 7.556100274798519e-05,
      "loss": 0.7532,
      "step": 2725
    },
    {
      "epoch": 6.218678815489749,
      "grad_norm": 0.392578125,
      "learning_rate": 7.517568308303643e-05,
      "loss": 0.7581,
      "step": 2730
    },
    {
      "epoch": 6.230068337129841,
      "grad_norm": 0.390625,
      "learning_rate": 7.47907557940824e-05,
      "loss": 0.7569,
      "step": 2735
    },
    {
      "epoch": 6.241457858769932,
      "grad_norm": 0.38671875,
      "learning_rate": 7.440622696532775e-05,
      "loss": 0.7549,
      "step": 2740
    },
    {
      "epoch": 6.252847380410023,
      "grad_norm": 0.42578125,
      "learning_rate": 7.402210267467928e-05,
      "loss": 0.7478,
      "step": 2745
    },
    {
      "epoch": 6.264236902050114,
      "grad_norm": 0.4609375,
      "learning_rate": 7.363838899364944e-05,
      "loss": 0.7515,
      "step": 2750
    },
    {
      "epoch": 6.275626423690205,
      "grad_norm": 0.392578125,
      "learning_rate": 7.325509198726064e-05,
      "loss": 0.7471,
      "step": 2755
    },
    {
      "epoch": 6.287015945330296,
      "grad_norm": 0.408203125,
      "learning_rate": 7.287221771394917e-05,
      "loss": 0.7565,
      "step": 2760
    },
    {
      "epoch": 6.2984054669703875,
      "grad_norm": 0.462890625,
      "learning_rate": 7.248977222546968e-05,
      "loss": 0.7572,
      "step": 2765
    },
    {
      "epoch": 6.309794988610478,
      "grad_norm": 0.486328125,
      "learning_rate": 7.210776156679931e-05,
      "loss": 0.7442,
      "step": 2770
    },
    {
      "epoch": 6.3211845102505695,
      "grad_norm": 0.45703125,
      "learning_rate": 7.172619177604223e-05,
      "loss": 0.7615,
      "step": 2775
    },
    {
      "epoch": 6.332574031890661,
      "grad_norm": 0.380859375,
      "learning_rate": 7.134506888433426e-05,
      "loss": 0.7538,
      "step": 2780
    },
    {
      "epoch": 6.3439635535307515,
      "grad_norm": 0.466796875,
      "learning_rate": 7.096439891574745e-05,
      "loss": 0.7445,
      "step": 2785
    },
    {
      "epoch": 6.355353075170843,
      "grad_norm": 0.3828125,
      "learning_rate": 7.058418788719491e-05,
      "loss": 0.7593,
      "step": 2790
    },
    {
      "epoch": 6.366742596810934,
      "grad_norm": 0.392578125,
      "learning_rate": 7.020444180833564e-05,
      "loss": 0.7603,
      "step": 2795
    },
    {
      "epoch": 6.378132118451025,
      "grad_norm": 0.412109375,
      "learning_rate": 6.982516668147967e-05,
      "loss": 0.7544,
      "step": 2800
    },
    {
      "epoch": 6.389521640091116,
      "grad_norm": 0.3984375,
      "learning_rate": 6.944636850149306e-05,
      "loss": 0.7508,
      "step": 2805
    },
    {
      "epoch": 6.400911161731207,
      "grad_norm": 0.380859375,
      "learning_rate": 6.906805325570316e-05,
      "loss": 0.7587,
      "step": 2810
    },
    {
      "epoch": 6.412300683371298,
      "grad_norm": 0.361328125,
      "learning_rate": 6.869022692380411e-05,
      "loss": 0.7456,
      "step": 2815
    },
    {
      "epoch": 6.42369020501139,
      "grad_norm": 0.37109375,
      "learning_rate": 6.831289547776207e-05,
      "loss": 0.7541,
      "step": 2820
    },
    {
      "epoch": 6.43507972665148,
      "grad_norm": 0.4765625,
      "learning_rate": 6.793606488172118e-05,
      "loss": 0.7477,
      "step": 2825
    },
    {
      "epoch": 6.446469248291572,
      "grad_norm": 0.400390625,
      "learning_rate": 6.75597410919089e-05,
      "loss": 0.7508,
      "step": 2830
    },
    {
      "epoch": 6.457858769931663,
      "grad_norm": 0.44140625,
      "learning_rate": 6.718393005654215e-05,
      "loss": 0.7583,
      "step": 2835
    },
    {
      "epoch": 6.469248291571754,
      "grad_norm": 0.3984375,
      "learning_rate": 6.680863771573318e-05,
      "loss": 0.7533,
      "step": 2840
    },
    {
      "epoch": 6.480637813211845,
      "grad_norm": 0.376953125,
      "learning_rate": 6.643387000139565e-05,
      "loss": 0.7506,
      "step": 2845
    },
    {
      "epoch": 6.492027334851937,
      "grad_norm": 0.43359375,
      "learning_rate": 6.6059632837151e-05,
      "loss": 0.7556,
      "step": 2850
    },
    {
      "epoch": 6.503416856492027,
      "grad_norm": 0.373046875,
      "learning_rate": 6.568593213823465e-05,
      "loss": 0.7505,
      "step": 2855
    },
    {
      "epoch": 6.514806378132119,
      "grad_norm": 0.3828125,
      "learning_rate": 6.53127738114026e-05,
      "loss": 0.7499,
      "step": 2860
    },
    {
      "epoch": 6.52619589977221,
      "grad_norm": 0.419921875,
      "learning_rate": 6.494016375483811e-05,
      "loss": 0.7614,
      "step": 2865
    },
    {
      "epoch": 6.537585421412301,
      "grad_norm": 0.37109375,
      "learning_rate": 6.456810785805842e-05,
      "loss": 0.7412,
      "step": 2870
    },
    {
      "epoch": 6.548974943052392,
      "grad_norm": 0.38671875,
      "learning_rate": 6.419661200182158e-05,
      "loss": 0.7612,
      "step": 2875
    },
    {
      "epoch": 6.560364464692483,
      "grad_norm": 0.3828125,
      "learning_rate": 6.38256820580336e-05,
      "loss": 0.7529,
      "step": 2880
    },
    {
      "epoch": 6.571753986332574,
      "grad_norm": 0.5234375,
      "learning_rate": 6.345532388965565e-05,
      "loss": 0.7536,
      "step": 2885
    },
    {
      "epoch": 6.5831435079726655,
      "grad_norm": 0.427734375,
      "learning_rate": 6.308554335061135e-05,
      "loss": 0.7533,
      "step": 2890
    },
    {
      "epoch": 6.594533029612756,
      "grad_norm": 0.412109375,
      "learning_rate": 6.271634628569418e-05,
      "loss": 0.7502,
      "step": 2895
    },
    {
      "epoch": 6.605922551252847,
      "grad_norm": 0.40625,
      "learning_rate": 6.234773853047526e-05,
      "loss": 0.7513,
      "step": 2900
    },
    {
      "epoch": 6.617312072892939,
      "grad_norm": 0.373046875,
      "learning_rate": 6.19797259112109e-05,
      "loss": 0.7542,
      "step": 2905
    },
    {
      "epoch": 6.628701594533029,
      "grad_norm": 0.44921875,
      "learning_rate": 6.161231424475075e-05,
      "loss": 0.7499,
      "step": 2910
    },
    {
      "epoch": 6.640091116173121,
      "grad_norm": 0.51171875,
      "learning_rate": 6.124550933844562e-05,
      "loss": 0.7522,
      "step": 2915
    },
    {
      "epoch": 6.651480637813211,
      "grad_norm": 0.455078125,
      "learning_rate": 6.087931699005588e-05,
      "loss": 0.7573,
      "step": 2920
    },
    {
      "epoch": 6.662870159453303,
      "grad_norm": 0.56640625,
      "learning_rate": 6.0513742987659686e-05,
      "loss": 0.7557,
      "step": 2925
    },
    {
      "epoch": 6.674259681093394,
      "grad_norm": 0.451171875,
      "learning_rate": 6.014879310956154e-05,
      "loss": 0.7554,
      "step": 2930
    },
    {
      "epoch": 6.685649202733485,
      "grad_norm": 0.404296875,
      "learning_rate": 5.978447312420103e-05,
      "loss": 0.7524,
      "step": 2935
    },
    {
      "epoch": 6.697038724373576,
      "grad_norm": 0.515625,
      "learning_rate": 5.9420788790061544e-05,
      "loss": 0.7571,
      "step": 2940
    },
    {
      "epoch": 6.708428246013668,
      "grad_norm": 0.3984375,
      "learning_rate": 5.905774585557922e-05,
      "loss": 0.7512,
      "step": 2945
    },
    {
      "epoch": 6.719817767653758,
      "grad_norm": 0.375,
      "learning_rate": 5.869535005905232e-05,
      "loss": 0.747,
      "step": 2950
    },
    {
      "epoch": 6.73120728929385,
      "grad_norm": 0.439453125,
      "learning_rate": 5.833360712855029e-05,
      "loss": 0.7562,
      "step": 2955
    },
    {
      "epoch": 6.742596810933941,
      "grad_norm": 0.5,
      "learning_rate": 5.7972522781823256e-05,
      "loss": 0.752,
      "step": 2960
    },
    {
      "epoch": 6.753986332574032,
      "grad_norm": 0.37109375,
      "learning_rate": 5.761210272621175e-05,
      "loss": 0.7494,
      "step": 2965
    },
    {
      "epoch": 6.765375854214123,
      "grad_norm": 0.375,
      "learning_rate": 5.7252352658556376e-05,
      "loss": 0.7533,
      "step": 2970
    },
    {
      "epoch": 6.776765375854215,
      "grad_norm": 0.3984375,
      "learning_rate": 5.689327826510796e-05,
      "loss": 0.7486,
      "step": 2975
    },
    {
      "epoch": 6.788154897494305,
      "grad_norm": 0.3671875,
      "learning_rate": 5.653488522143744e-05,
      "loss": 0.7489,
      "step": 2980
    },
    {
      "epoch": 6.7995444191343966,
      "grad_norm": 0.4140625,
      "learning_rate": 5.617717919234624e-05,
      "loss": 0.7518,
      "step": 2985
    },
    {
      "epoch": 6.810933940774487,
      "grad_norm": 0.453125,
      "learning_rate": 5.582016583177687e-05,
      "loss": 0.7607,
      "step": 2990
    },
    {
      "epoch": 6.8223234624145785,
      "grad_norm": 0.48046875,
      "learning_rate": 5.5463850782723346e-05,
      "loss": 0.7518,
      "step": 2995
    },
    {
      "epoch": 6.83371298405467,
      "grad_norm": 0.42578125,
      "learning_rate": 5.5108239677142115e-05,
      "loss": 0.7445,
      "step": 3000
    },
    {
      "epoch": 6.8451025056947605,
      "grad_norm": 0.3828125,
      "learning_rate": 5.475333813586297e-05,
      "loss": 0.7496,
      "step": 3005
    },
    {
      "epoch": 6.856492027334852,
      "grad_norm": 0.392578125,
      "learning_rate": 5.439915176850037e-05,
      "loss": 0.7585,
      "step": 3010
    },
    {
      "epoch": 6.867881548974943,
      "grad_norm": 0.3671875,
      "learning_rate": 5.404568617336456e-05,
      "loss": 0.7485,
      "step": 3015
    },
    {
      "epoch": 6.879271070615034,
      "grad_norm": 0.39453125,
      "learning_rate": 5.369294693737319e-05,
      "loss": 0.755,
      "step": 3020
    },
    {
      "epoch": 6.890660592255125,
      "grad_norm": 0.50390625,
      "learning_rate": 5.334093963596294e-05,
      "loss": 0.7556,
      "step": 3025
    },
    {
      "epoch": 6.902050113895216,
      "grad_norm": 0.52734375,
      "learning_rate": 5.298966983300161e-05,
      "loss": 0.7474,
      "step": 3030
    },
    {
      "epoch": 6.913439635535307,
      "grad_norm": 0.396484375,
      "learning_rate": 5.263914308069986e-05,
      "loss": 0.7531,
      "step": 3035
    },
    {
      "epoch": 6.924829157175399,
      "grad_norm": 0.390625,
      "learning_rate": 5.228936491952363e-05,
      "loss": 0.7501,
      "step": 3040
    },
    {
      "epoch": 6.936218678815489,
      "grad_norm": 0.3671875,
      "learning_rate": 5.194034087810665e-05,
      "loss": 0.7469,
      "step": 3045
    },
    {
      "epoch": 6.947608200455581,
      "grad_norm": 0.392578125,
      "learning_rate": 5.159207647316282e-05,
      "loss": 0.7559,
      "step": 3050
    },
    {
      "epoch": 6.958997722095672,
      "grad_norm": 0.37890625,
      "learning_rate": 5.12445772093992e-05,
      "loss": 0.748,
      "step": 3055
    },
    {
      "epoch": 6.970387243735763,
      "grad_norm": 0.3671875,
      "learning_rate": 5.089784857942892e-05,
      "loss": 0.7525,
      "step": 3060
    },
    {
      "epoch": 6.981776765375854,
      "grad_norm": 0.353515625,
      "learning_rate": 5.055189606368436e-05,
      "loss": 0.7544,
      "step": 3065
    },
    {
      "epoch": 6.993166287015946,
      "grad_norm": 0.423828125,
      "learning_rate": 5.020672513033066e-05,
      "loss": 0.7459,
      "step": 3070
    },
    {
      "epoch": 7.0,
      "eval_loss": 2.445798397064209,
      "eval_runtime": 0.2348,
      "eval_samples_per_second": 42.591,
      "eval_steps_per_second": 4.259,
      "step": 3073
    },
    {
      "epoch": 7.004555808656036,
      "grad_norm": 0.4296875,
      "learning_rate": 4.9862341235179014e-05,
      "loss": 0.7506,
      "step": 3075
    },
    {
      "epoch": 7.015945330296128,
      "grad_norm": 0.466796875,
      "learning_rate": 4.951874982160079e-05,
      "loss": 0.7452,
      "step": 3080
    },
    {
      "epoch": 7.027334851936219,
      "grad_norm": 0.4296875,
      "learning_rate": 4.917595632044113e-05,
      "loss": 0.7421,
      "step": 3085
    },
    {
      "epoch": 7.03872437357631,
      "grad_norm": 0.373046875,
      "learning_rate": 4.8833966149933364e-05,
      "loss": 0.744,
      "step": 3090
    },
    {
      "epoch": 7.050113895216401,
      "grad_norm": 0.376953125,
      "learning_rate": 4.849278471561328e-05,
      "loss": 0.7418,
      "step": 3095
    },
    {
      "epoch": 7.061503416856492,
      "grad_norm": 0.373046875,
      "learning_rate": 4.815241741023367e-05,
      "loss": 0.7488,
      "step": 3100
    },
    {
      "epoch": 7.072892938496583,
      "grad_norm": 0.396484375,
      "learning_rate": 4.7812869613679103e-05,
      "loss": 0.7373,
      "step": 3105
    },
    {
      "epoch": 7.0842824601366745,
      "grad_norm": 0.37890625,
      "learning_rate": 4.747414669288094e-05,
      "loss": 0.7441,
      "step": 3110
    },
    {
      "epoch": 7.095671981776765,
      "grad_norm": 0.388671875,
      "learning_rate": 4.713625400173247e-05,
      "loss": 0.7439,
      "step": 3115
    },
    {
      "epoch": 7.1070615034168565,
      "grad_norm": 0.388671875,
      "learning_rate": 4.679919688100423e-05,
      "loss": 0.7471,
      "step": 3120
    },
    {
      "epoch": 7.118451025056948,
      "grad_norm": 0.427734375,
      "learning_rate": 4.6462980658259625e-05,
      "loss": 0.7476,
      "step": 3125
    },
    {
      "epoch": 7.1298405466970385,
      "grad_norm": 0.384765625,
      "learning_rate": 4.6127610647770767e-05,
      "loss": 0.7365,
      "step": 3130
    },
    {
      "epoch": 7.14123006833713,
      "grad_norm": 0.427734375,
      "learning_rate": 4.5793092150434405e-05,
      "loss": 0.7422,
      "step": 3135
    },
    {
      "epoch": 7.152619589977221,
      "grad_norm": 0.4375,
      "learning_rate": 4.545943045368826e-05,
      "loss": 0.7483,
      "step": 3140
    },
    {
      "epoch": 7.164009111617312,
      "grad_norm": 0.390625,
      "learning_rate": 4.5126630831427264e-05,
      "loss": 0.743,
      "step": 3145
    },
    {
      "epoch": 7.175398633257403,
      "grad_norm": 0.400390625,
      "learning_rate": 4.479469854392031e-05,
      "loss": 0.7414,
      "step": 3150
    },
    {
      "epoch": 7.186788154897494,
      "grad_norm": 0.443359375,
      "learning_rate": 4.4463638837727196e-05,
      "loss": 0.7382,
      "step": 3155
    },
    {
      "epoch": 7.198177676537585,
      "grad_norm": 0.416015625,
      "learning_rate": 4.413345694561549e-05,
      "loss": 0.7365,
      "step": 3160
    },
    {
      "epoch": 7.209567198177677,
      "grad_norm": 0.390625,
      "learning_rate": 4.3804158086477986e-05,
      "loss": 0.7412,
      "step": 3165
    },
    {
      "epoch": 7.220956719817767,
      "grad_norm": 0.375,
      "learning_rate": 4.34757474652501e-05,
      "loss": 0.7491,
      "step": 3170
    },
    {
      "epoch": 7.232346241457859,
      "grad_norm": 0.37890625,
      "learning_rate": 4.3148230272827784e-05,
      "loss": 0.7452,
      "step": 3175
    },
    {
      "epoch": 7.24373576309795,
      "grad_norm": 0.369140625,
      "learning_rate": 4.282161168598523e-05,
      "loss": 0.7496,
      "step": 3180
    },
    {
      "epoch": 7.255125284738041,
      "grad_norm": 0.375,
      "learning_rate": 4.249589686729319e-05,
      "loss": 0.7409,
      "step": 3185
    },
    {
      "epoch": 7.266514806378132,
      "grad_norm": 0.38671875,
      "learning_rate": 4.217109096503736e-05,
      "loss": 0.742,
      "step": 3190
    },
    {
      "epoch": 7.277904328018224,
      "grad_norm": 0.36328125,
      "learning_rate": 4.184719911313707e-05,
      "loss": 0.7367,
      "step": 3195
    },
    {
      "epoch": 7.289293849658314,
      "grad_norm": 0.369140625,
      "learning_rate": 4.152422643106396e-05,
      "loss": 0.7467,
      "step": 3200
    },
    {
      "epoch": 7.300683371298406,
      "grad_norm": 0.38671875,
      "learning_rate": 4.1202178023761195e-05,
      "loss": 0.7416,
      "step": 3205
    },
    {
      "epoch": 7.312072892938497,
      "grad_norm": 0.40234375,
      "learning_rate": 4.088105898156282e-05,
      "loss": 0.7483,
      "step": 3210
    },
    {
      "epoch": 7.323462414578588,
      "grad_norm": 0.388671875,
      "learning_rate": 4.0560874380113146e-05,
      "loss": 0.7444,
      "step": 3215
    },
    {
      "epoch": 7.334851936218679,
      "grad_norm": 0.373046875,
      "learning_rate": 4.024162928028663e-05,
      "loss": 0.7417,
      "step": 3220
    },
    {
      "epoch": 7.34624145785877,
      "grad_norm": 0.37109375,
      "learning_rate": 3.9923328728107856e-05,
      "loss": 0.743,
      "step": 3225
    },
    {
      "epoch": 7.357630979498861,
      "grad_norm": 0.447265625,
      "learning_rate": 3.960597775467177e-05,
      "loss": 0.7482,
      "step": 3230
    },
    {
      "epoch": 7.3690205011389525,
      "grad_norm": 0.404296875,
      "learning_rate": 3.928958137606421e-05,
      "loss": 0.7473,
      "step": 3235
    },
    {
      "epoch": 7.380410022779043,
      "grad_norm": 0.375,
      "learning_rate": 3.8974144593282534e-05,
      "loss": 0.7429,
      "step": 3240
    },
    {
      "epoch": 7.3917995444191344,
      "grad_norm": 0.376953125,
      "learning_rate": 3.865967239215667e-05,
      "loss": 0.7481,
      "step": 3245
    },
    {
      "epoch": 7.403189066059226,
      "grad_norm": 0.37109375,
      "learning_rate": 3.834616974327021e-05,
      "loss": 0.7445,
      "step": 3250
    },
    {
      "epoch": 7.414578587699316,
      "grad_norm": 0.3828125,
      "learning_rate": 3.80336416018819e-05,
      "loss": 0.7431,
      "step": 3255
    },
    {
      "epoch": 7.425968109339408,
      "grad_norm": 0.431640625,
      "learning_rate": 3.7722092907847305e-05,
      "loss": 0.7394,
      "step": 3260
    },
    {
      "epoch": 7.437357630979498,
      "grad_norm": 0.3828125,
      "learning_rate": 3.741152858554077e-05,
      "loss": 0.744,
      "step": 3265
    },
    {
      "epoch": 7.44874715261959,
      "grad_norm": 0.375,
      "learning_rate": 3.710195354377747e-05,
      "loss": 0.7408,
      "step": 3270
    },
    {
      "epoch": 7.460136674259681,
      "grad_norm": 0.3984375,
      "learning_rate": 3.679337267573597e-05,
      "loss": 0.7361,
      "step": 3275
    },
    {
      "epoch": 7.471526195899772,
      "grad_norm": 0.390625,
      "learning_rate": 3.648579085888085e-05,
      "loss": 0.7467,
      "step": 3280
    },
    {
      "epoch": 7.482915717539863,
      "grad_norm": 0.37109375,
      "learning_rate": 3.6179212954885477e-05,
      "loss": 0.738,
      "step": 3285
    },
    {
      "epoch": 7.494305239179955,
      "grad_norm": 0.375,
      "learning_rate": 3.587364380955529e-05,
      "loss": 0.7475,
      "step": 3290
    },
    {
      "epoch": 7.505694760820045,
      "grad_norm": 0.416015625,
      "learning_rate": 3.556908825275117e-05,
      "loss": 0.7434,
      "step": 3295
    },
    {
      "epoch": 7.517084282460137,
      "grad_norm": 0.373046875,
      "learning_rate": 3.526555109831311e-05,
      "loss": 0.7477,
      "step": 3300
    },
    {
      "epoch": 7.528473804100228,
      "grad_norm": 0.38671875,
      "learning_rate": 3.4963037143984087e-05,
      "loss": 0.7413,
      "step": 3305
    },
    {
      "epoch": 7.539863325740319,
      "grad_norm": 0.44921875,
      "learning_rate": 3.466155117133433e-05,
      "loss": 0.748,
      "step": 3310
    },
    {
      "epoch": 7.55125284738041,
      "grad_norm": 0.390625,
      "learning_rate": 3.436109794568565e-05,
      "loss": 0.7444,
      "step": 3315
    },
    {
      "epoch": 7.562642369020502,
      "grad_norm": 0.392578125,
      "learning_rate": 3.406168221603611e-05,
      "loss": 0.7387,
      "step": 3320
    },
    {
      "epoch": 7.574031890660592,
      "grad_norm": 0.376953125,
      "learning_rate": 3.3763308714984974e-05,
      "loss": 0.7436,
      "step": 3325
    },
    {
      "epoch": 7.585421412300684,
      "grad_norm": 0.43359375,
      "learning_rate": 3.3465982158657984e-05,
      "loss": 0.7413,
      "step": 3330
    },
    {
      "epoch": 7.596810933940774,
      "grad_norm": 0.376953125,
      "learning_rate": 3.3169707246632705e-05,
      "loss": 0.7423,
      "step": 3335
    },
    {
      "epoch": 7.6082004555808656,
      "grad_norm": 0.369140625,
      "learning_rate": 3.287448866186428e-05,
      "loss": 0.7392,
      "step": 3340
    },
    {
      "epoch": 7.619589977220957,
      "grad_norm": 0.37890625,
      "learning_rate": 3.258033107061153e-05,
      "loss": 0.7461,
      "step": 3345
    },
    {
      "epoch": 7.6309794988610475,
      "grad_norm": 0.404296875,
      "learning_rate": 3.228723912236291e-05,
      "loss": 0.7365,
      "step": 3350
    },
    {
      "epoch": 7.642369020501139,
      "grad_norm": 0.390625,
      "learning_rate": 3.199521744976342e-05,
      "loss": 0.7414,
      "step": 3355
    },
    {
      "epoch": 7.65375854214123,
      "grad_norm": 0.400390625,
      "learning_rate": 3.170427066854096e-05,
      "loss": 0.7548,
      "step": 3360
    },
    {
      "epoch": 7.665148063781321,
      "grad_norm": 0.365234375,
      "learning_rate": 3.141440337743369e-05,
      "loss": 0.739,
      "step": 3365
    },
    {
      "epoch": 7.676537585421412,
      "grad_norm": 0.384765625,
      "learning_rate": 3.1125620158117186e-05,
      "loss": 0.7505,
      "step": 3370
    },
    {
      "epoch": 7.687927107061503,
      "grad_norm": 0.375,
      "learning_rate": 3.0837925575132024e-05,
      "loss": 0.7487,
      "step": 3375
    },
    {
      "epoch": 7.699316628701594,
      "grad_norm": 0.447265625,
      "learning_rate": 3.055132417581179e-05,
      "loss": 0.7427,
      "step": 3380
    },
    {
      "epoch": 7.710706150341686,
      "grad_norm": 0.3984375,
      "learning_rate": 3.0265820490210973e-05,
      "loss": 0.7384,
      "step": 3385
    },
    {
      "epoch": 7.722095671981776,
      "grad_norm": 0.400390625,
      "learning_rate": 2.9981419031033498e-05,
      "loss": 0.7402,
      "step": 3390
    },
    {
      "epoch": 7.733485193621868,
      "grad_norm": 0.453125,
      "learning_rate": 2.9698124293561357e-05,
      "loss": 0.7485,
      "step": 3395
    },
    {
      "epoch": 7.744874715261959,
      "grad_norm": 0.421875,
      "learning_rate": 2.941594075558366e-05,
      "loss": 0.7505,
      "step": 3400
    },
    {
      "epoch": 7.75626423690205,
      "grad_norm": 0.376953125,
      "learning_rate": 2.913487287732565e-05,
      "loss": 0.7446,
      "step": 3405
    },
    {
      "epoch": 7.767653758542141,
      "grad_norm": 0.41015625,
      "learning_rate": 2.8854925101378438e-05,
      "loss": 0.7461,
      "step": 3410
    },
    {
      "epoch": 7.779043280182233,
      "grad_norm": 0.5078125,
      "learning_rate": 2.857610185262859e-05,
      "loss": 0.75,
      "step": 3415
    },
    {
      "epoch": 7.790432801822323,
      "grad_norm": 0.37890625,
      "learning_rate": 2.8298407538188288e-05,
      "loss": 0.7469,
      "step": 3420
    },
    {
      "epoch": 7.801822323462415,
      "grad_norm": 0.37890625,
      "learning_rate": 2.8021846547325635e-05,
      "loss": 0.7437,
      "step": 3425
    },
    {
      "epoch": 7.813211845102506,
      "grad_norm": 0.416015625,
      "learning_rate": 2.774642325139535e-05,
      "loss": 0.7408,
      "step": 3430
    },
    {
      "epoch": 7.824601366742597,
      "grad_norm": 0.380859375,
      "learning_rate": 2.7472142003769495e-05,
      "loss": 0.7431,
      "step": 3435
    },
    {
      "epoch": 7.835990888382688,
      "grad_norm": 0.375,
      "learning_rate": 2.7199007139768928e-05,
      "loss": 0.7475,
      "step": 3440
    },
    {
      "epoch": 7.8473804100227795,
      "grad_norm": 0.375,
      "learning_rate": 2.6927022976594607e-05,
      "loss": 0.7371,
      "step": 3445
    },
    {
      "epoch": 7.85876993166287,
      "grad_norm": 0.390625,
      "learning_rate": 2.665619381325929e-05,
      "loss": 0.7477,
      "step": 3450
    },
    {
      "epoch": 7.8701594533029615,
      "grad_norm": 0.46484375,
      "learning_rate": 2.638652393051976e-05,
      "loss": 0.7433,
      "step": 3455
    },
    {
      "epoch": 7.881548974943052,
      "grad_norm": 0.4140625,
      "learning_rate": 2.6118017590809017e-05,
      "loss": 0.7401,
      "step": 3460
    },
    {
      "epoch": 7.8929384965831435,
      "grad_norm": 0.3828125,
      "learning_rate": 2.5850679038169045e-05,
      "loss": 0.7415,
      "step": 3465
    },
    {
      "epoch": 7.904328018223235,
      "grad_norm": 0.388671875,
      "learning_rate": 2.5584512498183544e-05,
      "loss": 0.7309,
      "step": 3470
    },
    {
      "epoch": 7.9157175398633255,
      "grad_norm": 0.39453125,
      "learning_rate": 2.531952217791136e-05,
      "loss": 0.7422,
      "step": 3475
    },
    {
      "epoch": 7.927107061503417,
      "grad_norm": 0.380859375,
      "learning_rate": 2.505571226581984e-05,
      "loss": 0.7434,
      "step": 3480
    },
    {
      "epoch": 7.9384965831435075,
      "grad_norm": 0.3671875,
      "learning_rate": 2.4793086931718634e-05,
      "loss": 0.7451,
      "step": 3485
    },
    {
      "epoch": 7.949886104783599,
      "grad_norm": 0.37890625,
      "learning_rate": 2.4531650326693822e-05,
      "loss": 0.7455,
      "step": 3490
    },
    {
      "epoch": 7.96127562642369,
      "grad_norm": 0.4140625,
      "learning_rate": 2.4271406583042335e-05,
      "loss": 0.7393,
      "step": 3495
    },
    {
      "epoch": 7.972665148063781,
      "grad_norm": 0.373046875,
      "learning_rate": 2.401235981420653e-05,
      "loss": 0.7443,
      "step": 3500
    },
    {
      "epoch": 7.984054669703872,
      "grad_norm": 0.3828125,
      "learning_rate": 2.3754514114709304e-05,
      "loss": 0.7429,
      "step": 3505
    },
    {
      "epoch": 7.995444191343964,
      "grad_norm": 0.400390625,
      "learning_rate": 2.3497873560089322e-05,
      "loss": 0.745,
      "step": 3510
    },
    {
      "epoch": 8.0,
      "eval_loss": 2.460949420928955,
      "eval_runtime": 0.2436,
      "eval_samples_per_second": 41.058,
      "eval_steps_per_second": 4.106,
      "step": 3512
    },
    {
      "epoch": 8.006833712984054,
      "grad_norm": 0.37109375,
      "learning_rate": 2.3242442206836523e-05,
      "loss": 0.749,
      "step": 3515
    },
    {
      "epoch": 8.018223234624147,
      "grad_norm": 0.361328125,
      "learning_rate": 2.298822409232817e-05,
      "loss": 0.7467,
      "step": 3520
    },
    {
      "epoch": 8.029612756264237,
      "grad_norm": 0.41015625,
      "learning_rate": 2.2735223234764846e-05,
      "loss": 0.735,
      "step": 3525
    },
    {
      "epoch": 8.041002277904328,
      "grad_norm": 0.3984375,
      "learning_rate": 2.2483443633107058e-05,
      "loss": 0.7392,
      "step": 3530
    },
    {
      "epoch": 8.052391799544418,
      "grad_norm": 0.453125,
      "learning_rate": 2.2232889267012038e-05,
      "loss": 0.7377,
      "step": 3535
    },
    {
      "epoch": 8.06378132118451,
      "grad_norm": 0.384765625,
      "learning_rate": 2.1983564096770725e-05,
      "loss": 0.748,
      "step": 3540
    },
    {
      "epoch": 8.075170842824601,
      "grad_norm": 0.38671875,
      "learning_rate": 2.1735472063245354e-05,
      "loss": 0.7325,
      "step": 3545
    },
    {
      "epoch": 8.086560364464692,
      "grad_norm": 0.384765625,
      "learning_rate": 2.1488617087806982e-05,
      "loss": 0.7372,
      "step": 3550
    },
    {
      "epoch": 8.097949886104784,
      "grad_norm": 0.388671875,
      "learning_rate": 2.1243003072273582e-05,
      "loss": 0.7391,
      "step": 3555
    },
    {
      "epoch": 8.109339407744875,
      "grad_norm": 0.376953125,
      "learning_rate": 2.0998633898848442e-05,
      "loss": 0.7467,
      "step": 3560
    },
    {
      "epoch": 8.120728929384965,
      "grad_norm": 0.375,
      "learning_rate": 2.0755513430058672e-05,
      "loss": 0.7418,
      "step": 3565
    },
    {
      "epoch": 8.132118451025057,
      "grad_norm": 0.376953125,
      "learning_rate": 2.0513645508694225e-05,
      "loss": 0.7383,
      "step": 3570
    },
    {
      "epoch": 8.143507972665148,
      "grad_norm": 0.376953125,
      "learning_rate": 2.0273033957747134e-05,
      "loss": 0.7417,
      "step": 3575
    },
    {
      "epoch": 8.154897494305239,
      "grad_norm": 0.400390625,
      "learning_rate": 2.0033682580351144e-05,
      "loss": 0.7386,
      "step": 3580
    },
    {
      "epoch": 8.166287015945331,
      "grad_norm": 0.384765625,
      "learning_rate": 1.9795595159721524e-05,
      "loss": 0.7449,
      "step": 3585
    },
    {
      "epoch": 8.177676537585421,
      "grad_norm": 0.3984375,
      "learning_rate": 1.955877545909528e-05,
      "loss": 0.7429,
      "step": 3590
    },
    {
      "epoch": 8.189066059225512,
      "grad_norm": 0.3984375,
      "learning_rate": 1.932322722167168e-05,
      "loss": 0.7391,
      "step": 3595
    },
    {
      "epoch": 8.200455580865604,
      "grad_norm": 0.376953125,
      "learning_rate": 1.9088954170553198e-05,
      "loss": 0.7389,
      "step": 3600
    },
    {
      "epoch": 8.211845102505695,
      "grad_norm": 0.357421875,
      "learning_rate": 1.8855960008686446e-05,
      "loss": 0.7406,
      "step": 3605
    },
    {
      "epoch": 8.223234624145785,
      "grad_norm": 0.373046875,
      "learning_rate": 1.86242484188038e-05,
      "loss": 0.736,
      "step": 3610
    },
    {
      "epoch": 8.234624145785878,
      "grad_norm": 0.369140625,
      "learning_rate": 1.8393823063365223e-05,
      "loss": 0.7449,
      "step": 3615
    },
    {
      "epoch": 8.246013667425968,
      "grad_norm": 0.373046875,
      "learning_rate": 1.816468758450024e-05,
      "loss": 0.7346,
      "step": 3620
    },
    {
      "epoch": 8.257403189066059,
      "grad_norm": 0.380859375,
      "learning_rate": 1.7936845603950447e-05,
      "loss": 0.743,
      "step": 3625
    },
    {
      "epoch": 8.268792710706151,
      "grad_norm": 0.388671875,
      "learning_rate": 1.7710300723012262e-05,
      "loss": 0.7421,
      "step": 3630
    },
    {
      "epoch": 8.280182232346242,
      "grad_norm": 0.380859375,
      "learning_rate": 1.7485056522480004e-05,
      "loss": 0.7365,
      "step": 3635
    },
    {
      "epoch": 8.291571753986332,
      "grad_norm": 0.384765625,
      "learning_rate": 1.726111656258932e-05,
      "loss": 0.741,
      "step": 3640
    },
    {
      "epoch": 8.302961275626423,
      "grad_norm": 0.369140625,
      "learning_rate": 1.7038484382960796e-05,
      "loss": 0.736,
      "step": 3645
    },
    {
      "epoch": 8.314350797266515,
      "grad_norm": 0.3671875,
      "learning_rate": 1.6817163502544208e-05,
      "loss": 0.7342,
      "step": 3650
    },
    {
      "epoch": 8.325740318906606,
      "grad_norm": 0.38671875,
      "learning_rate": 1.6597157419562703e-05,
      "loss": 0.7331,
      "step": 3655
    },
    {
      "epoch": 8.337129840546696,
      "grad_norm": 0.376953125,
      "learning_rate": 1.6378469611457592e-05,
      "loss": 0.7375,
      "step": 3660
    },
    {
      "epoch": 8.348519362186789,
      "grad_norm": 0.384765625,
      "learning_rate": 1.6161103534833423e-05,
      "loss": 0.7431,
      "step": 3665
    },
    {
      "epoch": 8.35990888382688,
      "grad_norm": 0.37890625,
      "learning_rate": 1.594506262540324e-05,
      "loss": 0.7431,
      "step": 3670
    },
    {
      "epoch": 8.37129840546697,
      "grad_norm": 0.390625,
      "learning_rate": 1.5730350297934448e-05,
      "loss": 0.7392,
      "step": 3675
    },
    {
      "epoch": 8.382687927107062,
      "grad_norm": 0.39453125,
      "learning_rate": 1.5516969946194626e-05,
      "loss": 0.7355,
      "step": 3680
    },
    {
      "epoch": 8.394077448747153,
      "grad_norm": 0.37890625,
      "learning_rate": 1.5304924942898068e-05,
      "loss": 0.7388,
      "step": 3685
    },
    {
      "epoch": 8.405466970387243,
      "grad_norm": 0.373046875,
      "learning_rate": 1.509421863965237e-05,
      "loss": 0.7441,
      "step": 3690
    },
    {
      "epoch": 8.416856492027335,
      "grad_norm": 0.37890625,
      "learning_rate": 1.4884854366905455e-05,
      "loss": 0.7324,
      "step": 3695
    },
    {
      "epoch": 8.428246013667426,
      "grad_norm": 0.369140625,
      "learning_rate": 1.4676835433892989e-05,
      "loss": 0.7403,
      "step": 3700
    },
    {
      "epoch": 8.439635535307517,
      "grad_norm": 0.384765625,
      "learning_rate": 1.4470165128586022e-05,
      "loss": 0.7422,
      "step": 3705
    },
    {
      "epoch": 8.451025056947609,
      "grad_norm": 0.388671875,
      "learning_rate": 1.4264846717639102e-05,
      "loss": 0.7403,
      "step": 3710
    },
    {
      "epoch": 8.4624145785877,
      "grad_norm": 0.3828125,
      "learning_rate": 1.4060883446338502e-05,
      "loss": 0.7358,
      "step": 3715
    },
    {
      "epoch": 8.47380410022779,
      "grad_norm": 0.40234375,
      "learning_rate": 1.3858278538551018e-05,
      "loss": 0.7384,
      "step": 3720
    },
    {
      "epoch": 8.485193621867882,
      "grad_norm": 0.3828125,
      "learning_rate": 1.3657035196673052e-05,
      "loss": 0.7365,
      "step": 3725
    },
    {
      "epoch": 8.496583143507973,
      "grad_norm": 0.376953125,
      "learning_rate": 1.345715660157989e-05,
      "loss": 0.7389,
      "step": 3730
    },
    {
      "epoch": 8.507972665148063,
      "grad_norm": 0.369140625,
      "learning_rate": 1.3258645912575484e-05,
      "loss": 0.7459,
      "step": 3735
    },
    {
      "epoch": 8.519362186788156,
      "grad_norm": 0.37890625,
      "learning_rate": 1.3061506267342472e-05,
      "loss": 0.7482,
      "step": 3740
    },
|
{ |
|
"epoch": 8.530751708428246, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 1.2865740781892699e-05, |
|
"loss": 0.7441, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 8.542141230068337, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 1.2671352550517823e-05, |
|
"loss": 0.7379, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 8.55353075170843, |
|
"grad_norm": 0.375, |
|
"learning_rate": 1.2478344645740469e-05, |
|
"loss": 0.7386, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 8.56492027334852, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 1.2286720118265659e-05, |
|
"loss": 0.7435, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 8.57630979498861, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 1.209648199693264e-05, |
|
"loss": 0.7412, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 8.5876993166287, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 1.190763328866693e-05, |
|
"loss": 0.7414, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 8.599088838268793, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 1.1720176978432795e-05, |
|
"loss": 0.7393, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 8.610478359908884, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 1.1534116029186181e-05, |
|
"loss": 0.7333, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 8.621867881548974, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 1.1349453381827713e-05, |
|
"loss": 0.7345, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 8.633257403189067, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 1.1166191955156346e-05, |
|
"loss": 0.7531, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 8.644646924829157, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 1.0984334645823158e-05, |
|
"loss": 0.7359, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 8.656036446469248, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 1.0803884328285586e-05, |
|
"loss": 0.7441, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 8.66742596810934, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 1.0624843854762034e-05, |
|
"loss": 0.7353, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 8.67881548974943, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 1.0447216055186681e-05, |
|
"loss": 0.7407, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 8.690205011389521, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 1.0271003737164909e-05, |
|
"loss": 0.7372, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 8.701594533029613, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 1.009620968592876e-05, |
|
"loss": 0.7445, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 8.712984054669704, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 9.922836664293022e-06, |
|
"loss": 0.7362, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 8.724373576309794, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 9.750887412611508e-06, |
|
"loss": 0.7408, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 8.735763097949887, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 9.580364648733775e-06, |
|
"loss": 0.7347, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 8.747152619589977, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 9.411271067962124e-06, |
|
"loss": 0.738, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 8.758542141230068, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 9.243609343009086e-06, |
|
"loss": 0.7391, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 8.76993166287016, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 9.07738212395508e-06, |
|
"loss": 0.7405, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 8.78132118451025, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 8.912592038206546e-06, |
|
"loss": 0.7391, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 8.792710706150341, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 8.749241690454424e-06, |
|
"loss": 0.7367, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 8.804100227790432, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 8.587333662633035e-06, |
|
"loss": 0.7411, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 8.815489749430524, |
|
"grad_norm": 0.375, |
|
"learning_rate": 8.426870513879182e-06, |
|
"loss": 0.7365, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 8.826879271070615, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 8.267854780491747e-06, |
|
"loss": 0.7458, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 8.838268792710707, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 8.110288975891634e-06, |
|
"loss": 0.7326, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 8.849658314350798, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 7.954175590581992e-06, |
|
"loss": 0.7374, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 8.861047835990888, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 7.799517092108855e-06, |
|
"loss": 0.7345, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 8.872437357630979, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 7.646315925022152e-06, |
|
"loss": 0.7384, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 8.883826879271071, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 7.49457451083706e-06, |
|
"loss": 0.7319, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 8.895216400911162, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 7.344295247995725e-06, |
|
"loss": 0.7432, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 8.906605922551252, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 7.195480511829411e-06, |
|
"loss": 0.7392, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 8.917995444191344, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 7.048132654520856e-06, |
|
"loss": 0.7465, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 8.929384965831435, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 6.902254005067166e-06, |
|
"loss": 0.7387, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 8.940774487471526, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 6.7578468692429345e-06, |
|
"loss": 0.74, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 8.952164009111618, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 6.614913529563927e-06, |
|
"loss": 0.7346, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 8.963553530751708, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 6.4734562452508525e-06, |
|
"loss": 0.739, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 8.974943052391799, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 6.333477252193731e-06, |
|
"loss": 0.7418, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 8.986332574031891, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 6.19497876291657e-06, |
|
"loss": 0.7394, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 8.997722095671982, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 6.057962966542319e-06, |
|
"loss": 0.7433, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 2.4639732837677, |
|
"eval_runtime": 0.2359, |
|
"eval_samples_per_second": 42.387, |
|
"eval_steps_per_second": 4.239, |
|
"step": 3951 |
|
}, |
|
{ |
|
"epoch": 9.009111617312072, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 5.922432028758362e-06, |
|
"loss": 0.7355, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 9.020501138952165, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 5.788388091782204e-06, |
|
"loss": 0.743, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 9.031890660592255, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 5.655833274327638e-06, |
|
"loss": 0.7396, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 9.043280182232346, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 5.524769671571317e-06, |
|
"loss": 0.734, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 9.054669703872438, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 5.395199355119518e-06, |
|
"loss": 0.7406, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 9.066059225512529, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 5.267124372975518e-06, |
|
"loss": 0.7398, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 9.07744874715262, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 5.140546749507136e-06, |
|
"loss": 0.7425, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 9.08883826879271, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 5.0154684854147645e-06, |
|
"loss": 0.7335, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 9.100227790432802, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 4.891891557699779e-06, |
|
"loss": 0.7502, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 9.111617312072893, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 4.769817919633235e-06, |
|
"loss": 0.737, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 9.123006833712983, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 4.649249500725017e-06, |
|
"loss": 0.7357, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 9.134396355353076, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 4.530188206693375e-06, |
|
"loss": 0.7419, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 9.145785876993166, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 4.412635919434749e-06, |
|
"loss": 0.7442, |
|
"step": 4015 |
|
}, |
|
{ |
|
"epoch": 9.157175398633257, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 4.296594496994055e-06, |
|
"loss": 0.7385, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 9.168564920273349, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 4.182065773535271e-06, |
|
"loss": 0.7394, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 9.17995444191344, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 4.069051559312531e-06, |
|
"loss": 0.7384, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 9.19134396355353, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 3.957553640641442e-06, |
|
"loss": 0.7342, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 9.202733485193622, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 3.847573779870839e-06, |
|
"loss": 0.734, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 9.214123006833713, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 3.7391137153550137e-06, |
|
"loss": 0.742, |
|
"step": 4045 |
|
}, |
|
{ |
|
"epoch": 9.225512528473804, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 3.6321751614261767e-06, |
|
"loss": 0.7449, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 9.236902050113896, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 3.5267598083673304e-06, |
|
"loss": 0.7389, |
|
"step": 4055 |
|
}, |
|
{ |
|
"epoch": 9.248291571753986, |
|
"grad_norm": 0.375, |
|
"learning_rate": 3.4228693223856136e-06, |
|
"loss": 0.7377, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 9.259681093394077, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 3.320505345585945e-06, |
|
"loss": 0.7356, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 9.27107061503417, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 3.219669495945055e-06, |
|
"loss": 0.7484, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 9.28246013667426, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 3.120363367285917e-06, |
|
"loss": 0.7398, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 9.29384965831435, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 3.022588529252579e-06, |
|
"loss": 0.736, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 9.305239179954443, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 2.9263465272853173e-06, |
|
"loss": 0.7336, |
|
"step": 4085 |
|
}, |
|
{ |
|
"epoch": 9.316628701594533, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 2.8316388825962324e-06, |
|
"loss": 0.7402, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 9.328018223234624, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 2.738467092145214e-06, |
|
"loss": 0.7427, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 9.339407744874716, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 2.646832628616214e-06, |
|
"loss": 0.7395, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 9.350797266514807, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 2.5567369403940776e-06, |
|
"loss": 0.7432, |
|
"step": 4105 |
|
}, |
|
{ |
|
"epoch": 9.362186788154897, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 2.4681814515415404e-06, |
|
"loss": 0.7375, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 9.373576309794988, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 2.3811675617768204e-06, |
|
"loss": 0.7441, |
|
"step": 4115 |
|
}, |
|
{ |
|
"epoch": 9.38496583143508, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 2.2956966464514175e-06, |
|
"loss": 0.7369, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 9.39635535307517, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 2.2117700565283838e-06, |
|
"loss": 0.734, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 9.407744874715261, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 2.1293891185610204e-06, |
|
"loss": 0.7374, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 9.419134396355354, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 2.04855513467187e-06, |
|
"loss": 0.7387, |
|
"step": 4135 |
|
}, |
|
{ |
|
"epoch": 9.430523917995444, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 1.969269382532113e-06, |
|
"loss": 0.7385, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 9.441913439635535, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 1.8915331153414262e-06, |
|
"loss": 0.7313, |
|
"step": 4145 |
|
}, |
|
{ |
|
"epoch": 9.453302961275627, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 1.8153475618081673e-06, |
|
"loss": 0.7394, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 9.464692482915718, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 1.7407139261299e-06, |
|
"loss": 0.736, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 9.476082004555808, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 1.667633387974421e-06, |
|
"loss": 0.7403, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 9.4874715261959, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 1.5961071024610752e-06, |
|
"loss": 0.7478, |
|
"step": 4165 |
|
}, |
|
{ |
|
"epoch": 9.498861047835991, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 1.5261362001425138e-06, |
|
"loss": 0.7342, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 9.510250569476081, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 1.457721786986821e-06, |
|
"loss": 0.7485, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 9.521640091116174, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 1.3908649443600707e-06, |
|
"loss": 0.7392, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 9.533029612756264, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 1.3255667290091644e-06, |
|
"loss": 0.7362, |
|
"step": 4185 |
|
}, |
|
{ |
|
"epoch": 9.544419134396355, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 1.2618281730451432e-06, |
|
"loss": 0.7383, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 9.555808656036447, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 1.1996502839269453e-06, |
|
"loss": 0.7361, |
|
"step": 4195 |
|
}, |
|
{ |
|
"epoch": 9.567198177676538, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 1.139034044445375e-06, |
|
"loss": 0.7354, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 9.578587699316628, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 1.0799804127076707e-06, |
|
"loss": 0.7354, |
|
"step": 4205 |
|
}, |
|
{ |
|
"epoch": 9.589977220956719, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 1.0224903221222938e-06, |
|
"loss": 0.7408, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 9.601366742596811, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 9.665646813842077e-07, |
|
"loss": 0.7329, |
|
"step": 4215 |
|
}, |
|
{ |
|
"epoch": 9.612756264236902, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 9.12204374460468e-07, |
|
"loss": 0.729, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 9.624145785876994, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 8.59410260576321e-07, |
|
"loss": 0.734, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 9.635535307517085, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 8.081831742015822e-07, |
|
"loss": 0.7391, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 9.646924829157175, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 7.585239250374243e-07, |
|
"loss": 0.7468, |
|
"step": 4235 |
|
}, |
|
{ |
|
"epoch": 9.658314350797266, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 7.104332980036211e-07, |
|
"loss": 0.7347, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 9.669703872437358, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 6.639120532261456e-07, |
|
"loss": 0.7391, |
|
"step": 4245 |
|
}, |
|
{ |
|
"epoch": 9.681093394077449, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 6.189609260251139e-07, |
|
"loss": 0.742, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 9.69248291571754, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 5.755806269031827e-07, |
|
"loss": 0.7403, |
|
"step": 4255 |
|
}, |
|
{ |
|
"epoch": 9.703872437357631, |
|
"grad_norm": 0.375, |
|
"learning_rate": 5.337718415343362e-07, |
|
"loss": 0.7401, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 9.715261958997722, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 4.935352307530062e-07, |
|
"loss": 0.7366, |
|
"step": 4265 |
|
}, |
|
{ |
|
"epoch": 9.726651480637813, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 4.548714305436685e-07, |
|
"loss": 0.7305, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 9.738041002277905, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 4.1778105203078565e-07, |
|
"loss": 0.7419, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 9.749430523917995, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 3.822646814691244e-07, |
|
"loss": 0.735, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 9.760820045558086, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 3.483228802344973e-07, |
|
"loss": 0.7427, |
|
"step": 4285 |
|
}, |
|
{ |
|
"epoch": 9.772209567198178, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 3.159561848149029e-07, |
|
"loss": 0.738, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 9.783599088838269, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 2.8516510680203224e-07, |
|
"loss": 0.7443, |
|
"step": 4295 |
|
}, |
|
{ |
|
"epoch": 9.79498861047836, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 2.5595013288318703e-07, |
|
"loss": 0.7361, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 9.806378132118452, |
|
"grad_norm": 0.375, |
|
"learning_rate": 2.2831172483359643e-07, |
|
"loss": 0.7446, |
|
"step": 4305 |
|
}, |
|
{ |
|
"epoch": 9.817767653758542, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 2.0225031950910078e-07, |
|
"loss": 0.7427, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 9.829157175398633, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 1.7776632883924615e-07, |
|
"loss": 0.7394, |
|
"step": 4315 |
|
}, |
|
{ |
|
"epoch": 9.840546697038725, |
|
"grad_norm": 0.375, |
|
"learning_rate": 1.548601398208116e-07, |
|
"loss": 0.7364, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 9.851936218678816, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 1.3353211451161417e-07, |
|
"loss": 0.741, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 9.863325740318906, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 1.1378259002488013e-07, |
|
"loss": 0.733, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 9.874715261958997, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 9.561187852386022e-08, |
|
"loss": 0.742, |
|
"step": 4335 |
|
}, |
|
{ |
|
"epoch": 9.88610478359909, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 7.902026721687828e-08, |
|
"loss": 0.7417, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 9.89749430523918, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 6.400801835286796e-08, |
|
"loss": 0.7369, |
|
"step": 4345 |
|
}, |
|
{ |
|
"epoch": 9.90888382687927, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 5.05753692171318e-08, |
|
"loss": 0.7385, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 9.920273348519363, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 3.8722532127677404e-08, |
|
"loss": 0.737, |
|
"step": 4355 |
|
}, |
|
{ |
|
"epoch": 9.931662870159453, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 2.844969443178691e-08, |
|
"loss": 0.7355, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 9.943052391799544, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 1.9757018503119285e-08, |
|
"loss": 0.7433, |
|
"step": 4365 |
|
}, |
|
{ |
|
"epoch": 9.954441913439636, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 1.2644641739101292e-08, |
|
"loss": 0.7328, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 9.965831435079727, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 7.112676558784781e-09, |
|
"loss": 0.743, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 9.977220956719817, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 3.1612104010370068e-09, |
|
"loss": 0.7305, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 9.98861047835991, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 7.903057231750666e-10, |
|
"loss": 0.7394, |
|
"step": 4385 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.0, |
|
"loss": 0.7376, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 2.4680683612823486, |
|
"eval_runtime": 0.2341, |
|
"eval_samples_per_second": 42.714, |
|
"eval_steps_per_second": 4.271, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 4390, |
|
"total_flos": 1.3402520949471838e+19, |
|
"train_loss": 1.3662878394941533, |
|
"train_runtime": 10620.987, |
|
"train_samples_per_second": 26.444, |
|
"train_steps_per_second": 0.413 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 4390, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"total_flos": 1.3402520949471838e+19, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|