|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.99867197875166, |
|
"eval_steps": 94, |
|
"global_step": 94, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.010624169986719787, |
|
"grad_norm": 2.473472145582703, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.4101, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.010624169986719787, |
|
"eval_loss": 1.6489976644515991, |
|
"eval_runtime": 38.1546, |
|
"eval_samples_per_second": 63.033, |
|
"eval_steps_per_second": 1.992, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.021248339973439574, |
|
"grad_norm": 2.473600838217233, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4273, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.03187250996015936, |
|
"grad_norm": 2.5043053838666367, |
|
"learning_rate": 7.5e-06, |
|
"loss": 0.4244, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.04249667994687915, |
|
"grad_norm": 2.4185276106954317, |
|
"learning_rate": 1e-05, |
|
"loss": 0.4166, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.05312084993359894, |
|
"grad_norm": 2.101313294150849, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.3973, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.06374501992031872, |
|
"grad_norm": 2.32474051391718, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.364, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.07436918990703852, |
|
"grad_norm": 2.1736886206661663, |
|
"learning_rate": 1.75e-05, |
|
"loss": 0.3353, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0849933598937583, |
|
"grad_norm": 1.7213633107027813, |
|
"learning_rate": 2e-05, |
|
"loss": 0.3195, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.09561752988047809, |
|
"grad_norm": 3.2596713587035007, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.3083, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.10624169986719788, |
|
"grad_norm": 7.208305473873824, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.3097, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11686586985391766, |
|
"grad_norm": 2.0792264874760193, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 0.2968, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.12749003984063745, |
|
"grad_norm": 7.712252689421062, |
|
"learning_rate": 3e-05, |
|
"loss": 0.2859, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.13811420982735723, |
|
"grad_norm": 0.9497642120720764, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 0.2974, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.14873837981407703, |
|
"grad_norm": 0.9147721663929693, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.2803, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.1593625498007968, |
|
"grad_norm": 0.4984711347806819, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.2721, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1699867197875166, |
|
"grad_norm": 0.471047244087332, |
|
"learning_rate": 4e-05, |
|
"loss": 0.262, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.1806108897742364, |
|
"grad_norm": 0.472154205638306, |
|
"learning_rate": 4.25e-05, |
|
"loss": 0.2638, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.19123505976095617, |
|
"grad_norm": 0.4625092675068189, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.2591, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.20185922974767595, |
|
"grad_norm": 0.4996392904070069, |
|
"learning_rate": 4.75e-05, |
|
"loss": 0.2571, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.21248339973439576, |
|
"grad_norm": 1.0428146077253466, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2487, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.22310756972111553, |
|
"grad_norm": 0.45426198900774195, |
|
"learning_rate": 4.9809160305343514e-05, |
|
"loss": 0.2523, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.2337317397078353, |
|
"grad_norm": 0.3580171632863205, |
|
"learning_rate": 4.9618320610687025e-05, |
|
"loss": 0.2435, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.24435590969455512, |
|
"grad_norm": 0.3537567495590767, |
|
"learning_rate": 4.9427480916030536e-05, |
|
"loss": 0.2411, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.2549800796812749, |
|
"grad_norm": 0.3233498171110322, |
|
"learning_rate": 4.923664122137405e-05, |
|
"loss": 0.2319, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.2656042496679947, |
|
"grad_norm": 0.26847478370421396, |
|
"learning_rate": 4.904580152671756e-05, |
|
"loss": 0.2401, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.27622841965471445, |
|
"grad_norm": 0.22622668691998796, |
|
"learning_rate": 4.885496183206107e-05, |
|
"loss": 0.2339, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.2868525896414343, |
|
"grad_norm": 0.17447662466704217, |
|
"learning_rate": 4.866412213740458e-05, |
|
"loss": 0.2334, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.29747675962815406, |
|
"grad_norm": 0.1406444973947631, |
|
"learning_rate": 4.847328244274809e-05, |
|
"loss": 0.2325, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.30810092961487384, |
|
"grad_norm": 0.15551008923115225, |
|
"learning_rate": 4.82824427480916e-05, |
|
"loss": 0.2312, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.3187250996015936, |
|
"grad_norm": 0.1363375689910322, |
|
"learning_rate": 4.809160305343512e-05, |
|
"loss": 0.231, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.3293492695883134, |
|
"grad_norm": 0.1701602959944274, |
|
"learning_rate": 4.7900763358778626e-05, |
|
"loss": 0.2293, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.3399734395750332, |
|
"grad_norm": 0.13622900372630573, |
|
"learning_rate": 4.7709923664122144e-05, |
|
"loss": 0.2248, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.350597609561753, |
|
"grad_norm": 0.15128152543667112, |
|
"learning_rate": 4.751908396946565e-05, |
|
"loss": 0.2306, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.3612217795484728, |
|
"grad_norm": 0.15811875642520323, |
|
"learning_rate": 4.7328244274809166e-05, |
|
"loss": 0.2321, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.37184594953519257, |
|
"grad_norm": 0.12645178389747586, |
|
"learning_rate": 4.713740458015267e-05, |
|
"loss": 0.2208, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.38247011952191234, |
|
"grad_norm": 0.12862564367197982, |
|
"learning_rate": 4.694656488549619e-05, |
|
"loss": 0.223, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.3930942895086321, |
|
"grad_norm": 0.1369002410151976, |
|
"learning_rate": 4.675572519083969e-05, |
|
"loss": 0.2271, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.4037184594953519, |
|
"grad_norm": 0.14767255168536392, |
|
"learning_rate": 4.656488549618321e-05, |
|
"loss": 0.2252, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.41434262948207173, |
|
"grad_norm": 0.13483775474592635, |
|
"learning_rate": 4.637404580152672e-05, |
|
"loss": 0.2256, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.4249667994687915, |
|
"grad_norm": 0.14826458237724738, |
|
"learning_rate": 4.618320610687023e-05, |
|
"loss": 0.2191, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.4355909694555113, |
|
"grad_norm": 0.12640771132223902, |
|
"learning_rate": 4.5992366412213745e-05, |
|
"loss": 0.2222, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.44621513944223107, |
|
"grad_norm": 0.12735423736878446, |
|
"learning_rate": 4.5801526717557256e-05, |
|
"loss": 0.215, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.45683930942895085, |
|
"grad_norm": 0.12856119123995918, |
|
"learning_rate": 4.561068702290077e-05, |
|
"loss": 0.2229, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.4674634794156706, |
|
"grad_norm": 0.10294046565924951, |
|
"learning_rate": 4.541984732824428e-05, |
|
"loss": 0.2176, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.47808764940239046, |
|
"grad_norm": 0.15050659721187953, |
|
"learning_rate": 4.522900763358779e-05, |
|
"loss": 0.221, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.48871181938911024, |
|
"grad_norm": 0.11127436955612696, |
|
"learning_rate": 4.5038167938931294e-05, |
|
"loss": 0.2202, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.49933598937583, |
|
"grad_norm": 0.1369522376895646, |
|
"learning_rate": 4.484732824427481e-05, |
|
"loss": 0.2133, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.5099601593625498, |
|
"grad_norm": 0.13836598803647265, |
|
"learning_rate": 4.465648854961832e-05, |
|
"loss": 0.2155, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.5205843293492696, |
|
"grad_norm": 0.11643639323464662, |
|
"learning_rate": 4.4465648854961834e-05, |
|
"loss": 0.2152, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.5312084993359893, |
|
"grad_norm": 0.13132395588523518, |
|
"learning_rate": 4.4274809160305345e-05, |
|
"loss": 0.2107, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5418326693227091, |
|
"grad_norm": 0.11238359379346577, |
|
"learning_rate": 4.408396946564886e-05, |
|
"loss": 0.2137, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.5524568393094289, |
|
"grad_norm": 0.17138814733946134, |
|
"learning_rate": 4.389312977099237e-05, |
|
"loss": 0.2154, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.5630810092961488, |
|
"grad_norm": 0.1092235798281431, |
|
"learning_rate": 4.370229007633588e-05, |
|
"loss": 0.2112, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.5737051792828686, |
|
"grad_norm": 0.11289137255732332, |
|
"learning_rate": 4.351145038167939e-05, |
|
"loss": 0.2122, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.5843293492695883, |
|
"grad_norm": 0.1025053581338601, |
|
"learning_rate": 4.332061068702291e-05, |
|
"loss": 0.2168, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5949535192563081, |
|
"grad_norm": 0.11102992105384238, |
|
"learning_rate": 4.312977099236641e-05, |
|
"loss": 0.2135, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.6055776892430279, |
|
"grad_norm": 0.12567758664033069, |
|
"learning_rate": 4.293893129770993e-05, |
|
"loss": 0.211, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.6162018592297477, |
|
"grad_norm": 0.11349920644602123, |
|
"learning_rate": 4.2748091603053435e-05, |
|
"loss": 0.2115, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.6268260292164675, |
|
"grad_norm": 0.12916314836096926, |
|
"learning_rate": 4.255725190839695e-05, |
|
"loss": 0.2208, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.6374501992031872, |
|
"grad_norm": 0.09752976352934666, |
|
"learning_rate": 4.236641221374046e-05, |
|
"loss": 0.2082, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.648074369189907, |
|
"grad_norm": 0.13511089875393836, |
|
"learning_rate": 4.2175572519083975e-05, |
|
"loss": 0.2081, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.6586985391766268, |
|
"grad_norm": 0.10709804722840903, |
|
"learning_rate": 4.198473282442748e-05, |
|
"loss": 0.2089, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.6693227091633466, |
|
"grad_norm": 0.11164046629150956, |
|
"learning_rate": 4.1793893129771e-05, |
|
"loss": 0.212, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.6799468791500664, |
|
"grad_norm": 0.11504623760720377, |
|
"learning_rate": 4.160305343511451e-05, |
|
"loss": 0.2104, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.6905710491367862, |
|
"grad_norm": 0.1056951553011598, |
|
"learning_rate": 4.1412213740458014e-05, |
|
"loss": 0.2095, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.701195219123506, |
|
"grad_norm": 0.11902910166095262, |
|
"learning_rate": 4.122137404580153e-05, |
|
"loss": 0.21, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.7118193891102258, |
|
"grad_norm": 0.11888528499779406, |
|
"learning_rate": 4.1030534351145036e-05, |
|
"loss": 0.2074, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.7224435590969456, |
|
"grad_norm": 0.11792907862935477, |
|
"learning_rate": 4.0839694656488554e-05, |
|
"loss": 0.2074, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.7330677290836654, |
|
"grad_norm": 0.12123246786138824, |
|
"learning_rate": 4.064885496183206e-05, |
|
"loss": 0.2117, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.7436918990703851, |
|
"grad_norm": 0.11548221744093704, |
|
"learning_rate": 4.0458015267175576e-05, |
|
"loss": 0.2152, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.7543160690571049, |
|
"grad_norm": 0.11744546618963372, |
|
"learning_rate": 4.026717557251908e-05, |
|
"loss": 0.2102, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.7649402390438247, |
|
"grad_norm": 0.1129425013506569, |
|
"learning_rate": 4.00763358778626e-05, |
|
"loss": 0.2084, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.7755644090305445, |
|
"grad_norm": 0.11061564445479172, |
|
"learning_rate": 3.988549618320611e-05, |
|
"loss": 0.2077, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.7861885790172642, |
|
"grad_norm": 0.12747716190819824, |
|
"learning_rate": 3.969465648854962e-05, |
|
"loss": 0.2065, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.796812749003984, |
|
"grad_norm": 0.11144908828621163, |
|
"learning_rate": 3.950381679389313e-05, |
|
"loss": 0.2106, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.8074369189907038, |
|
"grad_norm": 0.10477648755637319, |
|
"learning_rate": 3.9312977099236644e-05, |
|
"loss": 0.2126, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.8180610889774237, |
|
"grad_norm": 0.12819412448665685, |
|
"learning_rate": 3.9122137404580155e-05, |
|
"loss": 0.2079, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.8286852589641435, |
|
"grad_norm": 0.11200327130647701, |
|
"learning_rate": 3.8931297709923666e-05, |
|
"loss": 0.2092, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.8393094289508632, |
|
"grad_norm": 0.10297552490298816, |
|
"learning_rate": 3.874045801526718e-05, |
|
"loss": 0.2024, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.849933598937583, |
|
"grad_norm": 0.11622087800076612, |
|
"learning_rate": 3.854961832061069e-05, |
|
"loss": 0.2041, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.8605577689243028, |
|
"grad_norm": 0.12063464037416519, |
|
"learning_rate": 3.83587786259542e-05, |
|
"loss": 0.2139, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.8711819389110226, |
|
"grad_norm": 0.10545414252113708, |
|
"learning_rate": 3.816793893129771e-05, |
|
"loss": 0.207, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.8818061088977424, |
|
"grad_norm": 0.11873399276296229, |
|
"learning_rate": 3.797709923664122e-05, |
|
"loss": 0.2114, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.8924302788844621, |
|
"grad_norm": 0.12163297981289363, |
|
"learning_rate": 3.778625954198473e-05, |
|
"loss": 0.2088, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.9030544488711819, |
|
"grad_norm": 0.11583886378919572, |
|
"learning_rate": 3.7595419847328244e-05, |
|
"loss": 0.2097, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.9136786188579017, |
|
"grad_norm": 0.11870676027662425, |
|
"learning_rate": 3.7404580152671756e-05, |
|
"loss": 0.1982, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.9243027888446215, |
|
"grad_norm": 0.1188545663258694, |
|
"learning_rate": 3.721374045801527e-05, |
|
"loss": 0.2006, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.9349269588313412, |
|
"grad_norm": 0.11078800000682179, |
|
"learning_rate": 3.702290076335878e-05, |
|
"loss": 0.2011, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.9455511288180611, |
|
"grad_norm": 0.11154286597122227, |
|
"learning_rate": 3.683206106870229e-05, |
|
"loss": 0.2011, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.9561752988047809, |
|
"grad_norm": 0.10658131888410093, |
|
"learning_rate": 3.66412213740458e-05, |
|
"loss": 0.2048, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.9667994687915007, |
|
"grad_norm": 0.09993801688648761, |
|
"learning_rate": 3.645038167938932e-05, |
|
"loss": 0.2049, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.9774236387782205, |
|
"grad_norm": 0.11005954464067705, |
|
"learning_rate": 3.625954198473282e-05, |
|
"loss": 0.208, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.9880478087649402, |
|
"grad_norm": 0.10245543326903217, |
|
"learning_rate": 3.606870229007634e-05, |
|
"loss": 0.204, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.99867197875166, |
|
"grad_norm": 0.1163495155338574, |
|
"learning_rate": 3.5877862595419845e-05, |
|
"loss": 0.2037, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.99867197875166, |
|
"eval_loss": 1.6727522611618042, |
|
"eval_runtime": 38.3479, |
|
"eval_samples_per_second": 62.715, |
|
"eval_steps_per_second": 1.982, |
|
"step": 94 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 282, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 94, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.8221787935132877e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|