|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.991735537190083, |
|
"eval_steps": 500, |
|
"global_step": 980, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02542911633820725, |
|
"grad_norm": 0.7163803577423096, |
|
"learning_rate": 4.9996788644998285e-05, |
|
"loss": 1.6242, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0508582326764145, |
|
"grad_norm": 0.5585092306137085, |
|
"learning_rate": 4.99871554050172e-05, |
|
"loss": 1.4834, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07628734901462174, |
|
"grad_norm": 0.3108583688735962, |
|
"learning_rate": 4.997110275491702e-05, |
|
"loss": 1.3901, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.101716465352829, |
|
"grad_norm": 0.20925107598304749, |
|
"learning_rate": 4.994863481875841e-05, |
|
"loss": 1.2635, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.12714558169103624, |
|
"grad_norm": 0.2436874657869339, |
|
"learning_rate": 4.991975736874289e-05, |
|
"loss": 1.2457, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.15257469802924348, |
|
"grad_norm": 0.19612200558185577, |
|
"learning_rate": 4.9884477823729956e-05, |
|
"loss": 1.2602, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.17800381436745072, |
|
"grad_norm": 0.1591530740261078, |
|
"learning_rate": 4.984280524733107e-05, |
|
"loss": 1.2238, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.203432930705658, |
|
"grad_norm": 0.15396293997764587, |
|
"learning_rate": 4.979475034558115e-05, |
|
"loss": 1.2456, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.22886204704386523, |
|
"grad_norm": 0.17088189721107483, |
|
"learning_rate": 4.9740325464188154e-05, |
|
"loss": 1.2554, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.25429116338207247, |
|
"grad_norm": 0.14276106655597687, |
|
"learning_rate": 4.967954458536126e-05, |
|
"loss": 1.248, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.27972027972027974, |
|
"grad_norm": 0.13971547782421112, |
|
"learning_rate": 4.961242332421882e-05, |
|
"loss": 1.1816, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.30514939605848695, |
|
"grad_norm": 0.14659512042999268, |
|
"learning_rate": 4.9538978924776634e-05, |
|
"loss": 1.1962, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3305785123966942, |
|
"grad_norm": 0.1615353226661682, |
|
"learning_rate": 4.945923025551789e-05, |
|
"loss": 1.1989, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.35600762873490144, |
|
"grad_norm": 0.1626625955104828, |
|
"learning_rate": 4.937319780454559e-05, |
|
"loss": 1.2023, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.3814367450731087, |
|
"grad_norm": 0.1588200479745865, |
|
"learning_rate": 4.92809036743191e-05, |
|
"loss": 1.2328, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.406865861411316, |
|
"grad_norm": 0.1648840457201004, |
|
"learning_rate": 4.9182371575975736e-05, |
|
"loss": 1.1974, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.4322949777495232, |
|
"grad_norm": 0.16755953431129456, |
|
"learning_rate": 4.907762682323927e-05, |
|
"loss": 1.1876, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.45772409408773046, |
|
"grad_norm": 0.19065892696380615, |
|
"learning_rate": 4.8966696325916515e-05, |
|
"loss": 1.2396, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.4831532104259377, |
|
"grad_norm": 0.18359169363975525, |
|
"learning_rate": 4.8849608582984096e-05, |
|
"loss": 1.1743, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.5085823267641449, |
|
"grad_norm": 0.18455757200717926, |
|
"learning_rate": 4.8726393675266716e-05, |
|
"loss": 1.1566, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5340114431023522, |
|
"grad_norm": 0.18412873148918152, |
|
"learning_rate": 4.8597083257709194e-05, |
|
"loss": 1.1998, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.5594405594405595, |
|
"grad_norm": 0.2001655399799347, |
|
"learning_rate": 4.846171055124401e-05, |
|
"loss": 1.172, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.5848696757787667, |
|
"grad_norm": 0.1914481222629547, |
|
"learning_rate": 4.832031033425662e-05, |
|
"loss": 1.1514, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.6102987921169739, |
|
"grad_norm": 0.19641566276550293, |
|
"learning_rate": 4.817291893365055e-05, |
|
"loss": 1.1887, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.6357279084551812, |
|
"grad_norm": 0.20107164978981018, |
|
"learning_rate": 4.8019574215514705e-05, |
|
"loss": 1.1932, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.6611570247933884, |
|
"grad_norm": 0.21638870239257812, |
|
"learning_rate": 4.7860315575395316e-05, |
|
"loss": 1.166, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.6865861411315957, |
|
"grad_norm": 0.22959065437316895, |
|
"learning_rate": 4.7695183928174804e-05, |
|
"loss": 1.1645, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.7120152574698029, |
|
"grad_norm": 0.2240159660577774, |
|
"learning_rate": 4.752422169756048e-05, |
|
"loss": 1.167, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.7374443738080102, |
|
"grad_norm": 0.22989672422409058, |
|
"learning_rate": 4.734747280518549e-05, |
|
"loss": 1.187, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.7628734901462174, |
|
"grad_norm": 0.22513297200202942, |
|
"learning_rate": 4.716498265932501e-05, |
|
"loss": 1.18, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.7883026064844246, |
|
"grad_norm": 0.23522530496120453, |
|
"learning_rate": 4.6976798143230434e-05, |
|
"loss": 1.1532, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.813731722822632, |
|
"grad_norm": 0.2377459853887558, |
|
"learning_rate": 4.678296760308474e-05, |
|
"loss": 1.1726, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.8391608391608392, |
|
"grad_norm": 0.25356563925743103, |
|
"learning_rate": 4.658354083558188e-05, |
|
"loss": 1.2446, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.8645899554990464, |
|
"grad_norm": 0.24188080430030823, |
|
"learning_rate": 4.637856907513366e-05, |
|
"loss": 1.1905, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.8900190718372537, |
|
"grad_norm": 0.2587025761604309, |
|
"learning_rate": 4.6168104980707107e-05, |
|
"loss": 1.1594, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.9154481881754609, |
|
"grad_norm": 0.25419944524765015, |
|
"learning_rate": 4.595220262229601e-05, |
|
"loss": 1.1709, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.9408773045136681, |
|
"grad_norm": 0.2684890031814575, |
|
"learning_rate": 4.573091746702988e-05, |
|
"loss": 1.1936, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.9663064208518753, |
|
"grad_norm": 0.2676820755004883, |
|
"learning_rate": 4.55043063649239e-05, |
|
"loss": 1.1692, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.9917355371900827, |
|
"grad_norm": 0.27281874418258667, |
|
"learning_rate": 4.5272427534273774e-05, |
|
"loss": 1.1812, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.0190718372536554, |
|
"grad_norm": 0.26799142360687256, |
|
"learning_rate": 4.503534054669892e-05, |
|
"loss": 1.3447, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0445009535918626, |
|
"grad_norm": 0.29342585802078247, |
|
"learning_rate": 4.4793106311838e-05, |
|
"loss": 1.1098, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.06993006993007, |
|
"grad_norm": 0.29103586077690125, |
|
"learning_rate": 4.454578706170075e-05, |
|
"loss": 1.1913, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.0953591862682772, |
|
"grad_norm": 0.28619349002838135, |
|
"learning_rate": 4.429344633468004e-05, |
|
"loss": 1.1936, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.1207883026064844, |
|
"grad_norm": 0.3098839521408081, |
|
"learning_rate": 4.4036148959228365e-05, |
|
"loss": 1.1187, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.1462174189446916, |
|
"grad_norm": 0.30148303508758545, |
|
"learning_rate": 4.377396103720278e-05, |
|
"loss": 1.157, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.1716465352828989, |
|
"grad_norm": 0.31909579038619995, |
|
"learning_rate": 4.350694992688289e-05, |
|
"loss": 1.1618, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.197075651621106, |
|
"grad_norm": 0.327538400888443, |
|
"learning_rate": 4.323518422566586e-05, |
|
"loss": 1.1245, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.2225047679593135, |
|
"grad_norm": 0.334259033203125, |
|
"learning_rate": 4.2958733752443195e-05, |
|
"loss": 1.1108, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.2479338842975207, |
|
"grad_norm": 0.3590494692325592, |
|
"learning_rate": 4.267766952966369e-05, |
|
"loss": 1.1006, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.273363000635728, |
|
"grad_norm": 0.32646894454956055, |
|
"learning_rate": 4.239206376508717e-05, |
|
"loss": 1.1424, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.2987921169739352, |
|
"grad_norm": 0.3532828688621521, |
|
"learning_rate": 4.210198983323366e-05, |
|
"loss": 1.141, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.3242212333121424, |
|
"grad_norm": 0.33540236949920654, |
|
"learning_rate": 4.180752225653292e-05, |
|
"loss": 1.0976, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.3496503496503496, |
|
"grad_norm": 0.3587055504322052, |
|
"learning_rate": 4.150873668617898e-05, |
|
"loss": 1.1019, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.375079465988557, |
|
"grad_norm": 0.35621777176856995, |
|
"learning_rate": 4.120570988269472e-05, |
|
"loss": 1.1075, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.400508582326764, |
|
"grad_norm": 0.3648779094219208, |
|
"learning_rate": 4.089851969621138e-05, |
|
"loss": 1.1422, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.4259376986649714, |
|
"grad_norm": 0.3523823916912079, |
|
"learning_rate": 4.058724504646834e-05, |
|
"loss": 1.115, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.4513668150031787, |
|
"grad_norm": 0.36515510082244873, |
|
"learning_rate": 4.027196590253786e-05, |
|
"loss": 1.1293, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.4767959313413859, |
|
"grad_norm": 0.37401697039604187, |
|
"learning_rate": 3.9952763262280405e-05, |
|
"loss": 1.1186, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.502225047679593, |
|
"grad_norm": 0.3709016442298889, |
|
"learning_rate": 3.9629719131535594e-05, |
|
"loss": 1.149, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.5276541640178003, |
|
"grad_norm": 0.38189002871513367, |
|
"learning_rate": 3.9302916503054246e-05, |
|
"loss": 1.1267, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.5530832803560077, |
|
"grad_norm": 0.36769306659698486, |
|
"learning_rate": 3.897243933517679e-05, |
|
"loss": 1.108, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.5785123966942147, |
|
"grad_norm": 0.398843377828598, |
|
"learning_rate": 3.8638372530263715e-05, |
|
"loss": 1.1324, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.6039415130324222, |
|
"grad_norm": 0.37085267901420593, |
|
"learning_rate": 3.830080191288342e-05, |
|
"loss": 1.1051, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.6293706293706294, |
|
"grad_norm": 0.39182424545288086, |
|
"learning_rate": 3.7959814207763135e-05, |
|
"loss": 1.1622, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.6547997457088366, |
|
"grad_norm": 0.3733777105808258, |
|
"learning_rate": 3.761549701750865e-05, |
|
"loss": 1.1146, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.680228862047044, |
|
"grad_norm": 0.3874206840991974, |
|
"learning_rate": 3.726793880009845e-05, |
|
"loss": 1.0812, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.705657978385251, |
|
"grad_norm": 0.4049334228038788, |
|
"learning_rate": 3.6917228846158134e-05, |
|
"loss": 1.1379, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.7310870947234585, |
|
"grad_norm": 0.4189164340496063, |
|
"learning_rate": 3.656345725602089e-05, |
|
"loss": 1.1183, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.7565162110616654, |
|
"grad_norm": 0.38738593459129333, |
|
"learning_rate": 3.620671491657993e-05, |
|
"loss": 1.1094, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.7819453273998729, |
|
"grad_norm": 0.40121838450431824, |
|
"learning_rate": 3.5847093477938956e-05, |
|
"loss": 1.1048, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.80737444373808, |
|
"grad_norm": 0.42995157837867737, |
|
"learning_rate": 3.5484685329866425e-05, |
|
"loss": 1.1209, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.8328035600762873, |
|
"grad_norm": 0.4180549085140228, |
|
"learning_rate": 3.5119583578059846e-05, |
|
"loss": 1.1284, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.8582326764144947, |
|
"grad_norm": 0.40642648935317993, |
|
"learning_rate": 3.475188202022617e-05, |
|
"loss": 1.1239, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.8836617927527017, |
|
"grad_norm": 0.4026438593864441, |
|
"learning_rate": 3.438167512198436e-05, |
|
"loss": 1.1068, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.9090909090909092, |
|
"grad_norm": 0.4229359030723572, |
|
"learning_rate": 3.400905799259634e-05, |
|
"loss": 1.1046, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.9345200254291164, |
|
"grad_norm": 0.4220713973045349, |
|
"learning_rate": 3.363412636053269e-05, |
|
"loss": 1.1127, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.9599491417673236, |
|
"grad_norm": 0.40782222151756287, |
|
"learning_rate": 3.3256976548879184e-05, |
|
"loss": 1.117, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.9853782581055308, |
|
"grad_norm": 0.43011417984962463, |
|
"learning_rate": 3.2877705450590526e-05, |
|
"loss": 1.1105, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.0127145581691037, |
|
"grad_norm": 0.4370057284832001, |
|
"learning_rate": 3.249641050359779e-05, |
|
"loss": 1.2868, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.0381436745073107, |
|
"grad_norm": 0.42571040987968445, |
|
"learning_rate": 3.211318966577581e-05, |
|
"loss": 1.082, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.063572790845518, |
|
"grad_norm": 0.45124417543411255, |
|
"learning_rate": 3.172814138977692e-05, |
|
"loss": 1.0788, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.089001907183725, |
|
"grad_norm": 0.4748118817806244, |
|
"learning_rate": 3.1341364597737686e-05, |
|
"loss": 1.0658, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.1144310235219326, |
|
"grad_norm": 0.44493603706359863, |
|
"learning_rate": 3.0952958655864955e-05, |
|
"loss": 1.0914, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.13986013986014, |
|
"grad_norm": 0.4867158532142639, |
|
"learning_rate": 3.056302334890786e-05, |
|
"loss": 1.0719, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.165289256198347, |
|
"grad_norm": 0.494890034198761, |
|
"learning_rate": 3.0171658854522273e-05, |
|
"loss": 1.0929, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.1907183725365544, |
|
"grad_norm": 0.489145964384079, |
|
"learning_rate": 2.9778965717534313e-05, |
|
"loss": 1.0833, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.2161474888747614, |
|
"grad_norm": 0.4823625087738037, |
|
"learning_rate": 2.9385044824109543e-05, |
|
"loss": 1.0836, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.241576605212969, |
|
"grad_norm": 0.5059676170349121, |
|
"learning_rate": 2.8989997375834482e-05, |
|
"loss": 1.0492, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.2670057215511763, |
|
"grad_norm": 0.5108452439308167, |
|
"learning_rate": 2.8593924863717046e-05, |
|
"loss": 1.0746, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.2924348378893833, |
|
"grad_norm": 0.516611635684967, |
|
"learning_rate": 2.8196929042112652e-05, |
|
"loss": 1.0972, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.3178639542275907, |
|
"grad_norm": 0.5169530510902405, |
|
"learning_rate": 2.7799111902582696e-05, |
|
"loss": 1.0524, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.3432930705657977, |
|
"grad_norm": 0.5156260132789612, |
|
"learning_rate": 2.7400575647692046e-05, |
|
"loss": 1.0402, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.368722186904005, |
|
"grad_norm": 0.5396901965141296, |
|
"learning_rate": 2.7001422664752333e-05, |
|
"loss": 1.0956, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.394151303242212, |
|
"grad_norm": 0.5307148694992065, |
|
"learning_rate": 2.6601755499517826e-05, |
|
"loss": 1.0803, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.4195804195804196, |
|
"grad_norm": 0.5289092063903809, |
|
"learning_rate": 2.620167682984052e-05, |
|
"loss": 1.0657, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.445009535918627, |
|
"grad_norm": 0.5314033031463623, |
|
"learning_rate": 2.5801289439291388e-05, |
|
"loss": 1.07, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.470438652256834, |
|
"grad_norm": 0.5466395616531372, |
|
"learning_rate": 2.540069619075435e-05, |
|
"loss": 1.0835, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.4958677685950414, |
|
"grad_norm": 0.5496529340744019, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.0571, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.5212968849332484, |
|
"grad_norm": 0.5266883373260498, |
|
"learning_rate": 2.459930380924566e-05, |
|
"loss": 1.0984, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 2.546726001271456, |
|
"grad_norm": 0.5321890711784363, |
|
"learning_rate": 2.419871056070862e-05, |
|
"loss": 1.0453, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.572155117609663, |
|
"grad_norm": 0.5523152947425842, |
|
"learning_rate": 2.3798323170159488e-05, |
|
"loss": 1.0697, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.5975842339478703, |
|
"grad_norm": 0.5818379521369934, |
|
"learning_rate": 2.339824450048218e-05, |
|
"loss": 1.1192, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.6230133502860777, |
|
"grad_norm": 0.5280163884162903, |
|
"learning_rate": 2.299857733524767e-05, |
|
"loss": 1.0263, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.6484424666242847, |
|
"grad_norm": 0.5316183567047119, |
|
"learning_rate": 2.2599424352307957e-05, |
|
"loss": 1.0851, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.673871582962492, |
|
"grad_norm": 0.5420896410942078, |
|
"learning_rate": 2.2200888097417307e-05, |
|
"loss": 1.0691, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.699300699300699, |
|
"grad_norm": 0.5518523454666138, |
|
"learning_rate": 2.1803070957887347e-05, |
|
"loss": 1.0735, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.7247298156389066, |
|
"grad_norm": 0.5464677810668945, |
|
"learning_rate": 2.140607513628296e-05, |
|
"loss": 1.0905, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 2.750158931977114, |
|
"grad_norm": 0.5611386895179749, |
|
"learning_rate": 2.1010002624165527e-05, |
|
"loss": 1.0789, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.775588048315321, |
|
"grad_norm": 0.576288104057312, |
|
"learning_rate": 2.0614955175890463e-05, |
|
"loss": 1.0551, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 2.801017164653528, |
|
"grad_norm": 0.5907012820243835, |
|
"learning_rate": 2.02210342824657e-05, |
|
"loss": 1.0641, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.8264462809917354, |
|
"grad_norm": 0.5511234998703003, |
|
"learning_rate": 1.982834114547773e-05, |
|
"loss": 1.0781, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 2.851875397329943, |
|
"grad_norm": 0.5715182423591614, |
|
"learning_rate": 1.9436976651092144e-05, |
|
"loss": 1.0377, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.87730451366815, |
|
"grad_norm": 0.5606087446212769, |
|
"learning_rate": 1.9047041344135044e-05, |
|
"loss": 1.037, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 2.9027336300063573, |
|
"grad_norm": 0.5559452176094055, |
|
"learning_rate": 1.865863540226232e-05, |
|
"loss": 1.0786, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.9281627463445643, |
|
"grad_norm": 0.5585038065910339, |
|
"learning_rate": 1.827185861022308e-05, |
|
"loss": 1.058, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 2.9535918626827717, |
|
"grad_norm": 0.6056733727455139, |
|
"learning_rate": 1.7886810334224192e-05, |
|
"loss": 1.0654, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.979020979020979, |
|
"grad_norm": 0.5935595035552979, |
|
"learning_rate": 1.7503589496402208e-05, |
|
"loss": 1.0603, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 3.006357279084552, |
|
"grad_norm": 0.5643672943115234, |
|
"learning_rate": 1.7122294549409484e-05, |
|
"loss": 1.2515, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.031786395422759, |
|
"grad_norm": 0.5927174687385559, |
|
"learning_rate": 1.6743023451120832e-05, |
|
"loss": 1.0486, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 3.0572155117609663, |
|
"grad_norm": 0.6135289072990417, |
|
"learning_rate": 1.6365873639467315e-05, |
|
"loss": 1.0647, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.0826446280991737, |
|
"grad_norm": 0.6193404197692871, |
|
"learning_rate": 1.599094200740367e-05, |
|
"loss": 1.0201, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 3.1080737444373807, |
|
"grad_norm": 0.5914512872695923, |
|
"learning_rate": 1.561832487801565e-05, |
|
"loss": 1.0361, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.133502860775588, |
|
"grad_norm": 0.6324409246444702, |
|
"learning_rate": 1.5248117979773829e-05, |
|
"loss": 1.0334, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 3.158931977113795, |
|
"grad_norm": 0.6424704790115356, |
|
"learning_rate": 1.4880416421940155e-05, |
|
"loss": 0.9946, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.1843610934520026, |
|
"grad_norm": 0.6515686511993408, |
|
"learning_rate": 1.4515314670133582e-05, |
|
"loss": 1.0242, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 3.20979020979021, |
|
"grad_norm": 0.6387981176376343, |
|
"learning_rate": 1.4152906522061048e-05, |
|
"loss": 1.051, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 3.235219326128417, |
|
"grad_norm": 0.6447208523750305, |
|
"learning_rate": 1.3793285083420077e-05, |
|
"loss": 1.0407, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 3.2606484424666244, |
|
"grad_norm": 0.6469278335571289, |
|
"learning_rate": 1.3436542743979125e-05, |
|
"loss": 1.0262, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 3.2860775588048314, |
|
"grad_norm": 0.6370428204536438, |
|
"learning_rate": 1.3082771153841871e-05, |
|
"loss": 1.0428, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 3.311506675143039, |
|
"grad_norm": 0.6550537943840027, |
|
"learning_rate": 1.2732061199901562e-05, |
|
"loss": 1.0236, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.336935791481246, |
|
"grad_norm": 0.6451073288917542, |
|
"learning_rate": 1.2384502982491358e-05, |
|
"loss": 1.0346, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 3.3623649078194533, |
|
"grad_norm": 0.6514309048652649, |
|
"learning_rate": 1.2040185792236874e-05, |
|
"loss": 1.048, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 3.3877940241576603, |
|
"grad_norm": 0.6752684116363525, |
|
"learning_rate": 1.1699198087116589e-05, |
|
"loss": 1.0338, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 3.4132231404958677, |
|
"grad_norm": 0.6573219895362854, |
|
"learning_rate": 1.1361627469736285e-05, |
|
"loss": 1.0606, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 3.438652256834075, |
|
"grad_norm": 0.6186379194259644, |
|
"learning_rate": 1.1027560664823209e-05, |
|
"loss": 1.0397, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 3.464081373172282, |
|
"grad_norm": 0.6716841459274292, |
|
"learning_rate": 1.0697083496945765e-05, |
|
"loss": 1.0613, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.4895104895104896, |
|
"grad_norm": 0.6756547093391418, |
|
"learning_rate": 1.0370280868464405e-05, |
|
"loss": 1.0409, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 3.5149396058486966, |
|
"grad_norm": 0.6823243498802185, |
|
"learning_rate": 1.0047236737719601e-05, |
|
"loss": 1.0539, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 3.540368722186904, |
|
"grad_norm": 0.6413143277168274, |
|
"learning_rate": 9.728034097462144e-06, |
|
"loss": 1.0292, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 3.5657978385251115, |
|
"grad_norm": 0.6690115928649902, |
|
"learning_rate": 9.412754953531663e-06, |
|
"loss": 1.0183, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.5912269548633184, |
|
"grad_norm": 0.684471070766449, |
|
"learning_rate": 9.101480303788623e-06, |
|
"loss": 1.022, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 3.616656071201526, |
|
"grad_norm": 0.6697286367416382, |
|
"learning_rate": 8.794290117305296e-06, |
|
"loss": 1.0101, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 3.642085187539733, |
|
"grad_norm": 0.6681500673294067, |
|
"learning_rate": 8.491263313821021e-06, |
|
"loss": 1.0113, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 3.6675143038779403, |
|
"grad_norm": 0.6634730100631714, |
|
"learning_rate": 8.192477743467078e-06, |
|
"loss": 1.0016, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.6929434202161477, |
|
"grad_norm": 0.645428478717804, |
|
"learning_rate": 7.898010166766347e-06, |
|
"loss": 1.0227, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 3.7183725365543547, |
|
"grad_norm": 0.678112804889679, |
|
"learning_rate": 7.607936234912841e-06, |
|
"loss": 1.0339, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 3.7438016528925617, |
|
"grad_norm": 0.691081166267395, |
|
"learning_rate": 7.3223304703363135e-06, |
|
"loss": 0.9891, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 3.769230769230769, |
|
"grad_norm": 0.6777130961418152, |
|
"learning_rate": 7.041266247556813e-06, |
|
"loss": 1.0427, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.7946598855689766, |
|
"grad_norm": 0.6708715558052063, |
|
"learning_rate": 6.764815774334149e-06, |
|
"loss": 1.0068, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 3.8200890019071836, |
|
"grad_norm": 0.685946524143219, |
|
"learning_rate": 6.493050073117116e-06, |
|
"loss": 1.0673, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.845518118245391, |
|
"grad_norm": 0.654752790927887, |
|
"learning_rate": 6.226038962797218e-06, |
|
"loss": 1.015, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 3.870947234583598, |
|
"grad_norm": 0.679111897945404, |
|
"learning_rate": 5.9638510407716394e-06, |
|
"loss": 1.0068, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.8963763509218055, |
|
"grad_norm": 0.6768970489501953, |
|
"learning_rate": 5.706553665319955e-06, |
|
"loss": 1.0424, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 3.921805467260013, |
|
"grad_norm": 0.6532475352287292, |
|
"learning_rate": 5.454212938299255e-06, |
|
"loss": 1.0668, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.94723458359822, |
|
"grad_norm": 0.646823525428772, |
|
"learning_rate": 5.2068936881620095e-06, |
|
"loss": 1.0185, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 3.9726636999364273, |
|
"grad_norm": 0.6898627281188965, |
|
"learning_rate": 4.9646594533010875e-06, |
|
"loss": 1.036, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.9980928162746343, |
|
"grad_norm": 1.4582291841506958, |
|
"learning_rate": 4.72757246572623e-06, |
|
"loss": 1.1749, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 4.025429116338207, |
|
"grad_norm": 0.6636221408843994, |
|
"learning_rate": 4.495693635076101e-06, |
|
"loss": 1.0003, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 4.050858232676415, |
|
"grad_norm": 0.679002046585083, |
|
"learning_rate": 4.2690825329701315e-06, |
|
"loss": 0.9937, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 4.076287349014621, |
|
"grad_norm": 0.6657451391220093, |
|
"learning_rate": 4.047797377703985e-06, |
|
"loss": 1.0328, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.101716465352829, |
|
"grad_norm": 0.6936158537864685, |
|
"learning_rate": 3.831895019292897e-06, |
|
"loss": 1.0453, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 4.127145581691036, |
|
"grad_norm": 0.7010984420776367, |
|
"learning_rate": 3.621430924866348e-06, |
|
"loss": 0.9894, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 4.152574698029244, |
|
"grad_norm": 0.7202966809272766, |
|
"learning_rate": 3.4164591644181233e-06, |
|
"loss": 1.0028, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 4.17800381436745, |
|
"grad_norm": 0.7169246077537537, |
|
"learning_rate": 3.217032396915265e-06, |
|
"loss": 1.0268, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 4.203432930705658, |
|
"grad_norm": 0.68609219789505, |
|
"learning_rate": 3.0232018567695697e-06, |
|
"loss": 1.008, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 4.228862047043865, |
|
"grad_norm": 0.6944385170936584, |
|
"learning_rate": 2.8350173406749973e-06, |
|
"loss": 0.9933, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 4.254291163382073, |
|
"grad_norm": 0.6846181750297546, |
|
"learning_rate": 2.652527194814511e-06, |
|
"loss": 1.0147, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 4.27972027972028, |
|
"grad_norm": 0.6891025304794312, |
|
"learning_rate": 2.475778302439524e-06, |
|
"loss": 1.0619, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 4.305149396058487, |
|
"grad_norm": 0.7004478573799133, |
|
"learning_rate": 2.3048160718251997e-06, |
|
"loss": 0.9766, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 4.330578512396694, |
|
"grad_norm": 0.681630551815033, |
|
"learning_rate": 2.1396844246046903e-06, |
|
"loss": 1.0194, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 4.356007628734901, |
|
"grad_norm": 0.7060221433639526, |
|
"learning_rate": 1.980425784485293e-06, |
|
"loss": 1.0079, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 4.381436745073109, |
|
"grad_norm": 0.7196823954582214, |
|
"learning_rate": 1.827081066349459e-06, |
|
"loss": 1.0269, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 4.406865861411316, |
|
"grad_norm": 0.7184461951255798, |
|
"learning_rate": 1.6796896657433808e-06, |
|
"loss": 0.9593, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 4.432294977749523, |
|
"grad_norm": 0.7016611099243164, |
|
"learning_rate": 1.538289448755989e-06, |
|
"loss": 1.005, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 4.45772409408773, |
|
"grad_norm": 0.7026543021202087, |
|
"learning_rate": 1.4029167422908107e-06, |
|
"loss": 1.0268, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 4.483153210425938, |
|
"grad_norm": 0.722529411315918, |
|
"learning_rate": 1.273606324733284e-06, |
|
"loss": 1.0258, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 4.508582326764145, |
|
"grad_norm": 0.6864616274833679, |
|
"learning_rate": 1.1503914170159057e-06, |
|
"loss": 1.0159, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 4.534011443102353, |
|
"grad_norm": 0.6917942762374878, |
|
"learning_rate": 1.0333036740834856e-06, |
|
"loss": 1.0288, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 4.559440559440559, |
|
"grad_norm": 0.7076994180679321, |
|
"learning_rate": 9.223731767607435e-07, |
|
"loss": 1.0102, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 4.584869675778767, |
|
"grad_norm": 0.6951266527175903, |
|
"learning_rate": 8.176284240242638e-07, |
|
"loss": 1.0253, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.610298792116974, |
|
"grad_norm": 0.6959212422370911, |
|
"learning_rate": 7.19096325680907e-07, |
|
"loss": 1.0014, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 4.6357279084551815, |
|
"grad_norm": 0.7114453911781311, |
|
"learning_rate": 6.268021954544096e-07, |
|
"loss": 0.9987, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 4.661157024793388, |
|
"grad_norm": 0.7204643487930298, |
|
"learning_rate": 5.407697444821169e-07, |
|
"loss": 0.9987, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 4.686586141131595, |
|
"grad_norm": 0.7097218632698059, |
|
"learning_rate": 4.6102107522336403e-07, |
|
"loss": 1.0043, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 4.712015257469803, |
|
"grad_norm": 0.7494398951530457, |
|
"learning_rate": 3.8757667578118994e-07, |
|
"loss": 1.0321, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 4.73744437380801, |
|
"grad_norm": 0.6792721748352051, |
|
"learning_rate": 3.204554146387456e-07, |
|
"loss": 1.0747, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 4.762873490146218, |
|
"grad_norm": 0.6858909130096436, |
|
"learning_rate": 2.5967453581185184e-07, |
|
"loss": 0.9982, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 4.788302606484424, |
|
"grad_norm": 0.7001191973686218, |
|
"learning_rate": 2.052496544188487e-07, |
|
"loss": 1.0616, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 4.813731722822632, |
|
"grad_norm": 0.7212966084480286, |
|
"learning_rate": 1.571947526689349e-07, |
|
"loss": 1.0264, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 4.839160839160839, |
|
"grad_norm": 0.670038104057312, |
|
"learning_rate": 1.1552217627004425e-07, |
|
"loss": 1.0237, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 4.864589955499047, |
|
"grad_norm": 0.7079971432685852, |
|
"learning_rate": 8.024263125710752e-08, |
|
"loss": 0.992, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 4.890019071837254, |
|
"grad_norm": 0.7235581874847412, |
|
"learning_rate": 5.136518124159162e-08, |
|
"loss": 0.9947, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 4.915448188175461, |
|
"grad_norm": 0.6986010074615479, |
|
"learning_rate": 2.8897245082978864e-08, |
|
"loss": 0.9907, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 4.940877304513668, |
|
"grad_norm": 0.7248331904411316, |
|
"learning_rate": 1.284459498280266e-08, |
|
"loss": 0.9958, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 4.9663064208518755, |
|
"grad_norm": 0.7134162783622742, |
|
"learning_rate": 3.2113550017198735e-09, |
|
"loss": 1.0067, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 4.991735537190083, |
|
"grad_norm": 0.7377886176109314, |
|
"learning_rate": 0.0, |
|
"loss": 0.9834, |
|
"step": 980 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 980, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2928446514593792e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|