{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.925925925925926,
  "eval_steps": 500,
  "global_step": 324,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.018518518518518517,
      "grad_norm": 11.543508930958351,
      "learning_rate": 1.3333333333333334e-07,
      "loss": 2.5502,
      "step": 1
    },
    {
      "epoch": 0.037037037037037035,
      "grad_norm": 11.036573442393484,
      "learning_rate": 2.6666666666666667e-07,
      "loss": 2.524,
      "step": 2
    },
    {
      "epoch": 0.05555555555555555,
      "grad_norm": 10.633243381981275,
      "learning_rate": 4e-07,
      "loss": 2.204,
      "step": 3
    },
    {
      "epoch": 0.07407407407407407,
      "grad_norm": 10.844156107788931,
      "learning_rate": 5.333333333333333e-07,
      "loss": 2.6556,
      "step": 4
    },
    {
      "epoch": 0.09259259259259259,
      "grad_norm": 10.31689604512179,
      "learning_rate": 6.666666666666666e-07,
      "loss": 2.3083,
      "step": 5
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 9.817262372273788,
      "learning_rate": 8e-07,
      "loss": 2.4079,
      "step": 6
    },
    {
      "epoch": 0.12962962962962962,
      "grad_norm": 9.116167654173315,
      "learning_rate": 9.333333333333333e-07,
      "loss": 2.3343,
      "step": 7
    },
    {
      "epoch": 0.14814814814814814,
      "grad_norm": 7.891190287353295,
      "learning_rate": 1.0666666666666667e-06,
      "loss": 2.3883,
      "step": 8
    },
    {
      "epoch": 0.16666666666666666,
      "grad_norm": 8.42233222280676,
      "learning_rate": 1.2e-06,
      "loss": 2.4733,
      "step": 9
    },
    {
      "epoch": 0.18518518518518517,
      "grad_norm": 6.5902867588718825,
      "learning_rate": 1.3333333333333332e-06,
      "loss": 2.2598,
      "step": 10
    },
    {
      "epoch": 0.2037037037037037,
      "grad_norm": 7.468618276890062,
      "learning_rate": 1.4666666666666665e-06,
      "loss": 2.6818,
      "step": 11
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 6.524430399848726,
      "learning_rate": 1.6e-06,
      "loss": 2.0609,
      "step": 12
    },
    {
      "epoch": 0.24074074074074073,
      "grad_norm": 15.819198637332978,
      "learning_rate": 1.7333333333333334e-06,
      "loss": 1.8734,
      "step": 13
    },
    {
      "epoch": 0.25925925925925924,
      "grad_norm": 12.251404296601525,
      "learning_rate": 1.8666666666666667e-06,
      "loss": 2.3952,
      "step": 14
    },
    {
      "epoch": 0.2777777777777778,
      "grad_norm": 12.014341658055084,
      "learning_rate": 2e-06,
      "loss": 2.0763,
      "step": 15
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 9.119171460936416,
      "learning_rate": 1.999948316841124e-06,
      "loss": 2.2581,
      "step": 16
    },
    {
      "epoch": 0.3148148148148148,
      "grad_norm": 7.0075699362300785,
      "learning_rate": 1.999793272706794e-06,
      "loss": 2.3189,
      "step": 17
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 8.434551205593468,
      "learning_rate": 1.9995348836233515e-06,
      "loss": 2.2956,
      "step": 18
    },
    {
      "epoch": 0.35185185185185186,
      "grad_norm": 9.802253199544783,
      "learning_rate": 1.999173176299524e-06,
      "loss": 2.1106,
      "step": 19
    },
    {
      "epoch": 0.37037037037037035,
      "grad_norm": 7.364889431202562,
      "learning_rate": 1.9987081881236665e-06,
      "loss": 2.4001,
      "step": 20
    },
    {
      "epoch": 0.3888888888888889,
      "grad_norm": 5.5030313904087995,
      "learning_rate": 1.9981399671598938e-06,
      "loss": 2.0534,
      "step": 21
    },
    {
      "epoch": 0.4074074074074074,
      "grad_norm": 4.022498450217217,
      "learning_rate": 1.997468572143115e-06,
      "loss": 1.9262,
      "step": 22
    },
    {
      "epoch": 0.42592592592592593,
      "grad_norm": 4.237115597250525,
      "learning_rate": 1.9966940724729603e-06,
      "loss": 2.2743,
      "step": 23
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 4.300566273621826,
      "learning_rate": 1.995816548206609e-06,
      "loss": 2.028,
      "step": 24
    },
    {
      "epoch": 0.46296296296296297,
      "grad_norm": 4.157703163471443,
      "learning_rate": 1.994836090050514e-06,
      "loss": 2.2021,
      "step": 25
    },
    {
      "epoch": 0.48148148148148145,
      "grad_norm": 4.59475590188255,
      "learning_rate": 1.993752799351023e-06,
      "loss": 2.1409,
      "step": 26
    },
    {
      "epoch": 0.5,
      "grad_norm": 3.553829762084,
      "learning_rate": 1.992566788083908e-06,
      "loss": 2.1277,
      "step": 27
    },
    {
      "epoch": 0.5185185185185185,
      "grad_norm": 2.808767466788676,
      "learning_rate": 1.9912781788427856e-06,
      "loss": 2.074,
      "step": 28
    },
    {
      "epoch": 0.5370370370370371,
      "grad_norm": 2.945994143903197,
      "learning_rate": 1.989887104826449e-06,
      "loss": 1.9894,
      "step": 29
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 2.9859402190241,
      "learning_rate": 1.988393709825096e-06,
      "loss": 2.1096,
      "step": 30
    },
    {
      "epoch": 0.5740740740740741,
      "grad_norm": 2.788646179800959,
      "learning_rate": 1.9867981482054697e-06,
      "loss": 2.315,
      "step": 31
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 2.428878990731119,
      "learning_rate": 1.9851005848948986e-06,
      "loss": 2.1129,
      "step": 32
    },
    {
      "epoch": 0.6111111111111112,
      "grad_norm": 2.326070514005508,
      "learning_rate": 1.983301195364252e-06,
      "loss": 2.3507,
      "step": 33
    },
    {
      "epoch": 0.6296296296296297,
      "grad_norm": 2.2448623338584524,
      "learning_rate": 1.9814001656098e-06,
      "loss": 2.2176,
      "step": 34
    },
    {
      "epoch": 0.6481481481481481,
      "grad_norm": 3.345489216172997,
      "learning_rate": 1.9793976921339876e-06,
      "loss": 2.0352,
      "step": 35
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 2.454245882780074,
      "learning_rate": 1.9772939819251245e-06,
      "loss": 1.7644,
      "step": 36
    },
    {
      "epoch": 0.6851851851851852,
      "grad_norm": 2.2823601110851115,
      "learning_rate": 1.9750892524359894e-06,
      "loss": 2.0044,
      "step": 37
    },
    {
      "epoch": 0.7037037037037037,
      "grad_norm": 2.378703420397497,
      "learning_rate": 1.9727837315613503e-06,
      "loss": 1.9992,
      "step": 38
    },
    {
      "epoch": 0.7222222222222222,
      "grad_norm": 2.2038000284491392,
      "learning_rate": 1.9703776576144106e-06,
      "loss": 2.1248,
      "step": 39
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 1.6625652175528476,
      "learning_rate": 1.9678712793021747e-06,
      "loss": 1.7908,
      "step": 40
    },
    {
      "epoch": 0.7592592592592593,
      "grad_norm": 1.649500064167637,
      "learning_rate": 1.9652648556997396e-06,
      "loss": 2.0346,
      "step": 41
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 2.127402784391995,
      "learning_rate": 1.962558656223516e-06,
      "loss": 2.1544,
      "step": 42
    },
    {
      "epoch": 0.7962962962962963,
      "grad_norm": 2.4572023559040668,
      "learning_rate": 1.959752960603378e-06,
      "loss": 1.9295,
      "step": 43
    },
    {
      "epoch": 0.8148148148148148,
      "grad_norm": 1.511188510592738,
      "learning_rate": 1.956848058853751e-06,
      "loss": 2.1473,
      "step": 44
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 2.6425186462750276,
      "learning_rate": 1.9538442512436325e-06,
      "loss": 1.7632,
      "step": 45
    },
    {
      "epoch": 0.8518518518518519,
      "grad_norm": 2.528104013708182,
      "learning_rate": 1.9507418482655546e-06,
      "loss": 1.9125,
      "step": 46
    },
    {
      "epoch": 0.8703703703703703,
      "grad_norm": 2.660072260955662,
      "learning_rate": 1.947541170603488e-06,
      "loss": 1.9839,
      "step": 47
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 2.303424321729968,
      "learning_rate": 1.9442425490996984e-06,
      "loss": 1.8381,
      "step": 48
    },
    {
      "epoch": 0.9074074074074074,
      "grad_norm": 1.7413263437826438,
      "learning_rate": 1.940846324720544e-06,
      "loss": 2.2322,
      "step": 49
    },
    {
      "epoch": 0.9259259259259259,
      "grad_norm": 3.681741007928878,
      "learning_rate": 1.9373528485212327e-06,
      "loss": 2.1221,
      "step": 50
    },
    {
      "epoch": 0.9444444444444444,
      "grad_norm": 2.729258330107977,
      "learning_rate": 1.9337624816095357e-06,
      "loss": 1.8567,
      "step": 51
    },
    {
      "epoch": 0.9629629629629629,
      "grad_norm": 1.9607649593150183,
      "learning_rate": 1.9300755951084592e-06,
      "loss": 2.0553,
      "step": 52
    },
    {
      "epoch": 0.9814814814814815,
      "grad_norm": 2.119362131138027,
      "learning_rate": 1.9262925701178863e-06,
      "loss": 1.936,
      "step": 53
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.057082578120893,
      "learning_rate": 1.9224137976751793e-06,
      "loss": 1.9584,
      "step": 54
    },
    {
      "epoch": 1.0185185185185186,
      "grad_norm": 2.0207421134902708,
      "learning_rate": 1.918439678714763e-06,
      "loss": 1.9837,
      "step": 55
    },
    {
      "epoch": 1.0185185185185186,
      "grad_norm": 1.938684997881939,
      "learning_rate": 1.9143706240266807e-06,
      "loss": 1.9354,
      "step": 56
    },
    {
      "epoch": 1.037037037037037,
      "grad_norm": 2.0601195298871398,
      "learning_rate": 1.910207054214133e-06,
      "loss": 2.0174,
      "step": 57
    },
    {
      "epoch": 1.0555555555555556,
      "grad_norm": 2.041620934780644,
      "learning_rate": 1.9059493996499985e-06,
      "loss": 1.7447,
      "step": 58
    },
    {
      "epoch": 1.074074074074074,
      "grad_norm": 1.5682604954979573,
      "learning_rate": 1.9015981004323534e-06,
      "loss": 2.0106,
      "step": 59
    },
    {
      "epoch": 1.0925925925925926,
      "grad_norm": 2.865965004078874,
      "learning_rate": 1.8971536063389742e-06,
      "loss": 2.2393,
      "step": 60
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 2.7462581398678787,
      "learning_rate": 1.89261637678085e-06,
      "loss": 1.7421,
      "step": 61
    },
    {
      "epoch": 1.1296296296296295,
      "grad_norm": 3.120548437283878,
      "learning_rate": 1.8879868807546932e-06,
      "loss": 1.9877,
      "step": 62
    },
    {
      "epoch": 1.1481481481481481,
      "grad_norm": 3.242255359642735,
      "learning_rate": 1.8832655967944605e-06,
      "loss": 1.9799,
      "step": 63
    },
    {
      "epoch": 1.1666666666666667,
      "grad_norm": 2.2159733738020275,
      "learning_rate": 1.8784530129218907e-06,
      "loss": 2.0581,
      "step": 64
    },
    {
      "epoch": 1.1851851851851851,
      "grad_norm": 2.08989006018966,
      "learning_rate": 1.873549626596057e-06,
      "loss": 1.8653,
      "step": 65
    },
    {
      "epoch": 1.2037037037037037,
      "grad_norm": 1.4837874153680628,
      "learning_rate": 1.8685559446619487e-06,
      "loss": 1.9734,
      "step": 66
    },
    {
      "epoch": 1.2222222222222223,
      "grad_norm": 2.1071721482630403,
      "learning_rate": 1.863472483298079e-06,
      "loss": 1.7762,
      "step": 67
    },
    {
      "epoch": 1.2407407407407407,
      "grad_norm": 2.6554851825477646,
      "learning_rate": 1.858299767963131e-06,
      "loss": 2.2267,
      "step": 68
    },
    {
      "epoch": 1.2592592592592593,
      "grad_norm": 2.135758261049139,
      "learning_rate": 1.8530383333416415e-06,
      "loss": 2.0624,
      "step": 69
    },
    {
      "epoch": 1.2777777777777777,
      "grad_norm": 2.256153463268274,
      "learning_rate": 1.847688723288733e-06,
      "loss": 2.0254,
      "step": 70
    },
    {
      "epoch": 1.2962962962962963,
      "grad_norm": 1.9270711341308566,
      "learning_rate": 1.8422514907738986e-06,
      "loss": 2.0873,
      "step": 71
    },
    {
      "epoch": 1.3148148148148149,
      "grad_norm": 1.3698407936967985,
      "learning_rate": 1.8367271978238418e-06,
      "loss": 1.5655,
      "step": 72
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 1.7934950271719698,
      "learning_rate": 1.8311164154643833e-06,
      "loss": 2.1081,
      "step": 73
    },
    {
      "epoch": 1.3518518518518519,
      "grad_norm": 1.7554770045810462,
      "learning_rate": 1.8254197236614353e-06,
      "loss": 1.6326,
      "step": 74
    },
    {
      "epoch": 1.3703703703703702,
      "grad_norm": 1.7910726004582642,
      "learning_rate": 1.8196377112610524e-06,
      "loss": 1.9896,
      "step": 75
    },
    {
      "epoch": 1.3888888888888888,
      "grad_norm": 1.43155366985165,
      "learning_rate": 1.8137709759285662e-06,
      "loss": 1.8557,
      "step": 76
    },
    {
      "epoch": 1.4074074074074074,
      "grad_norm": 1.816009532890727,
      "learning_rate": 1.8078201240868048e-06,
      "loss": 1.7878,
      "step": 77
    },
    {
      "epoch": 1.425925925925926,
      "grad_norm": 1.612331881267257,
      "learning_rate": 1.8017857708534106e-06,
      "loss": 1.982,
      "step": 78
    },
    {
      "epoch": 1.4444444444444444,
      "grad_norm": 1.8628647966869196,
      "learning_rate": 1.7956685399772576e-06,
      "loss": 1.9704,
      "step": 79
    },
    {
      "epoch": 1.462962962962963,
      "grad_norm": 1.9936817464029801,
      "learning_rate": 1.7894690637739762e-06,
      "loss": 1.8299,
      "step": 80
    },
    {
      "epoch": 1.4814814814814814,
      "grad_norm": 2.463393533692339,
      "learning_rate": 1.7831879830605936e-06,
      "loss": 2.0444,
      "step": 81
    },
    {
      "epoch": 1.5,
      "grad_norm": 2.4979859149192305,
      "learning_rate": 1.776825947089294e-06,
      "loss": 2.0278,
      "step": 82
    },
    {
      "epoch": 1.5185185185185186,
      "grad_norm": 2.7584711281071606,
      "learning_rate": 1.7703836134803102e-06,
      "loss": 1.8715,
      "step": 83
    },
    {
      "epoch": 1.5370370370370372,
      "grad_norm": 1.9266117476771798,
      "learning_rate": 1.7638616481539448e-06,
      "loss": 2.3658,
      "step": 84
    },
    {
      "epoch": 1.5555555555555556,
      "grad_norm": 2.7609401761288908,
      "learning_rate": 1.7572607252617377e-06,
      "loss": 1.9736,
      "step": 85
    },
    {
      "epoch": 1.574074074074074,
      "grad_norm": 2.114937054090088,
      "learning_rate": 1.7505815271167822e-06,
      "loss": 2.0398,
      "step": 86
    },
    {
      "epoch": 1.5925925925925926,
      "grad_norm": 2.0664911123203513,
      "learning_rate": 1.743824744123196e-06,
      "loss": 2.1056,
      "step": 87
    },
    {
      "epoch": 1.6111111111111112,
      "grad_norm": 2.5153483082090213,
      "learning_rate": 1.7369910747047571e-06,
      "loss": 1.8765,
      "step": 88
    },
    {
      "epoch": 1.6296296296296298,
      "grad_norm": 1.8949983903048848,
      "learning_rate": 1.7300812252327102e-06,
      "loss": 2.1245,
      "step": 89
    },
    {
      "epoch": 1.6481481481481481,
      "grad_norm": 2.7037983362018565,
      "learning_rate": 1.723095909952751e-06,
      "loss": 1.5174,
      "step": 90
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 2.3396050215927673,
      "learning_rate": 1.7160358509111989e-06,
      "loss": 2.0559,
      "step": 91
    },
    {
      "epoch": 1.6851851851851851,
      "grad_norm": 2.1357187531056976,
      "learning_rate": 1.7089017778803595e-06,
      "loss": 1.8264,
      "step": 92
    },
    {
      "epoch": 1.7037037037037037,
      "grad_norm": 2.5298502653457358,
      "learning_rate": 1.701694428283093e-06,
      "loss": 2.1282,
      "step": 93
    },
    {
      "epoch": 1.7222222222222223,
      "grad_norm": 2.0789215851330343,
      "learning_rate": 1.6944145471165881e-06,
      "loss": 2.1829,
      "step": 94
    },
    {
      "epoch": 1.7407407407407407,
      "grad_norm": 1.8110067836025452,
      "learning_rate": 1.6870628868753545e-06,
      "loss": 1.7584,
      "step": 95
    },
    {
      "epoch": 1.7592592592592593,
      "grad_norm": 2.7069181555694666,
      "learning_rate": 1.6796402074734402e-06,
      "loss": 1.897,
      "step": 96
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 2.3956521553142176,
      "learning_rate": 1.6721472761658836e-06,
      "loss": 1.9119,
      "step": 97
    },
    {
      "epoch": 1.7962962962962963,
      "grad_norm": 1.3732811625669847,
      "learning_rate": 1.664584867469403e-06,
      "loss": 1.6848,
      "step": 98
    },
    {
      "epoch": 1.8148148148148149,
      "grad_norm": 1.9512817035138257,
      "learning_rate": 1.6569537630823382e-06,
      "loss": 2.0185,
      "step": 99
    },
    {
      "epoch": 1.8333333333333335,
      "grad_norm": 1.864374052494234,
      "learning_rate": 1.6492547518038503e-06,
      "loss": 1.925,
      "step": 100
    },
    {
      "epoch": 1.8518518518518519,
      "grad_norm": 1.7728078338576356,
      "learning_rate": 1.6414886294523857e-06,
      "loss": 1.8965,
      "step": 101
    },
    {
      "epoch": 1.8703703703703702,
      "grad_norm": 1.8362690886038369,
      "learning_rate": 1.6336561987834151e-06,
      "loss": 1.8881,
      "step": 102
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 3.120191999390615,
      "learning_rate": 1.6257582694064556e-06,
      "loss": 1.7192,
      "step": 103
    },
    {
      "epoch": 1.9074074074074074,
      "grad_norm": 2.3586839267066044,
      "learning_rate": 1.6177956577013846e-06,
      "loss": 1.9387,
      "step": 104
    },
    {
      "epoch": 1.925925925925926,
      "grad_norm": 2.779686602481001,
      "learning_rate": 1.6097691867340543e-06,
      "loss": 1.9497,
      "step": 105
    },
    {
      "epoch": 1.9444444444444444,
      "grad_norm": 2.321935224272705,
      "learning_rate": 1.6016796861712125e-06,
      "loss": 1.9367,
      "step": 106
    },
    {
      "epoch": 1.9629629629629628,
      "grad_norm": 2.3211469537338276,
      "learning_rate": 1.5935279921947451e-06,
      "loss": 1.9765,
      "step": 107
    },
    {
      "epoch": 1.9814814814814814,
      "grad_norm": 1.8048838385036454,
      "learning_rate": 1.585314947415242e-06,
      "loss": 2.1524,
      "step": 108
    },
    {
      "epoch": 2.0,
      "grad_norm": 2.2432536623121866,
      "learning_rate": 1.5770414007848994e-06,
      "loss": 1.7596,
      "step": 109
    },
    {
      "epoch": 2.0185185185185186,
      "grad_norm": 2.1527401042322984,
      "learning_rate": 1.5687082075097674e-06,
      "loss": 2.1903,
      "step": 110
    },
    {
      "epoch": 2.0185185185185186,
      "grad_norm": 2.822074512897879,
      "learning_rate": 1.5603162289613501e-06,
      "loss": 2.0324,
      "step": 111
    },
    {
      "epoch": 2.037037037037037,
      "grad_norm": 1.9685786022400997,
      "learning_rate": 1.551866332587568e-06,
      "loss": 1.8009,
      "step": 112
    },
    {
      "epoch": 2.0555555555555554,
      "grad_norm": 2.515751939304619,
      "learning_rate": 1.5433593918230955e-06,
      "loss": 1.9487,
      "step": 113
    },
    {
      "epoch": 2.074074074074074,
      "grad_norm": 2.349862710312166,
      "learning_rate": 1.5347962859990742e-06,
      "loss": 1.9967,
      "step": 114
    },
    {
      "epoch": 2.0925925925925926,
      "grad_norm": 3.1803776539735233,
      "learning_rate": 1.5261779002522216e-06,
      "loss": 2.0633,
      "step": 115
    },
    {
      "epoch": 2.111111111111111,
      "grad_norm": 2.6762686321709372,
      "learning_rate": 1.517505125433338e-06,
      "loss": 2.1631,
      "step": 116
    },
    {
      "epoch": 2.1296296296296298,
      "grad_norm": 3.17350275984332,
      "learning_rate": 1.5087788580152206e-06,
      "loss": 1.7666,
      "step": 117
    },
    {
      "epoch": 2.148148148148148,
      "grad_norm": 2.7374508335058128,
      "learning_rate": 1.5e-06,
      "loss": 1.6363,
      "step": 118
    },
    {
      "epoch": 2.1666666666666665,
      "grad_norm": 2.518836889589819,
      "learning_rate": 1.4911694588259037e-06,
      "loss": 2.0306,
      "step": 119
    },
    {
      "epoch": 2.185185185185185,
      "grad_norm": 2.0516490709057438,
      "learning_rate": 1.482288147273456e-06,
      "loss": 1.7322,
      "step": 120
    },
    {
      "epoch": 2.2037037037037037,
      "grad_norm": 2.143653181079979,
      "learning_rate": 1.4733569833711299e-06,
      "loss": 1.9715,
      "step": 121
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 1.5702663497071736,
      "learning_rate": 1.4643768903004504e-06,
      "loss": 1.6725,
      "step": 122
    },
    {
      "epoch": 2.240740740740741,
      "grad_norm": 2.1780515590527045,
      "learning_rate": 1.455348796300571e-06,
      "loss": 1.8871,
      "step": 123
    },
    {
      "epoch": 2.259259259259259,
      "grad_norm": 1.7384200856334007,
      "learning_rate": 1.4462736345723259e-06,
      "loss": 1.8607,
      "step": 124
    },
    {
      "epoch": 2.2777777777777777,
      "grad_norm": 2.602970978377197,
      "learning_rate": 1.437152343181765e-06,
      "loss": 2.0933,
      "step": 125
    },
    {
      "epoch": 2.2962962962962963,
      "grad_norm": 2.2409639030493516,
      "learning_rate": 1.4279858649631928e-06,
      "loss": 2.1028,
      "step": 126
    },
    {
      "epoch": 2.314814814814815,
      "grad_norm": 2.083427934167806,
      "learning_rate": 1.4187751474217096e-06,
      "loss": 1.7588,
      "step": 127
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 1.710343556502238,
      "learning_rate": 1.4095211426352718e-06,
      "loss": 1.8985,
      "step": 128
    },
    {
      "epoch": 2.351851851851852,
      "grad_norm": 2.4282958584597645,
      "learning_rate": 1.4002248071562778e-06,
      "loss": 1.8267,
      "step": 129
    },
    {
      "epoch": 2.3703703703703702,
      "grad_norm": 2.2052175185263936,
      "learning_rate": 1.3908871019126954e-06,
      "loss": 2.254,
      "step": 130
    },
    {
      "epoch": 2.388888888888889,
      "grad_norm": 2.4962771616425745,
      "learning_rate": 1.3815089921087315e-06,
      "loss": 1.8375,
      "step": 131
    },
    {
      "epoch": 2.4074074074074074,
      "grad_norm": 2.420921240604477,
      "learning_rate": 1.3720914471250642e-06,
      "loss": 1.9705,
      "step": 132
    },
    {
      "epoch": 2.425925925925926,
      "grad_norm": 1.6871014164962779,
      "learning_rate": 1.3626354404186404e-06,
      "loss": 1.866,
      "step": 133
    },
    {
      "epoch": 2.4444444444444446,
      "grad_norm": 1.5220778910671986,
      "learning_rate": 1.3531419494220545e-06,
      "loss": 2.0116,
      "step": 134
    },
    {
      "epoch": 2.462962962962963,
      "grad_norm": 1.9736590287767704,
      "learning_rate": 1.343611955442513e-06,
      "loss": 1.7881,
      "step": 135
    },
    {
      "epoch": 2.4814814814814814,
      "grad_norm": 1.357453526449638,
      "learning_rate": 1.334046443560402e-06,
      "loss": 1.7624,
      "step": 136
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.7906511346102865,
      "learning_rate": 1.324446402527462e-06,
      "loss": 1.7147,
      "step": 137
    },
    {
      "epoch": 2.5185185185185186,
      "grad_norm": 2.0256913340352,
      "learning_rate": 1.3148128246645848e-06,
      "loss": 1.657,
      "step": 138
    },
    {
      "epoch": 2.537037037037037,
      "grad_norm": 2.4368648915605786,
      "learning_rate": 1.3051467057592413e-06,
      "loss": 1.848,
      "step": 139
    },
    {
      "epoch": 2.5555555555555554,
      "grad_norm": 1.7920760208344662,
      "learning_rate": 1.2954490449625491e-06,
      "loss": 2.2794,
      "step": 140
    },
    {
      "epoch": 2.574074074074074,
      "grad_norm": 2.5934703428783115,
      "learning_rate": 1.2857208446859957e-06,
      "loss": 2.1465,
      "step": 141
    },
    {
      "epoch": 2.5925925925925926,
      "grad_norm": 1.788260906958661,
      "learning_rate": 1.2759631104978224e-06,
      "loss": 2.067,
      "step": 142
    },
    {
      "epoch": 2.611111111111111,
      "grad_norm": 2.7522723362234474,
      "learning_rate": 1.2661768510190816e-06,
      "loss": 1.8176,
      "step": 143
    },
    {
      "epoch": 2.6296296296296298,
      "grad_norm": 2.43143502900473,
      "learning_rate": 1.2563630778193802e-06,
      "loss": 2.3366,
      "step": 144
    },
    {
      "epoch": 2.648148148148148,
      "grad_norm": 1.7241238478367036,
      "learning_rate": 1.2465228053123172e-06,
      "loss": 1.9895,
      "step": 145
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 2.0266143160589802,
      "learning_rate": 1.2366570506506268e-06,
      "loss": 1.7781,
      "step": 146
    },
    {
      "epoch": 2.685185185185185,
      "grad_norm": 1.9459670874156856,
      "learning_rate": 1.226766833621041e-06,
      "loss": 2.3116,
      "step": 147
    },
    {
      "epoch": 2.7037037037037037,
      "grad_norm": 2.248556130449579,
      "learning_rate": 1.2168531765388755e-06,
      "loss": 1.8032,
      "step": 148
    },
    {
      "epoch": 2.7222222222222223,
      "grad_norm": 1.711136470727862,
      "learning_rate": 1.2069171041423583e-06,
      "loss": 1.6228,
      "step": 149
    },
    {
      "epoch": 2.7407407407407405,
      "grad_norm": 2.4614425382704352,
      "learning_rate": 1.1969596434867062e-06,
      "loss": 1.9709,
      "step": 150
    },
    {
      "epoch": 2.7592592592592595,
      "grad_norm": 2.3445742482429788,
      "learning_rate": 1.186981823837961e-06,
      "loss": 2.0597,
      "step": 151
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 1.6706837512637804,
      "learning_rate": 1.1769846765665992e-06,
      "loss": 1.6263,
      "step": 152
    },
    {
      "epoch": 2.7962962962962963,
      "grad_norm": 1.6603060271536991,
      "learning_rate": 1.1669692350409222e-06,
      "loss": 1.8723,
      "step": 153
    },
    {
      "epoch": 2.814814814814815,
      "grad_norm": 1.7552257393882156,
      "learning_rate": 1.1569365345202413e-06,
      "loss": 2.224,
      "step": 154
    },
    {
      "epoch": 2.8333333333333335,
      "grad_norm": 1.3677514217091091,
      "learning_rate": 1.1468876120478662e-06,
      "loss": 1.897,
      "step": 155
    },
    {
      "epoch": 2.851851851851852,
      "grad_norm": 1.4681588115995392,
      "learning_rate": 1.1368235063439102e-06,
      "loss": 1.7654,
      "step": 156
    },
    {
      "epoch": 2.8703703703703702,
      "grad_norm": 1.4166676047405766,
      "learning_rate": 1.1267452576979218e-06,
      "loss": 1.7817,
      "step": 157
    },
    {
      "epoch": 2.888888888888889,
      "grad_norm": 1.5999665116208726,
      "learning_rate": 1.1166539078613525e-06,
      "loss": 1.814,
      "step": 158
    },
    {
      "epoch": 2.9074074074074074,
      "grad_norm": 1.8734358713251535,
      "learning_rate": 1.106550499939876e-06,
      "loss": 2.0783,
      "step": 159
    },
    {
      "epoch": 2.925925925925926,
      "grad_norm": 1.7212322982329384,
      "learning_rate": 1.0964360782855666e-06,
      "loss": 2.0753,
      "step": 160
    },
    {
      "epoch": 2.9444444444444446,
      "grad_norm": 2.144799198000555,
      "learning_rate": 1.086311688388946e-06,
      "loss": 1.8936,
      "step": 161
    },
    {
      "epoch": 2.962962962962963,
      "grad_norm": 1.578076988317517,
      "learning_rate": 1.076178376770918e-06,
      "loss": 1.8926,
      "step": 162
    },
    {
      "epoch": 2.9814814814814814,
      "grad_norm": 2.092387225323448,
      "learning_rate": 1.0660371908745908e-06,
      "loss": 1.8244,
      "step": 163
    },
    {
      "epoch": 3.0,
      "grad_norm": 1.91051937209127,
      "learning_rate": 1.0558891789570082e-06,
      "loss": 1.8447,
      "step": 164
    },
    {
      "epoch": 3.0185185185185186,
      "grad_norm": 2.011878655711519,
      "learning_rate": 1.0457353899807946e-06,
      "loss": 1.6429,
      "step": 165
    },
    {
      "epoch": 3.0185185185185186,
      "grad_norm": 1.501437779159261,
      "learning_rate": 1.0355768735057273e-06,
      "loss": 1.8726,
      "step": 166
    },
    {
      "epoch": 3.037037037037037,
      "grad_norm": 2.2762397392089597,
      "learning_rate": 1.0254146795802495e-06,
      "loss": 1.8501,
      "step": 167
    },
    {
      "epoch": 3.0555555555555554,
      "grad_norm": 1.711019377794848,
      "learning_rate": 1.015249858632926e-06,
      "loss": 1.9443,
      "step": 168
    },
    {
      "epoch": 3.074074074074074,
      "grad_norm": 2.1218173803583733,
      "learning_rate": 1.0050834613638694e-06,
      "loss": 1.5682,
      "step": 169
    },
    {
      "epoch": 3.0925925925925926,
      "grad_norm": 2.2421674612074383,
      "learning_rate": 9.949165386361303e-07,
      "loss": 1.8014,
      "step": 170
    },
    {
      "epoch": 3.111111111111111,
      "grad_norm": 2.0898372243057706,
      "learning_rate": 9.847501413670742e-07,
      "loss": 1.8711,
      "step": 171
    },
    {
      "epoch": 3.1296296296296298,
      "grad_norm": 2.367436693252952,
      "learning_rate": 9.745853204197508e-07,
      "loss": 1.9004,
      "step": 172
    },
    {
      "epoch": 3.148148148148148,
      "grad_norm": 1.756679866289546,
      "learning_rate": 9.644231264942724e-07,
      "loss": 1.8121,
      "step": 173
    },
    {
      "epoch": 3.1666666666666665,
      "grad_norm": 1.8172318892802939,
      "learning_rate": 9.542646100192055e-07,
      "loss": 1.9013,
      "step": 174
    },
    {
      "epoch": 3.185185185185185,
      "grad_norm": 1.7124997061951257,
      "learning_rate": 9.441108210429921e-07,
      "loss": 1.7851,
      "step": 175
    },
    {
      "epoch": 3.2037037037037037,
      "grad_norm": 1.4438554381375786,
      "learning_rate": 9.339628091254091e-07,
      "loss": 1.5955,
      "step": 176
    },
    {
      "epoch": 3.2222222222222223,
      "grad_norm": 1.4447841103018684,
      "learning_rate": 9.238216232290821e-07,
      "loss": 2.0907,
      "step": 177
    },
    {
      "epoch": 3.240740740740741,
      "grad_norm": 1.6937928047736799,
      "learning_rate": 9.136883116110541e-07,
      "loss": 1.915,
      "step": 178
    },
    {
      "epoch": 3.259259259259259,
      "grad_norm": 1.306322824987709,
      "learning_rate": 9.035639217144334e-07,
      "loss": 2.0679,
      "step": 179
    },
    {
      "epoch": 3.2777777777777777,
      "grad_norm": 1.850877358174252,
      "learning_rate": 8.93449500060124e-07,
      "loss": 2.024,
      "step": 180
    },
    {
      "epoch": 3.2962962962962963,
      "grad_norm": 1.5601775038920753,
      "learning_rate": 8.833460921386477e-07,
      "loss": 1.9335,
      "step": 181
    },
    {
      "epoch": 3.314814814814815,
      "grad_norm": 2.270038996895677,
      "learning_rate": 8.732547423020784e-07,
      "loss": 2.3019,
      "step": 182
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 1.3421300711986788,
      "learning_rate": 8.631764936560899e-07,
      "loss": 1.8503,
      "step": 183
    },
    {
      "epoch": 3.351851851851852,
      "grad_norm": 2.071755001265988,
      "learning_rate": 8.53112387952134e-07,
      "loss": 1.838,
      "step": 184
    },
    {
      "epoch": 3.3703703703703702,
      "grad_norm": 1.6174575169544287,
      "learning_rate": 8.430634654797588e-07,
      "loss": 2.2364,
      "step": 185
    },
    {
      "epoch": 3.388888888888889,
      "grad_norm": 1.801580774474325,
      "learning_rate": 8.330307649590779e-07,
      "loss": 1.7633,
      "step": 186
    },
    {
      "epoch": 3.4074074074074074,
      "grad_norm": 2.058657705709402,
      "learning_rate": 8.230153234334007e-07,
      "loss": 2.2177,
      "step": 187
    },
    {
      "epoch": 3.425925925925926,
      "grad_norm": 1.5267427939756337,
      "learning_rate": 8.130181761620392e-07,
      "loss": 1.8588,
      "step": 188
    },
    {
      "epoch": 3.4444444444444446,
      "grad_norm": 1.8491296560891988,
      "learning_rate": 8.030403565132942e-07,
      "loss": 2.0561,
      "step": 189
    },
    {
      "epoch": 3.462962962962963,
      "grad_norm": 1.1987453530026493,
      "learning_rate": 7.930828958576417e-07,
      "loss": 2.0565,
      "step": 190
    },
    {
      "epoch": 3.4814814814814814,
      "grad_norm": 1.7195298906541316,
      "learning_rate": 7.831468234611247e-07,
      "loss": 2.0798,
      "step": 191
    },
    {
      "epoch": 3.5,
      "grad_norm": 1.20797833272688,
      "learning_rate": 7.73233166378959e-07,
      "loss": 1.8627,
      "step": 192
    },
    {
      "epoch": 3.5185185185185186,
      "grad_norm": 1.5640684128902402,
      "learning_rate": 7.633429493493729e-07,
      "loss": 2.0137,
      "step": 193
    },
    {
      "epoch": 3.537037037037037,
      "grad_norm": 1.6824510280578688,
      "learning_rate": 7.53477194687683e-07,
      "loss": 2.1517,
      "step": 194
    },
    {
      "epoch": 3.5555555555555554,
      "grad_norm": 1.4155640553151332,
      "learning_rate": 7.4363692218062e-07,
      "loss": 1.9426,
      "step": 195
    },
    {
      "epoch": 3.574074074074074,
      "grad_norm": 1.3939742232946681,
      "learning_rate": 7.338231489809182e-07,
      "loss": 1.7207,
      "step": 196
    },
    {
      "epoch": 3.5925925925925926,
      "grad_norm": 1.4589160544776356,
      "learning_rate": 7.240368895021775e-07,
      "loss": 1.8217,
      "step": 197
    },
    {
      "epoch": 3.611111111111111,
      "grad_norm": 1.3991775241667967,
      "learning_rate": 7.142791553140044e-07,
      "loss": 1.9021,
      "step": 198
    },
    {
      "epoch": 3.6296296296296298,
      "grad_norm": 1.5300112446112555,
      "learning_rate": 7.045509550374509e-07,
      "loss": 1.9647,
      "step": 199
    },
    {
      "epoch": 3.648148148148148,
      "grad_norm": 1.449273309005635,
      "learning_rate": 6.948532942407587e-07,
      "loss": 1.9613,
      "step": 200
    },
    {
      "epoch": 3.6666666666666665,
      "grad_norm": 1.069899380500529,
      "learning_rate": 6.851871753354153e-07,
      "loss": 1.7452,
      "step": 201
    },
    {
      "epoch": 3.685185185185185,
      "grad_norm": 1.5579308530316032,
      "learning_rate": 6.755535974725379e-07,
      "loss": 1.9134,
      "step": 202
    },
    {
      "epoch": 3.7037037037037037,
      "grad_norm": 1.0814459794670248,
      "learning_rate": 6.659535564395982e-07,
      "loss": 1.6609,
      "step": 203
    },
    {
      "epoch": 3.7222222222222223,
      "grad_norm": 1.8876967693657951,
      "learning_rate": 6.563880445574872e-07,
      "loss": 2.0948,
      "step": 204
    },
    {
      "epoch": 3.7407407407407405,
      "grad_norm": 1.6093595543167938,
      "learning_rate": 6.468580505779455e-07,
      "loss": 1.6327,
      "step": 205
    },
    {
      "epoch": 3.7592592592592595,
      "grad_norm": 1.9559640817344714,
      "learning_rate": 6.373645595813596e-07,
      "loss": 1.6376,
      "step": 206
    },
    {
      "epoch": 3.7777777777777777,
      "grad_norm": 2.0405778845643288,
      "learning_rate": 6.27908552874936e-07,
      "loss": 2.1409,
      "step": 207
    },
    {
      "epoch": 3.7962962962962963,
      "grad_norm": 1.230340254163767,
      "learning_rate": 6.184910078912686e-07,
      "loss": 1.686,
      "step": 208
    },
    {
      "epoch": 3.814814814814815,
      "grad_norm": 2.171420345125834,
      "learning_rate": 6.091128980873045e-07,
      "loss": 1.9347,
      "step": 209
    },
    {
      "epoch": 3.8333333333333335,
      "grad_norm": 1.8008532771859842,
      "learning_rate": 5.997751928437219e-07,
      "loss": 2.1292,
      "step": 210
    },
    {
      "epoch": 3.851851851851852,
      "grad_norm": 1.502892647903443,
      "learning_rate": 5.904788573647282e-07,
      "loss": 1.7302,
      "step": 211
    },
    {
      "epoch": 3.8703703703703702,
      "grad_norm": 1.4720170454603325,
      "learning_rate": 5.812248525782901e-07,
      "loss": 1.6652,
      "step": 212
    },
    {
      "epoch": 3.888888888888889,
      "grad_norm": 1.4078435809618528,
      "learning_rate": 5.720141350368072e-07,
      "loss": 1.7847,
      "step": 213
    },
    {
      "epoch": 3.9074074074074074,
      "grad_norm": 1.2860107867972834,
      "learning_rate": 5.628476568182349e-07,
      "loss": 1.818,
      "step": 214
    },
    {
      "epoch": 3.925925925925926,
      "grad_norm": 1.5761560916907795,
      "learning_rate": 5.537263654276743e-07,
      "loss": 1.787,
      "step": 215
    },
    {
      "epoch": 3.9444444444444446,
      "grad_norm": 1.463921943518727,
      "learning_rate": 5.446512036994286e-07,
      "loss": 1.9223,
      "step": 216
    },
    {
      "epoch": 3.962962962962963,
      "grad_norm": 1.2770391505323755,
      "learning_rate": 5.356231096995499e-07,
      "loss": 1.5593,
      "step": 217
    },
    {
      "epoch": 3.9814814814814814,
      "grad_norm": 1.4711865688844035,
      "learning_rate": 5.266430166288704e-07,
      "loss": 2.0863,
      "step": 218
    },
    {
      "epoch": 4.0,
      "grad_norm": 1.1447313661292717,
      "learning_rate": 5.177118527265437e-07,
      "loss": 1.9428,
      "step": 219
    },
    {
      "epoch": 4.018518518518519,
      "grad_norm": 1.6196943319397998,
      "learning_rate": 5.088305411740965e-07,
      "loss": 2.2068,
      "step": 220
    },
    {
      "epoch": 4.018518518518518,
      "grad_norm": 1.2766493962889875,
      "learning_rate": 5.000000000000002e-07,
      "loss": 1.7437,
      "step": 221
    },
    {
      "epoch": 4.037037037037037,
      "grad_norm": 1.594306405599087,
      "learning_rate": 4.912211419847793e-07,
      "loss": 2.0219,
      "step": 222
    },
    {
      "epoch": 4.055555555555555,
      "grad_norm": 1.227716475966799,
      "learning_rate": 4.82494874566662e-07,
      "loss": 2.187,
      "step": 223
    },
    {
      "epoch": 4.074074074074074,
      "grad_norm": 1.2852396998354376,
      "learning_rate": 4.738220997477784e-07,
      "loss": 1.8363,
      "step": 224
    },
    {
      "epoch": 4.092592592592593,
      "grad_norm": 1.0923893050000644,
      "learning_rate": 4.6520371400092584e-07,
      "loss": 1.7177,
      "step": 225
    },
    {
      "epoch": 4.111111111111111,
      "grad_norm": 1.1495819987216884,
      "learning_rate": 4.5664060817690476e-07,
      "loss": 2.0734,
      "step": 226
    },
    {
      "epoch": 4.12962962962963,
      "grad_norm": 1.1120083230916684,
      "learning_rate": 4.481336674124323e-07,
      "loss": 1.7847,
      "step": 227
    },
    {
      "epoch": 4.148148148148148,
      "grad_norm": 0.9789098979808262,
      "learning_rate": 4.3968377103865016e-07,
      "loss": 1.7989,
      "step": 228
    },
    {
      "epoch": 4.166666666666667,
      "grad_norm": 0.9342477457439083,
      "learning_rate": 4.3129179249023274e-07,
      "loss": 1.6785,
      "step": 229
    },
    {
      "epoch": 4.185185185185185,
      "grad_norm": 1.0718449337061247,
      "learning_rate": 4.229585992151006e-07,
      "loss": 1.7953,
      "step": 230
    },
    {
      "epoch": 4.203703703703703,
      "grad_norm": 1.1500516991492213,
      "learning_rate": 4.1468505258475784e-07,
      "loss": 1.3975,
      "step": 231
    },
    {
      "epoch": 4.222222222222222,
      "grad_norm": 0.9650831232767911,
      "learning_rate": 4.0647200780525483e-07,
      "loss": 1.8603,
      "step": 232
    },
    {
      "epoch": 4.2407407407407405,
      "grad_norm": 1.0207088687244406,
      "learning_rate": 3.983203138287876e-07,
      "loss": 1.9807,
      "step": 233
    },
    {
      "epoch": 4.2592592592592595,
      "grad_norm": 1.1991752171611891,
      "learning_rate": 3.9023081326594564e-07,
      "loss": 2.2322,
      "step": 234
    },
    {
      "epoch": 4.277777777777778,
      "grad_norm": 1.0807801212200088,
      "learning_rate": 3.822043422986153e-07,
      "loss": 1.6295,
      "step": 235
    },
    {
      "epoch": 4.296296296296296,
      "grad_norm": 1.0103392155699495,
      "learning_rate": 3.742417305935442e-07,
      "loss": 1.7882,
      "step": 236
    },
    {
      "epoch": 4.314814814814815,
      "grad_norm": 1.0657639750720669,
      "learning_rate": 3.663438012165848e-07,
      "loss": 1.6027,
      "step": 237
    },
    {
      "epoch": 4.333333333333333,
      "grad_norm": 0.9495451533397854,
      "learning_rate": 3.5851137054761426e-07,
      "loss": 1.8212,
      "step": 238
    },
    {
      "epoch": 4.351851851851852,
      "grad_norm": 1.0780389016215326,
      "learning_rate": 3.507452481961495e-07,
      "loss": 1.6304,
      "step": 239
    },
    {
      "epoch": 4.37037037037037,
      "grad_norm": 1.0244203325558825,
      "learning_rate": 3.430462369176619e-07,
      "loss": 1.9347,
      "step": 240
    },
    {
      "epoch": 4.388888888888889,
      "grad_norm": 0.9762810523750869,
      "learning_rate": 3.3541513253059726e-07,
      "loss": 2.0351,
      "step": 241
    },
    {
      "epoch": 4.407407407407407,
      "grad_norm": 0.8894982063199672,
      "learning_rate": 3.278527238341163e-07,
      "loss": 1.7788,
      "step": 242
    },
    {
      "epoch": 4.425925925925926,
      "grad_norm": 0.9573443483478868,
      "learning_rate": 3.2035979252655976e-07,
      "loss": 1.6824,
      "step": 243
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 0.878347387417952,
      "learning_rate": 3.129371131246459e-07,
      "loss": 1.7893,
      "step": 244
    },
    {
      "epoch": 4.462962962962963,
      "grad_norm": 1.2131347174643223,
      "learning_rate": 3.05585452883412e-07,
      "loss": 2.4755,
      "step": 245
    },
    {
      "epoch": 4.481481481481482,
      "grad_norm": 0.9278993006726863,
      "learning_rate": 2.9830557171690693e-07,
      "loss": 2.051,
      "step": 246
    },
    {
      "epoch": 4.5,
      "grad_norm": 0.9769923688632531,
      "learning_rate": 2.910982221196404e-07,
      "loss": 1.8307,
      "step": 247
    },
    {
      "epoch": 4.518518518518518,
      "grad_norm": 1.0084007217465136,
      "learning_rate": 2.8396414908880095e-07,
      "loss": 2.0386,
      "step": 248
    },
    {
      "epoch": 4.537037037037037,
      "grad_norm": 1.0273787706173494,
      "learning_rate": 2.769040900472488e-07,
      "loss": 1.9072,
      "step": 249
    },
    {
      "epoch": 4.555555555555555,
      "grad_norm": 0.8621559648712259,
      "learning_rate": 2.6991877476728985e-07,
      "loss": 1.706,
      "step": 250
    },
    {
      "epoch": 4.574074074074074,
      "grad_norm": 0.8247377172080764,
      "learning_rate": 2.6300892529524264e-07,
      "loss": 1.8414,
      "step": 251
    },
    {
      "epoch": 4.592592592592593,
      "grad_norm": 0.8925073470001154,
      "learning_rate": 2.56175255876804e-07,
      "loss": 1.9007,
      "step": 252
    },
    {
      "epoch": 4.611111111111111,
      "grad_norm": 0.7860274094152706,
      "learning_rate": 2.494184728832179e-07,
      "loss": 1.8654,
      "step": 253
    },
    {
      "epoch": 4.62962962962963,
      "grad_norm": 0.8936613069940655,
      "learning_rate": 2.427392747382623e-07,
      "loss": 1.6996,
      "step": 254
    },
    {
      "epoch": 4.648148148148148,
      "grad_norm": 1.0827181264619206,
      "learning_rate": 2.3613835184605523e-07,
      "loss": 1.9413,
      "step": 255
    },
    {
      "epoch": 4.666666666666667,
      "grad_norm": 0.8918696543620299,
      "learning_rate": 2.2961638651968974e-07,
      "loss": 1.856,
      "step": 256
    },
    {
      "epoch": 4.685185185185185,
      "grad_norm": 0.9976782397503938,
      "learning_rate": 2.2317405291070567e-07,
      "loss": 1.8228,
      "step": 257
    },
    {
      "epoch": 4.703703703703704,
      "grad_norm": 1.021922767232776,
      "learning_rate": 2.1681201693940666e-07,
      "loss": 2.0057,
      "step": 258
    },
    {
      "epoch": 4.722222222222222,
      "grad_norm": 1.1673943142630625,
      "learning_rate": 2.1053093622602402e-07,
      "loss": 1.9204,
      "step": 259
    },
    {
      "epoch": 4.7407407407407405,
      "grad_norm": 1.0912049168909328,
      "learning_rate": 2.043314600227425e-07,
      "loss": 1.8173,
      "step": 260
    },
    {
      "epoch": 4.7592592592592595,
      "grad_norm": 0.9358084522077252,
      "learning_rate": 1.9821422914658957e-07,
      "loss": 2.0846,
      "step": 261
    },
    {
      "epoch": 4.777777777777778,
      "grad_norm": 1.0481784665647413,
      "learning_rate": 1.921798759131953e-07,
      "loss": 1.9789,
      "step": 262
    },
    {
      "epoch": 4.796296296296296,
      "grad_norm": 0.983048254792995,
      "learning_rate": 1.8622902407143392e-07,
      "loss": 1.9294,
      "step": 263
    },
    {
      "epoch": 4.814814814814815,
      "grad_norm": 0.8359638487960833,
      "learning_rate": 1.8036228873894744e-07,
      "loss": 1.7806,
      "step": 264
    },
    {
      "epoch": 4.833333333333333,
      "grad_norm": 1.1295927764034195,
      "learning_rate": 1.7458027633856475e-07,
      "loss": 1.9495,
      "step": 265
    },
    {
      "epoch": 4.851851851851852,
      "grad_norm": 1.1032897990848558,
      "learning_rate": 1.6888358453561646e-07,
      "loss": 2.0724,
      "step": 266
    },
    {
      "epoch": 4.87037037037037,
      "grad_norm": 0.855002738874884,
      "learning_rate": 1.632728021761579e-07,
      "loss": 2.102,
      "step": 267
    },
    {
      "epoch": 4.888888888888889,
      "grad_norm": 1.0646161730662291,
      "learning_rate": 1.5774850922610116e-07,
      "loss": 1.9046,
      "step": 268
    },
    {
      "epoch": 4.907407407407407,
      "grad_norm": 1.0109654313968932,
      "learning_rate": 1.5231127671126676e-07,
      "loss": 2.0854,
      "step": 269
    },
    {
      "epoch": 4.925925925925926,
      "grad_norm": 0.9390534047671891,
      "learning_rate": 1.4696166665835852e-07,
      "loss": 2.1436,
      "step": 270
    },
    {
      "epoch": 4.944444444444445,
      "grad_norm": 0.9838446669064714,
      "learning_rate": 1.4170023203686875e-07,
      "loss": 1.9317,
      "step": 271
    },
    {
      "epoch": 4.962962962962963,
      "grad_norm": 1.0678273880700424,
      "learning_rate": 1.3652751670192075e-07,
      "loss": 1.8309,
      "step": 272
    },
    {
      "epoch": 4.981481481481482,
      "grad_norm": 1.1853311551704062,
      "learning_rate": 1.3144405533805136e-07,
      "loss": 1.948,
      "step": 273
    },
    {
      "epoch": 5.0,
      "grad_norm": 1.0844767215232378,
      "learning_rate": 1.2645037340394281e-07,
      "loss": 2.1066,
      "step": 274
    },
    {
      "epoch": 5.018518518518518,
      "grad_norm": 0.8509695959322425,
      "learning_rate": 1.2154698707810928e-07,
      "loss": 1.9217,
      "step": 275
    },
    {
      "epoch": 5.037037037037037,
      "grad_norm": 0.9599815386335595,
      "learning_rate": 1.167344032055394e-07,
      "loss": 1.9898,
      "step": 276
    },
    {
      "epoch": 5.055555555555555,
      "grad_norm": 0.9561022219351966,
      "learning_rate": 1.1201311924530688e-07,
      "loss": 1.6967,
      "step": 277
    },
    {
      "epoch": 5.074074074074074,
      "grad_norm": 0.8614534074294055,
      "learning_rate": 1.0738362321914995e-07,
      "loss": 1.7586,
      "step": 278
    },
    {
      "epoch": 5.092592592592593,
      "grad_norm": 0.884706815883145,
      "learning_rate": 1.0284639366102598e-07,
      "loss": 1.8692,
      "step": 279
    },
    {
      "epoch": 5.111111111111111,
      "grad_norm": 0.8641496604329509,
      "learning_rate": 9.840189956764677e-08,
      "loss": 2.1101,
      "step": 280
    },
    {
      "epoch": 5.12962962962963,
      "grad_norm": 0.8465414034017087,
      "learning_rate": 9.405060035000134e-08,
      "loss": 1.7827,
      "step": 281
    },
    {
      "epoch": 5.148148148148148,
      "grad_norm": 0.6966794157650356,
      "learning_rate": 8.979294578586738e-08,
      "loss": 1.6446,
      "step": 282
    },
    {
      "epoch": 5.166666666666667,
      "grad_norm": 0.8581271311276034,
      "learning_rate": 8.562937597331898e-08,
      "loss": 1.7243,
      "step": 283
    },
    {
      "epoch": 5.185185185185185,
      "grad_norm": 0.9976947326325505,
      "learning_rate": 8.156032128523694e-08,
      "loss": 1.8994,
      "step": 284
    },
    {
      "epoch": 5.203703703703703,
      "grad_norm": 0.9786757162446749,
      "learning_rate": 7.758620232482083e-08,
      "loss": 1.8625,
      "step": 285
    },
    {
      "epoch": 5.222222222222222,
      "grad_norm": 0.7563393752170862,
      "learning_rate": 7.370742988211364e-08,
      "loss": 1.7512,
      "step": 286
    },
    {
      "epoch": 5.2407407407407405,
      "grad_norm": 0.7955178168012043,
      "learning_rate": 6.99244048915405e-08,
      "loss": 2.2105,
      "step": 287
    },
    {
      "epoch": 5.2592592592592595,
      "grad_norm": 0.8951178929520269,
      "learning_rate": 6.623751839046455e-08,
      "loss": 1.8276,
      "step": 288
    },
    {
      "epoch": 5.277777777777778,
      "grad_norm": 0.9912120605663316,
      "learning_rate": 6.264715147876742e-08,
      "loss": 2.2784,
      "step": 289
    },
    {
      "epoch": 5.296296296296296,
      "grad_norm": 0.759976000502015,
      "learning_rate": 5.915367527945614e-08,
      "loss": 1.9346,
      "step": 290
    },
    {
      "epoch": 5.314814814814815,
      "grad_norm": 0.7423091105639062,
      "learning_rate": 5.575745090030137e-08,
      "loss": 1.8795,
      "step": 291
    },
    {
      "epoch": 5.333333333333333,
      "grad_norm": 0.7811530255930925,
      "learning_rate": 5.245882939651181e-08,
      "loss": 2.0584,
      "step": 292
    },
    {
      "epoch": 5.351851851851852,
      "grad_norm": 0.9202352755672565,
      "learning_rate": 4.9258151734445694e-08,
      "loss": 2.0563,
      "step": 293
    },
    {
      "epoch": 5.37037037037037,
      "grad_norm": 0.7972657702760176,
      "learning_rate": 4.6155748756367294e-08,
      "loss": 1.8333,
      "step": 294
    },
    {
      "epoch": 5.388888888888889,
      "grad_norm": 0.6829451582697305,
      "learning_rate": 4.3151941146248873e-08,
      "loss": 1.9896,
      "step": 295
    },
    {
      "epoch": 5.407407407407407,
      "grad_norm": 0.7886670762082094,
      "learning_rate": 4.0247039396622e-08,
      "loss": 1.8183,
      "step": 296
    },
    {
      "epoch": 5.425925925925926,
      "grad_norm": 0.8840244220041553,
      "learning_rate": 3.7441343776484113e-08,
      "loss": 1.9354,
      "step": 297
    },
    {
      "epoch": 5.444444444444445,
      "grad_norm": 0.71587738270711,
      "learning_rate": 3.4735144300260255e-08,
      "loss": 2.0167,
      "step": 298
    },
    {
      "epoch": 5.462962962962963,
      "grad_norm": 0.7108094024246895,
      "learning_rate": 3.212872069782513e-08,
      "loss": 1.7169,
      "step": 299
    },
    {
      "epoch": 5.481481481481482,
      "grad_norm": 0.6662930242485889,
      "learning_rate": 2.962234238558925e-08,
      "loss": 2.2062,
      "step": 300
    },
    {
      "epoch": 5.5,
      "grad_norm": 0.7122621954506775,
      "learning_rate": 2.721626843864977e-08,
      "loss": 2.0591,
      "step": 301
    },
    {
      "epoch": 5.518518518518518,
      "grad_norm": 0.626318180659774,
      "learning_rate": 2.491074756401068e-08,
      "loss": 1.5866,
      "step": 302
    },
    {
      "epoch": 5.537037037037037,
      "grad_norm": 0.6909592708288532,
      "learning_rate": 2.2706018074875043e-08,
      "loss": 1.9005,
      "step": 303
    },
    {
      "epoch": 5.555555555555555,
      "grad_norm": 0.7144569439769612,
      "learning_rate": 2.0602307866012246e-08,
      "loss": 2.0294,
      "step": 304
    },
    {
      "epoch": 5.574074074074074,
      "grad_norm": 0.684647174393133,
      "learning_rate": 1.8599834390199853e-08,
      "loss": 1.6046,
      "step": 305
    },
    {
      "epoch": 5.592592592592593,
      "grad_norm": 0.7752801436279185,
      "learning_rate": 1.6698804635747576e-08,
      "loss": 1.7937,
      "step": 306
    },
    {
      "epoch": 5.611111111111111,
      "grad_norm": 0.6862611972609113,
      "learning_rate": 1.4899415105101066e-08,
      "loss": 1.7256,
      "step": 307
    },
    {
      "epoch": 5.62962962962963,
      "grad_norm": 0.6608135193001434,
      "learning_rate": 1.3201851794530371e-08,
      "loss": 1.7763,
      "step": 308
    },
    {
      "epoch": 5.648148148148148,
      "grad_norm": 0.7625095579861546,
      "learning_rate": 1.1606290174903888e-08,
      "loss": 2.0082,
      "step": 309
    },
    {
      "epoch": 5.666666666666667,
      "grad_norm": 0.6914220267730987,
      "learning_rate": 1.0112895173551183e-08,
      "loss": 1.9359,
      "step": 310
    },
    {
      "epoch": 5.685185185185185,
      "grad_norm": 0.6505975431309626,
      "learning_rate": 8.721821157214316e-09,
      "loss": 1.9317,
      "step": 311
    },
    {
      "epoch": 5.703703703703704,
      "grad_norm": 0.6947915176450158,
      "learning_rate": 7.433211916092141e-09,
      "loss": 1.6243,
      "step": 312
    },
    {
      "epoch": 5.722222222222222,
      "grad_norm": 0.6360099423433963,
      "learning_rate": 6.247200648976991e-09,
      "loss": 1.9931,
      "step": 313
    },
    {
      "epoch": 5.7407407407407405,
      "grad_norm": 0.6796797146249973,
      "learning_rate": 5.163909949486233e-09,
      "loss": 1.9858,
      "step": 314
    },
    {
      "epoch": 5.7592592592592595,
      "grad_norm": 0.7636965994787633,
      "learning_rate": 4.183451793390747e-09,
      "loss": 1.8201,
      "step": 315
    },
    {
      "epoch": 5.777777777777778,
      "grad_norm": 0.6434704456483539,
      "learning_rate": 3.30592752703962e-09,
      "loss": 1.5983,
      "step": 316
    },
    {
      "epoch": 5.796296296296296,
      "grad_norm": 0.6697682736960676,
      "learning_rate": 2.531427856885093e-09,
      "loss": 1.985,
      "step": 317
    },
    {
      "epoch": 5.814814814814815,
      "grad_norm": 0.657234650874368,
      "learning_rate": 1.8600328401061627e-09,
      "loss": 2.0918,
      "step": 318
    },
    {
      "epoch": 5.833333333333333,
      "grad_norm": 0.6249721935624161,
      "learning_rate": 1.2918118763335372e-09,
      "loss": 2.1123,
      "step": 319
    },
    {
      "epoch": 5.851851851851852,
      "grad_norm": 0.7274585554347512,
      "learning_rate": 8.268237004757095e-10,
      "loss": 2.2962,
      "step": 320
    },
    {
      "epoch": 5.87037037037037,
      "grad_norm": 0.6060864357328691,
      "learning_rate": 4.651163766484778e-10,
      "loss": 1.6461,
      "step": 321
    },
    {
      "epoch": 5.888888888888889,
      "grad_norm": 0.6626618247650778,
      "learning_rate": 2.0672729320581063e-10,
      "loss": 2.0178,
      "step": 322
    },
    {
      "epoch": 5.907407407407407,
      "grad_norm": 0.5905608542721459,
      "learning_rate": 5.1683158875936994e-11,
      "loss": 1.7269,
      "step": 323
    },
    {
      "epoch": 5.925925925925926,
      "grad_norm": 0.7138681736753105,
      "learning_rate": 0.0,
      "loss": 2.11,
      "step": 324
    }
  ],
  "logging_steps": 1,
  "max_steps": 324,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 54,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 128024720179200.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}