|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.99492385786802, |
|
"eval_steps": 50, |
|
"global_step": 885, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01692047377326565, |
|
"grad_norm": 17.95940849112974, |
|
"learning_rate": 5e-07, |
|
"loss": 1.7425, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0338409475465313, |
|
"grad_norm": 12.23967055037602, |
|
"learning_rate": 1e-06, |
|
"loss": 1.599, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.050761421319796954, |
|
"grad_norm": 7.600510364072396, |
|
"learning_rate": 9.99919433964529e-07, |
|
"loss": 1.2976, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0676818950930626, |
|
"grad_norm": 4.134278668785207, |
|
"learning_rate": 9.996777618216605e-07, |
|
"loss": 1.1572, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08460236886632826, |
|
"grad_norm": 3.846352071719652, |
|
"learning_rate": 9.992750614536604e-07, |
|
"loss": 1.0495, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.10152284263959391, |
|
"grad_norm": 3.4799081252721886, |
|
"learning_rate": 9.98711462636417e-07, |
|
"loss": 1.0222, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11844331641285956, |
|
"grad_norm": 3.6435583580883644, |
|
"learning_rate": 9.979871469976195e-07, |
|
"loss": 0.982, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.1353637901861252, |
|
"grad_norm": 3.474443331449369, |
|
"learning_rate": 9.971023479582256e-07, |
|
"loss": 0.9659, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.15228426395939088, |
|
"grad_norm": 3.5291718369580094, |
|
"learning_rate": 9.960573506572389e-07, |
|
"loss": 0.9517, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1692047377326565, |
|
"grad_norm": 3.6607844369820763, |
|
"learning_rate": 9.948524918598173e-07, |
|
"loss": 0.9744, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1692047377326565, |
|
"eval_loss": 0.9363481402397156, |
|
"eval_runtime": 147.9244, |
|
"eval_samples_per_second": 56.786, |
|
"eval_steps_per_second": 0.892, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.18612521150592218, |
|
"grad_norm": 3.5029465776623305, |
|
"learning_rate": 9.934881598487478e-07, |
|
"loss": 0.9291, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.20304568527918782, |
|
"grad_norm": 3.420583685374325, |
|
"learning_rate": 9.919647942993147e-07, |
|
"loss": 0.9373, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.21996615905245348, |
|
"grad_norm": 3.639006411388483, |
|
"learning_rate": 9.9028288613761e-07, |
|
"loss": 0.9371, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.23688663282571912, |
|
"grad_norm": 3.739200188919654, |
|
"learning_rate": 9.884429773823236e-07, |
|
"loss": 0.9168, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.25380710659898476, |
|
"grad_norm": 3.6961021681290647, |
|
"learning_rate": 9.864456609700723e-07, |
|
"loss": 0.9036, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2707275803722504, |
|
"grad_norm": 3.469126484018746, |
|
"learning_rate": 9.842915805643156e-07, |
|
"loss": 0.8789, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2876480541455161, |
|
"grad_norm": 3.4540644342177953, |
|
"learning_rate": 9.819814303479267e-07, |
|
"loss": 0.8843, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.30456852791878175, |
|
"grad_norm": 3.491589457047051, |
|
"learning_rate": 9.795159547994828e-07, |
|
"loss": 0.878, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.32148900169204736, |
|
"grad_norm": 3.3672158895857582, |
|
"learning_rate": 9.76895948453346e-07, |
|
"loss": 0.8817, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.338409475465313, |
|
"grad_norm": 3.4204437907535032, |
|
"learning_rate": 9.74122255643613e-07, |
|
"loss": 0.8749, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.338409475465313, |
|
"eval_loss": 0.8794726729393005, |
|
"eval_runtime": 146.5584, |
|
"eval_samples_per_second": 57.315, |
|
"eval_steps_per_second": 0.901, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3553299492385787, |
|
"grad_norm": 3.499218396193934, |
|
"learning_rate": 9.711957702320174e-07, |
|
"loss": 0.8689, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.37225042301184436, |
|
"grad_norm": 3.575982847323226, |
|
"learning_rate": 9.681174353198686e-07, |
|
"loss": 0.8552, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.38917089678510997, |
|
"grad_norm": 3.5820322461640624, |
|
"learning_rate": 9.648882429441256e-07, |
|
"loss": 0.8723, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.40609137055837563, |
|
"grad_norm": 3.428649928071515, |
|
"learning_rate": 9.615092337576987e-07, |
|
"loss": 0.8737, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4230118443316413, |
|
"grad_norm": 3.439041413094408, |
|
"learning_rate": 9.579814966940833e-07, |
|
"loss": 0.8574, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.43993231810490696, |
|
"grad_norm": 3.611321063385662, |
|
"learning_rate": 9.543061686164372e-07, |
|
"loss": 0.8774, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.45685279187817257, |
|
"grad_norm": 3.3003493132482657, |
|
"learning_rate": 9.504844339512094e-07, |
|
"loss": 0.8594, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.47377326565143824, |
|
"grad_norm": 3.4741051861556684, |
|
"learning_rate": 9.465175243064428e-07, |
|
"loss": 0.8674, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4906937394247039, |
|
"grad_norm": 3.2142657401436416, |
|
"learning_rate": 9.424067180748691e-07, |
|
"loss": 0.8648, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.5076142131979695, |
|
"grad_norm": 3.5722793490057336, |
|
"learning_rate": 9.381533400219317e-07, |
|
"loss": 0.8423, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5076142131979695, |
|
"eval_loss": 0.8532436490058899, |
|
"eval_runtime": 146.5502, |
|
"eval_samples_per_second": 57.318, |
|
"eval_steps_per_second": 0.901, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5245346869712352, |
|
"grad_norm": 3.387238895689527, |
|
"learning_rate": 9.337587608588588e-07, |
|
"loss": 0.8344, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5414551607445008, |
|
"grad_norm": 3.3257959351398165, |
|
"learning_rate": 9.29224396800933e-07, |
|
"loss": 0.8296, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5583756345177665, |
|
"grad_norm": 3.4206543705067425, |
|
"learning_rate": 9.245517091110968e-07, |
|
"loss": 0.8281, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.5752961082910322, |
|
"grad_norm": 3.2917606038623672, |
|
"learning_rate": 9.197422036290386e-07, |
|
"loss": 0.8388, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5922165820642978, |
|
"grad_norm": 3.6356342856497874, |
|
"learning_rate": 9.147974302859156e-07, |
|
"loss": 0.8479, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.6091370558375635, |
|
"grad_norm": 3.531537582517598, |
|
"learning_rate": 9.097189826048659e-07, |
|
"loss": 0.8465, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.626057529610829, |
|
"grad_norm": 3.534890106733192, |
|
"learning_rate": 9.045084971874737e-07, |
|
"loss": 0.8338, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6429780033840947, |
|
"grad_norm": 3.5162595243845476, |
|
"learning_rate": 8.991676531863507e-07, |
|
"loss": 0.8454, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6598984771573604, |
|
"grad_norm": 3.6712481126005607, |
|
"learning_rate": 8.93698171764006e-07, |
|
"loss": 0.8239, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.676818950930626, |
|
"grad_norm": 3.3780652656023573, |
|
"learning_rate": 8.881018155381765e-07, |
|
"loss": 0.8269, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.676818950930626, |
|
"eval_loss": 0.8365465998649597, |
|
"eval_runtime": 146.6071, |
|
"eval_samples_per_second": 57.296, |
|
"eval_steps_per_second": 0.9, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6937394247038917, |
|
"grad_norm": 3.3425859477068123, |
|
"learning_rate": 8.823803880137992e-07, |
|
"loss": 0.8382, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.7106598984771574, |
|
"grad_norm": 3.2738301474432956, |
|
"learning_rate": 8.765357330018055e-07, |
|
"loss": 0.8456, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.727580372250423, |
|
"grad_norm": 3.563151004829781, |
|
"learning_rate": 8.705697340249274e-07, |
|
"loss": 0.8266, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.7445008460236887, |
|
"grad_norm": 3.1683419089159126, |
|
"learning_rate": 8.644843137107057e-07, |
|
"loss": 0.8403, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7614213197969543, |
|
"grad_norm": 3.609357791743884, |
|
"learning_rate": 8.58281433171896e-07, |
|
"loss": 0.8244, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.7783417935702199, |
|
"grad_norm": 3.421312500448169, |
|
"learning_rate": 8.519630913744724e-07, |
|
"loss": 0.8288, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7952622673434856, |
|
"grad_norm": 3.577171842922122, |
|
"learning_rate": 8.455313244934324e-07, |
|
"loss": 0.8167, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.8121827411167513, |
|
"grad_norm": 3.325456007325782, |
|
"learning_rate": 8.389882052566105e-07, |
|
"loss": 0.8118, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8291032148900169, |
|
"grad_norm": 3.5627054194832914, |
|
"learning_rate": 8.323358422767128e-07, |
|
"loss": 0.8378, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.8460236886632826, |
|
"grad_norm": 3.57666960479274, |
|
"learning_rate": 8.255763793717867e-07, |
|
"loss": 0.8223, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8460236886632826, |
|
"eval_loss": 0.8226217031478882, |
|
"eval_runtime": 146.6337, |
|
"eval_samples_per_second": 57.286, |
|
"eval_steps_per_second": 0.9, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8629441624365483, |
|
"grad_norm": 3.4619536067260244, |
|
"learning_rate": 8.187119948743449e-07, |
|
"loss": 0.8215, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.8798646362098139, |
|
"grad_norm": 3.3755438705432805, |
|
"learning_rate": 8.117449009293668e-07, |
|
"loss": 0.7959, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8967851099830795, |
|
"grad_norm": 3.3504490747841595, |
|
"learning_rate": 8.046773427814041e-07, |
|
"loss": 0.8198, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.9137055837563451, |
|
"grad_norm": 3.652293584185451, |
|
"learning_rate": 7.975115980510185e-07, |
|
"loss": 0.8198, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.9306260575296108, |
|
"grad_norm": 3.497277962331386, |
|
"learning_rate": 7.902499760007867e-07, |
|
"loss": 0.8181, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.9475465313028765, |
|
"grad_norm": 3.5043433150139336, |
|
"learning_rate": 7.828948167911073e-07, |
|
"loss": 0.8151, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.9644670050761421, |
|
"grad_norm": 3.5662591692739034, |
|
"learning_rate": 7.754484907260512e-07, |
|
"loss": 0.8192, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.9813874788494078, |
|
"grad_norm": 3.741586297285277, |
|
"learning_rate": 7.679133974894982e-07, |
|
"loss": 0.7975, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9983079526226735, |
|
"grad_norm": 3.2550447105418057, |
|
"learning_rate": 7.602919653718043e-07, |
|
"loss": 0.7885, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.015228426395939, |
|
"grad_norm": 3.3766404706055613, |
|
"learning_rate": 7.525866504872506e-07, |
|
"loss": 0.7651, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.015228426395939, |
|
"eval_loss": 0.8148965239524841, |
|
"eval_runtime": 146.61, |
|
"eval_samples_per_second": 57.295, |
|
"eval_steps_per_second": 0.9, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.0321489001692048, |
|
"grad_norm": 3.5183190070892314, |
|
"learning_rate": 7.447999359825262e-07, |
|
"loss": 0.7393, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.0490693739424704, |
|
"grad_norm": 3.7614891711473657, |
|
"learning_rate": 7.369343312364993e-07, |
|
"loss": 0.7621, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.0659898477157361, |
|
"grad_norm": 3.5318777711133125, |
|
"learning_rate": 7.289923710515338e-07, |
|
"loss": 0.7546, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.0829103214890017, |
|
"grad_norm": 3.5586679876971754, |
|
"learning_rate": 7.209766148366134e-07, |
|
"loss": 0.759, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.0998307952622675, |
|
"grad_norm": 3.3557844891140305, |
|
"learning_rate": 7.128896457825363e-07, |
|
"loss": 0.7445, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.116751269035533, |
|
"grad_norm": 3.439711853714508, |
|
"learning_rate": 7.047340700294453e-07, |
|
"loss": 0.7406, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.1336717428087986, |
|
"grad_norm": 3.522824978614251, |
|
"learning_rate": 6.965125158269618e-07, |
|
"loss": 0.7368, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.1505922165820643, |
|
"grad_norm": 4.008601044386287, |
|
"learning_rate": 6.882276326871959e-07, |
|
"loss": 0.7578, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.16751269035533, |
|
"grad_norm": 3.6557873733426955, |
|
"learning_rate": 6.798820905309035e-07, |
|
"loss": 0.7332, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.1844331641285957, |
|
"grad_norm": 3.5152732593214515, |
|
"learning_rate": 6.714785788270657e-07, |
|
"loss": 0.7388, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.1844331641285957, |
|
"eval_loss": 0.8107805252075195, |
|
"eval_runtime": 146.5199, |
|
"eval_samples_per_second": 57.33, |
|
"eval_steps_per_second": 0.901, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.2013536379018612, |
|
"grad_norm": 3.7338182802228093, |
|
"learning_rate": 6.630198057261709e-07, |
|
"loss": 0.7406, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.218274111675127, |
|
"grad_norm": 3.5135812697699724, |
|
"learning_rate": 6.545084971874736e-07, |
|
"loss": 0.7421, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.2351945854483926, |
|
"grad_norm": 3.508021675469905, |
|
"learning_rate": 6.459473961005168e-07, |
|
"loss": 0.7755, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.252115059221658, |
|
"grad_norm": 3.5287017860167196, |
|
"learning_rate": 6.373392614011951e-07, |
|
"loss": 0.7408, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.2690355329949239, |
|
"grad_norm": 3.6233235029794093, |
|
"learning_rate": 6.286868671826511e-07, |
|
"loss": 0.751, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.2859560067681894, |
|
"grad_norm": 3.5669498367227304, |
|
"learning_rate": 6.199930018012829e-07, |
|
"loss": 0.7276, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.3028764805414552, |
|
"grad_norm": 3.7287000280408176, |
|
"learning_rate": 6.112604669781572e-07, |
|
"loss": 0.7278, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.3197969543147208, |
|
"grad_norm": 3.824405237133237, |
|
"learning_rate": 6.024920768961152e-07, |
|
"loss": 0.743, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.3367174280879865, |
|
"grad_norm": 3.5197677626280965, |
|
"learning_rate": 5.936906572928624e-07, |
|
"loss": 0.7159, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.353637901861252, |
|
"grad_norm": 3.759524343808812, |
|
"learning_rate": 5.848590445503344e-07, |
|
"loss": 0.7429, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.353637901861252, |
|
"eval_loss": 0.805133044719696, |
|
"eval_runtime": 146.7971, |
|
"eval_samples_per_second": 57.222, |
|
"eval_steps_per_second": 0.899, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.3705583756345177, |
|
"grad_norm": 3.797267695279564, |
|
"learning_rate": 5.760000847806337e-07, |
|
"loss": 0.7464, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.3874788494077834, |
|
"grad_norm": 3.439223330784389, |
|
"learning_rate": 5.671166329088277e-07, |
|
"loss": 0.725, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.404399323181049, |
|
"grad_norm": 3.6761682639396653, |
|
"learning_rate": 5.582115517529114e-07, |
|
"loss": 0.7311, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.4213197969543148, |
|
"grad_norm": 3.571768390407566, |
|
"learning_rate": 5.492877111012218e-07, |
|
"loss": 0.7393, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.4382402707275803, |
|
"grad_norm": 3.8046958424761623, |
|
"learning_rate": 5.403479867876087e-07, |
|
"loss": 0.758, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.455160744500846, |
|
"grad_norm": 3.552061598209118, |
|
"learning_rate": 5.313952597646567e-07, |
|
"loss": 0.741, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.4720812182741116, |
|
"grad_norm": 3.5137582048526546, |
|
"learning_rate": 5.224324151752575e-07, |
|
"loss": 0.736, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.4890016920473772, |
|
"grad_norm": 3.6806640730520046, |
|
"learning_rate": 5.134623414228315e-07, |
|
"loss": 0.7414, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.505922165820643, |
|
"grad_norm": 3.7306988391241203, |
|
"learning_rate": 5.044879292404989e-07, |
|
"loss": 0.7578, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.5228426395939088, |
|
"grad_norm": 3.5044826704791543, |
|
"learning_rate": 4.95512070759501e-07, |
|
"loss": 0.7481, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.5228426395939088, |
|
"eval_loss": 0.8002220392227173, |
|
"eval_runtime": 146.6261, |
|
"eval_samples_per_second": 57.289, |
|
"eval_steps_per_second": 0.9, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.5397631133671743, |
|
"grad_norm": 3.5593238876932416, |
|
"learning_rate": 4.865376585771687e-07, |
|
"loss": 0.741, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.5566835871404399, |
|
"grad_norm": 3.9021045537145174, |
|
"learning_rate": 4.775675848247427e-07, |
|
"loss": 0.7462, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.5736040609137056, |
|
"grad_norm": 3.603020142861588, |
|
"learning_rate": 4.686047402353433e-07, |
|
"loss": 0.7344, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.5905245346869712, |
|
"grad_norm": 3.5798855947417247, |
|
"learning_rate": 4.596520132123914e-07, |
|
"loss": 0.7246, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.6074450084602367, |
|
"grad_norm": 3.392440988553216, |
|
"learning_rate": 4.507122888987782e-07, |
|
"loss": 0.7304, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.6243654822335025, |
|
"grad_norm": 3.7346005543444307, |
|
"learning_rate": 4.417884482470886e-07, |
|
"loss": 0.7329, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.6412859560067683, |
|
"grad_norm": 3.929271128512869, |
|
"learning_rate": 4.328833670911724e-07, |
|
"loss": 0.7529, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.6582064297800339, |
|
"grad_norm": 3.5171536414776163, |
|
"learning_rate": 4.239999152193664e-07, |
|
"loss": 0.7531, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.6751269035532994, |
|
"grad_norm": 3.574806818948794, |
|
"learning_rate": 4.1514095544966557e-07, |
|
"loss": 0.7418, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.6920473773265652, |
|
"grad_norm": 3.5129274484405486, |
|
"learning_rate": 4.0630934270713755e-07, |
|
"loss": 0.7308, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.6920473773265652, |
|
"eval_loss": 0.795360267162323, |
|
"eval_runtime": 146.6381, |
|
"eval_samples_per_second": 57.284, |
|
"eval_steps_per_second": 0.9, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.708967851099831, |
|
"grad_norm": 3.483481666306759, |
|
"learning_rate": 3.9750792310388483e-07, |
|
"loss": 0.7311, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.7258883248730963, |
|
"grad_norm": 3.5163371793641387, |
|
"learning_rate": 3.8873953302184283e-07, |
|
"loss": 0.7268, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.742808798646362, |
|
"grad_norm": 3.746899781553285, |
|
"learning_rate": 3.80006998198717e-07, |
|
"loss": 0.7471, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.7597292724196278, |
|
"grad_norm": 3.6600329176410735, |
|
"learning_rate": 3.713131328173489e-07, |
|
"loss": 0.7426, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.7766497461928934, |
|
"grad_norm": 3.715103932641678, |
|
"learning_rate": 3.62660738598805e-07, |
|
"loss": 0.7452, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.793570219966159, |
|
"grad_norm": 3.6219953866879036, |
|
"learning_rate": 3.5405260389948333e-07, |
|
"loss": 0.7447, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.8104906937394247, |
|
"grad_norm": 3.5483251792927866, |
|
"learning_rate": 3.454915028125263e-07, |
|
"loss": 0.7219, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.8274111675126905, |
|
"grad_norm": 3.687667827428392, |
|
"learning_rate": 3.369801942738291e-07, |
|
"loss": 0.7297, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.844331641285956, |
|
"grad_norm": 3.538997656633232, |
|
"learning_rate": 3.285214211729343e-07, |
|
"loss": 0.7498, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.8612521150592216, |
|
"grad_norm": 3.763523903194986, |
|
"learning_rate": 3.2011790946909666e-07, |
|
"loss": 0.7306, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.8612521150592216, |
|
"eval_loss": 0.7919500470161438, |
|
"eval_runtime": 146.647, |
|
"eval_samples_per_second": 57.28, |
|
"eval_steps_per_second": 0.9, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.8781725888324874, |
|
"grad_norm": 4.154646770046085, |
|
"learning_rate": 3.11772367312804e-07, |
|
"loss": 0.7364, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.895093062605753, |
|
"grad_norm": 3.6964867615997874, |
|
"learning_rate": 3.034874841730382e-07, |
|
"loss": 0.7357, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.9120135363790185, |
|
"grad_norm": 3.6098682228440024, |
|
"learning_rate": 2.9526592997055483e-07, |
|
"loss": 0.7435, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.9289340101522843, |
|
"grad_norm": 3.5510859742021514, |
|
"learning_rate": 2.8711035421746363e-07, |
|
"loss": 0.7401, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.94585448392555, |
|
"grad_norm": 3.5339412485916477, |
|
"learning_rate": 2.7902338516338674e-07, |
|
"loss": 0.7196, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.9627749576988156, |
|
"grad_norm": 3.654769871220945, |
|
"learning_rate": 2.7100762894846627e-07, |
|
"loss": 0.7427, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.9796954314720812, |
|
"grad_norm": 3.5585570542699485, |
|
"learning_rate": 2.6306566876350067e-07, |
|
"loss": 0.7549, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.996615905245347, |
|
"grad_norm": 3.5792060121886804, |
|
"learning_rate": 2.5520006401747395e-07, |
|
"loss": 0.7306, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.0135363790186127, |
|
"grad_norm": 3.629067553958508, |
|
"learning_rate": 2.474133495127494e-07, |
|
"loss": 0.7062, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.030456852791878, |
|
"grad_norm": 3.646688738000953, |
|
"learning_rate": 2.3970803462819583e-07, |
|
"loss": 0.7065, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.030456852791878, |
|
"eval_loss": 0.7942918539047241, |
|
"eval_runtime": 304.0198, |
|
"eval_samples_per_second": 27.63, |
|
"eval_steps_per_second": 0.434, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.047377326565144, |
|
"grad_norm": 4.052211962047984, |
|
"learning_rate": 2.3208660251050156e-07, |
|
"loss": 0.675, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.0642978003384096, |
|
"grad_norm": 3.9688880359901444, |
|
"learning_rate": 2.2455150927394878e-07, |
|
"loss": 0.6934, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.081218274111675, |
|
"grad_norm": 3.8191830448574575, |
|
"learning_rate": 2.1710518320889276e-07, |
|
"loss": 0.695, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.0981387478849407, |
|
"grad_norm": 3.9122017489827488, |
|
"learning_rate": 2.097500239992132e-07, |
|
"loss": 0.6909, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.1150592216582065, |
|
"grad_norm": 4.035937936283847, |
|
"learning_rate": 2.0248840194898155e-07, |
|
"loss": 0.6869, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.1319796954314723, |
|
"grad_norm": 4.0592719997150875, |
|
"learning_rate": 1.9532265721859597e-07, |
|
"loss": 0.6758, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.1489001692047376, |
|
"grad_norm": 3.99048820587498, |
|
"learning_rate": 1.8825509907063326e-07, |
|
"loss": 0.6717, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 2.1658206429780034, |
|
"grad_norm": 3.6085759994280413, |
|
"learning_rate": 1.812880051256551e-07, |
|
"loss": 0.7084, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.182741116751269, |
|
"grad_norm": 3.8339000315450633, |
|
"learning_rate": 1.744236206282132e-07, |
|
"loss": 0.6795, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.199661590524535, |
|
"grad_norm": 3.8042578709482937, |
|
"learning_rate": 1.6766415772328728e-07, |
|
"loss": 0.695, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.199661590524535, |
|
"eval_loss": 0.7947296500205994, |
|
"eval_runtime": 146.6011, |
|
"eval_samples_per_second": 57.298, |
|
"eval_steps_per_second": 0.9, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.2165820642978002, |
|
"grad_norm": 3.6822683929855318, |
|
"learning_rate": 1.6101179474338966e-07, |
|
"loss": 0.6637, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 2.233502538071066, |
|
"grad_norm": 3.8036837748646746, |
|
"learning_rate": 1.5446867550656767e-07, |
|
"loss": 0.6846, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.250423011844332, |
|
"grad_norm": 3.7154882715692747, |
|
"learning_rate": 1.4803690862552753e-07, |
|
"loss": 0.6761, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 2.267343485617597, |
|
"grad_norm": 3.8188043343483167, |
|
"learning_rate": 1.4171856682810384e-07, |
|
"loss": 0.6834, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.284263959390863, |
|
"grad_norm": 3.7763683799555494, |
|
"learning_rate": 1.3551568628929432e-07, |
|
"loss": 0.682, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.3011844331641287, |
|
"grad_norm": 3.7184968099032742, |
|
"learning_rate": 1.2943026597507267e-07, |
|
"loss": 0.6758, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.3181049069373945, |
|
"grad_norm": 3.6752732128517978, |
|
"learning_rate": 1.2346426699819456e-07, |
|
"loss": 0.6778, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 2.33502538071066, |
|
"grad_norm": 3.914291257516614, |
|
"learning_rate": 1.176196119862008e-07, |
|
"loss": 0.6915, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.3519458544839256, |
|
"grad_norm": 3.6920364954305307, |
|
"learning_rate": 1.1189818446182358e-07, |
|
"loss": 0.6858, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 2.3688663282571913, |
|
"grad_norm": 3.848115462041246, |
|
"learning_rate": 1.0630182823599399e-07, |
|
"loss": 0.7013, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.3688663282571913, |
|
"eval_loss": 0.7938902378082275, |
|
"eval_runtime": 146.616, |
|
"eval_samples_per_second": 57.292, |
|
"eval_steps_per_second": 0.9, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.3857868020304567, |
|
"grad_norm": 3.7700574896430665, |
|
"learning_rate": 1.0083234681364932e-07, |
|
"loss": 0.6637, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 2.4027072758037225, |
|
"grad_norm": 3.662805050537354, |
|
"learning_rate": 9.549150281252632e-08, |
|
"loss": 0.6885, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.4196277495769882, |
|
"grad_norm": 3.7513042616130616, |
|
"learning_rate": 9.028101739513405e-08, |
|
"loss": 0.6949, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 2.436548223350254, |
|
"grad_norm": 3.5781011423869193, |
|
"learning_rate": 8.520256971408452e-08, |
|
"loss": 0.6796, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.4534686971235193, |
|
"grad_norm": 3.8550128332718465, |
|
"learning_rate": 8.025779637096137e-08, |
|
"loss": 0.6724, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 2.470389170896785, |
|
"grad_norm": 3.908604741906314, |
|
"learning_rate": 7.544829088890325e-08, |
|
"loss": 0.6789, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.487309644670051, |
|
"grad_norm": 3.815438363979128, |
|
"learning_rate": 7.077560319906694e-08, |
|
"loss": 0.6878, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 2.504230118443316, |
|
"grad_norm": 3.678241183457789, |
|
"learning_rate": 6.624123914114122e-08, |
|
"loss": 0.6758, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.521150592216582, |
|
"grad_norm": 3.766677918686516, |
|
"learning_rate": 6.184665997806831e-08, |
|
"loss": 0.6743, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 2.5380710659898478, |
|
"grad_norm": 3.7999340905324974, |
|
"learning_rate": 5.759328192513074e-08, |
|
"loss": 0.6743, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.5380710659898478, |
|
"eval_loss": 0.7932254672050476, |
|
"eval_runtime": 146.605, |
|
"eval_samples_per_second": 57.297, |
|
"eval_steps_per_second": 0.9, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.5549915397631136, |
|
"grad_norm": 3.865893271080869, |
|
"learning_rate": 5.348247569355735e-08, |
|
"loss": 0.6872, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 2.571912013536379, |
|
"grad_norm": 4.144060563570108, |
|
"learning_rate": 4.951556604879048e-08, |
|
"loss": 0.694, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.5888324873096447, |
|
"grad_norm": 3.780680669356305, |
|
"learning_rate": 4.569383138356275e-08, |
|
"loss": 0.6826, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 2.6057529610829104, |
|
"grad_norm": 4.114587718099576, |
|
"learning_rate": 4.201850330591677e-08, |
|
"loss": 0.68, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.6226734348561758, |
|
"grad_norm": 3.4623751046204285, |
|
"learning_rate": 3.8490766242301353e-08, |
|
"loss": 0.6658, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 2.6395939086294415, |
|
"grad_norm": 3.9196860968662137, |
|
"learning_rate": 3.5111757055874326e-08, |
|
"loss": 0.6898, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.6565143824027073, |
|
"grad_norm": 3.820036546151991, |
|
"learning_rate": 3.188256468013139e-08, |
|
"loss": 0.6732, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 2.673434856175973, |
|
"grad_norm": 4.231574698760902, |
|
"learning_rate": 2.8804229767982636e-08, |
|
"loss": 0.6728, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.6903553299492384, |
|
"grad_norm": 4.115986044491551, |
|
"learning_rate": 2.587774435638679e-08, |
|
"loss": 0.6755, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 2.707275803722504, |
|
"grad_norm": 3.724220098163462, |
|
"learning_rate": 2.3104051546654013e-08, |
|
"loss": 0.6778, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.707275803722504, |
|
"eval_loss": 0.7928686141967773, |
|
"eval_runtime": 146.5361, |
|
"eval_samples_per_second": 57.324, |
|
"eval_steps_per_second": 0.901, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.72419627749577, |
|
"grad_norm": 4.055902737307998, |
|
"learning_rate": 2.048404520051722e-08, |
|
"loss": 0.6846, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 2.7411167512690353, |
|
"grad_norm": 3.761429857208659, |
|
"learning_rate": 1.8018569652073378e-08, |
|
"loss": 0.6865, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.758037225042301, |
|
"grad_norm": 3.828128027906773, |
|
"learning_rate": 1.570841943568446e-08, |
|
"loss": 0.6712, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 2.774957698815567, |
|
"grad_norm": 3.765063878680342, |
|
"learning_rate": 1.3554339029927531e-08, |
|
"loss": 0.6816, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.7918781725888326, |
|
"grad_norm": 3.827232987418165, |
|
"learning_rate": 1.1557022617676216e-08, |
|
"loss": 0.6789, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 2.808798646362098, |
|
"grad_norm": 3.9934159183359106, |
|
"learning_rate": 9.717113862389992e-09, |
|
"loss": 0.6961, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.8257191201353637, |
|
"grad_norm": 3.905375697814281, |
|
"learning_rate": 8.035205700685165e-09, |
|
"loss": 0.6794, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 2.8426395939086295, |
|
"grad_norm": 3.8176339677184687, |
|
"learning_rate": 6.511840151252168e-09, |
|
"loss": 0.6858, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.859560067681895, |
|
"grad_norm": 3.669045035086845, |
|
"learning_rate": 5.147508140182555e-09, |
|
"loss": 0.6709, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 2.8764805414551606, |
|
"grad_norm": 3.7873007524702644, |
|
"learning_rate": 3.9426493427611175e-09, |
|
"loss": 0.6951, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.8764805414551606, |
|
"eval_loss": 0.7926760911941528, |
|
"eval_runtime": 146.7334, |
|
"eval_samples_per_second": 57.247, |
|
"eval_steps_per_second": 0.9, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.8934010152284264, |
|
"grad_norm": 3.6861972611151548, |
|
"learning_rate": 2.897652041774279e-09, |
|
"loss": 0.6878, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 2.910321489001692, |
|
"grad_norm": 3.6981706590030003, |
|
"learning_rate": 2.0128530023804656e-09, |
|
"loss": 0.6979, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.927241962774958, |
|
"grad_norm": 3.788952889279561, |
|
"learning_rate": 1.2885373635829754e-09, |
|
"loss": 0.6897, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 2.9441624365482233, |
|
"grad_norm": 3.818071711218172, |
|
"learning_rate": 7.249385463395374e-10, |
|
"loss": 0.6883, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.961082910321489, |
|
"grad_norm": 3.612189636258872, |
|
"learning_rate": 3.22238178339318e-10, |
|
"loss": 0.6879, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 2.9780033840947544, |
|
"grad_norm": 3.6807850492378975, |
|
"learning_rate": 8.056603547090812e-11, |
|
"loss": 0.6952, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.99492385786802, |
|
"grad_norm": 3.843102017586158, |
|
"learning_rate": 0.0, |
|
"loss": 0.6819, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 2.99492385786802, |
|
"step": 885, |
|
"total_flos": 5218127163949056.0, |
|
"train_loss": 0.776407975396194, |
|
"train_runtime": 14269.6597, |
|
"train_samples_per_second": 15.893, |
|
"train_steps_per_second": 0.062 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 885, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5218127163949056.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|