{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.99492385786802,
"eval_steps": 50,
"global_step": 885,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01692047377326565,
"grad_norm": 17.95940849112974,
"learning_rate": 5e-07,
"loss": 1.7425,
"step": 5
},
{
"epoch": 0.0338409475465313,
"grad_norm": 12.23967055037602,
"learning_rate": 1e-06,
"loss": 1.599,
"step": 10
},
{
"epoch": 0.050761421319796954,
"grad_norm": 7.600510364072396,
"learning_rate": 9.99919433964529e-07,
"loss": 1.2976,
"step": 15
},
{
"epoch": 0.0676818950930626,
"grad_norm": 4.134278668785207,
"learning_rate": 9.996777618216605e-07,
"loss": 1.1572,
"step": 20
},
{
"epoch": 0.08460236886632826,
"grad_norm": 3.846352071719652,
"learning_rate": 9.992750614536604e-07,
"loss": 1.0495,
"step": 25
},
{
"epoch": 0.10152284263959391,
"grad_norm": 3.4799081252721886,
"learning_rate": 9.98711462636417e-07,
"loss": 1.0222,
"step": 30
},
{
"epoch": 0.11844331641285956,
"grad_norm": 3.6435583580883644,
"learning_rate": 9.979871469976195e-07,
"loss": 0.982,
"step": 35
},
{
"epoch": 0.1353637901861252,
"grad_norm": 3.474443331449369,
"learning_rate": 9.971023479582256e-07,
"loss": 0.9659,
"step": 40
},
{
"epoch": 0.15228426395939088,
"grad_norm": 3.5291718369580094,
"learning_rate": 9.960573506572389e-07,
"loss": 0.9517,
"step": 45
},
{
"epoch": 0.1692047377326565,
"grad_norm": 3.6607844369820763,
"learning_rate": 9.948524918598173e-07,
"loss": 0.9744,
"step": 50
},
{
"epoch": 0.1692047377326565,
"eval_loss": 0.9363481402397156,
"eval_runtime": 147.9244,
"eval_samples_per_second": 56.786,
"eval_steps_per_second": 0.892,
"step": 50
},
{
"epoch": 0.18612521150592218,
"grad_norm": 3.5029465776623305,
"learning_rate": 9.934881598487478e-07,
"loss": 0.9291,
"step": 55
},
{
"epoch": 0.20304568527918782,
"grad_norm": 3.420583685374325,
"learning_rate": 9.919647942993147e-07,
"loss": 0.9373,
"step": 60
},
{
"epoch": 0.21996615905245348,
"grad_norm": 3.639006411388483,
"learning_rate": 9.9028288613761e-07,
"loss": 0.9371,
"step": 65
},
{
"epoch": 0.23688663282571912,
"grad_norm": 3.739200188919654,
"learning_rate": 9.884429773823236e-07,
"loss": 0.9168,
"step": 70
},
{
"epoch": 0.25380710659898476,
"grad_norm": 3.6961021681290647,
"learning_rate": 9.864456609700723e-07,
"loss": 0.9036,
"step": 75
},
{
"epoch": 0.2707275803722504,
"grad_norm": 3.469126484018746,
"learning_rate": 9.842915805643156e-07,
"loss": 0.8789,
"step": 80
},
{
"epoch": 0.2876480541455161,
"grad_norm": 3.4540644342177953,
"learning_rate": 9.819814303479267e-07,
"loss": 0.8843,
"step": 85
},
{
"epoch": 0.30456852791878175,
"grad_norm": 3.491589457047051,
"learning_rate": 9.795159547994828e-07,
"loss": 0.878,
"step": 90
},
{
"epoch": 0.32148900169204736,
"grad_norm": 3.3672158895857582,
"learning_rate": 9.76895948453346e-07,
"loss": 0.8817,
"step": 95
},
{
"epoch": 0.338409475465313,
"grad_norm": 3.4204437907535032,
"learning_rate": 9.74122255643613e-07,
"loss": 0.8749,
"step": 100
},
{
"epoch": 0.338409475465313,
"eval_loss": 0.8794726729393005,
"eval_runtime": 146.5584,
"eval_samples_per_second": 57.315,
"eval_steps_per_second": 0.901,
"step": 100
},
{
"epoch": 0.3553299492385787,
"grad_norm": 3.499218396193934,
"learning_rate": 9.711957702320174e-07,
"loss": 0.8689,
"step": 105
},
{
"epoch": 0.37225042301184436,
"grad_norm": 3.575982847323226,
"learning_rate": 9.681174353198686e-07,
"loss": 0.8552,
"step": 110
},
{
"epoch": 0.38917089678510997,
"grad_norm": 3.5820322461640624,
"learning_rate": 9.648882429441256e-07,
"loss": 0.8723,
"step": 115
},
{
"epoch": 0.40609137055837563,
"grad_norm": 3.428649928071515,
"learning_rate": 9.615092337576987e-07,
"loss": 0.8737,
"step": 120
},
{
"epoch": 0.4230118443316413,
"grad_norm": 3.439041413094408,
"learning_rate": 9.579814966940833e-07,
"loss": 0.8574,
"step": 125
},
{
"epoch": 0.43993231810490696,
"grad_norm": 3.611321063385662,
"learning_rate": 9.543061686164372e-07,
"loss": 0.8774,
"step": 130
},
{
"epoch": 0.45685279187817257,
"grad_norm": 3.3003493132482657,
"learning_rate": 9.504844339512094e-07,
"loss": 0.8594,
"step": 135
},
{
"epoch": 0.47377326565143824,
"grad_norm": 3.4741051861556684,
"learning_rate": 9.465175243064428e-07,
"loss": 0.8674,
"step": 140
},
{
"epoch": 0.4906937394247039,
"grad_norm": 3.2142657401436416,
"learning_rate": 9.424067180748691e-07,
"loss": 0.8648,
"step": 145
},
{
"epoch": 0.5076142131979695,
"grad_norm": 3.5722793490057336,
"learning_rate": 9.381533400219317e-07,
"loss": 0.8423,
"step": 150
},
{
"epoch": 0.5076142131979695,
"eval_loss": 0.8532436490058899,
"eval_runtime": 146.5502,
"eval_samples_per_second": 57.318,
"eval_steps_per_second": 0.901,
"step": 150
},
{
"epoch": 0.5245346869712352,
"grad_norm": 3.387238895689527,
"learning_rate": 9.337587608588588e-07,
"loss": 0.8344,
"step": 155
},
{
"epoch": 0.5414551607445008,
"grad_norm": 3.3257959351398165,
"learning_rate": 9.29224396800933e-07,
"loss": 0.8296,
"step": 160
},
{
"epoch": 0.5583756345177665,
"grad_norm": 3.4206543705067425,
"learning_rate": 9.245517091110968e-07,
"loss": 0.8281,
"step": 165
},
{
"epoch": 0.5752961082910322,
"grad_norm": 3.2917606038623672,
"learning_rate": 9.197422036290386e-07,
"loss": 0.8388,
"step": 170
},
{
"epoch": 0.5922165820642978,
"grad_norm": 3.6356342856497874,
"learning_rate": 9.147974302859156e-07,
"loss": 0.8479,
"step": 175
},
{
"epoch": 0.6091370558375635,
"grad_norm": 3.531537582517598,
"learning_rate": 9.097189826048659e-07,
"loss": 0.8465,
"step": 180
},
{
"epoch": 0.626057529610829,
"grad_norm": 3.534890106733192,
"learning_rate": 9.045084971874737e-07,
"loss": 0.8338,
"step": 185
},
{
"epoch": 0.6429780033840947,
"grad_norm": 3.5162595243845476,
"learning_rate": 8.991676531863507e-07,
"loss": 0.8454,
"step": 190
},
{
"epoch": 0.6598984771573604,
"grad_norm": 3.6712481126005607,
"learning_rate": 8.93698171764006e-07,
"loss": 0.8239,
"step": 195
},
{
"epoch": 0.676818950930626,
"grad_norm": 3.3780652656023573,
"learning_rate": 8.881018155381765e-07,
"loss": 0.8269,
"step": 200
},
{
"epoch": 0.676818950930626,
"eval_loss": 0.8365465998649597,
"eval_runtime": 146.6071,
"eval_samples_per_second": 57.296,
"eval_steps_per_second": 0.9,
"step": 200
},
{
"epoch": 0.6937394247038917,
"grad_norm": 3.3425859477068123,
"learning_rate": 8.823803880137992e-07,
"loss": 0.8382,
"step": 205
},
{
"epoch": 0.7106598984771574,
"grad_norm": 3.2738301474432956,
"learning_rate": 8.765357330018055e-07,
"loss": 0.8456,
"step": 210
},
{
"epoch": 0.727580372250423,
"grad_norm": 3.563151004829781,
"learning_rate": 8.705697340249274e-07,
"loss": 0.8266,
"step": 215
},
{
"epoch": 0.7445008460236887,
"grad_norm": 3.1683419089159126,
"learning_rate": 8.644843137107057e-07,
"loss": 0.8403,
"step": 220
},
{
"epoch": 0.7614213197969543,
"grad_norm": 3.609357791743884,
"learning_rate": 8.58281433171896e-07,
"loss": 0.8244,
"step": 225
},
{
"epoch": 0.7783417935702199,
"grad_norm": 3.421312500448169,
"learning_rate": 8.519630913744724e-07,
"loss": 0.8288,
"step": 230
},
{
"epoch": 0.7952622673434856,
"grad_norm": 3.577171842922122,
"learning_rate": 8.455313244934324e-07,
"loss": 0.8167,
"step": 235
},
{
"epoch": 0.8121827411167513,
"grad_norm": 3.325456007325782,
"learning_rate": 8.389882052566105e-07,
"loss": 0.8118,
"step": 240
},
{
"epoch": 0.8291032148900169,
"grad_norm": 3.5627054194832914,
"learning_rate": 8.323358422767128e-07,
"loss": 0.8378,
"step": 245
},
{
"epoch": 0.8460236886632826,
"grad_norm": 3.57666960479274,
"learning_rate": 8.255763793717867e-07,
"loss": 0.8223,
"step": 250
},
{
"epoch": 0.8460236886632826,
"eval_loss": 0.8226217031478882,
"eval_runtime": 146.6337,
"eval_samples_per_second": 57.286,
"eval_steps_per_second": 0.9,
"step": 250
},
{
"epoch": 0.8629441624365483,
"grad_norm": 3.4619536067260244,
"learning_rate": 8.187119948743449e-07,
"loss": 0.8215,
"step": 255
},
{
"epoch": 0.8798646362098139,
"grad_norm": 3.3755438705432805,
"learning_rate": 8.117449009293668e-07,
"loss": 0.7959,
"step": 260
},
{
"epoch": 0.8967851099830795,
"grad_norm": 3.3504490747841595,
"learning_rate": 8.046773427814041e-07,
"loss": 0.8198,
"step": 265
},
{
"epoch": 0.9137055837563451,
"grad_norm": 3.652293584185451,
"learning_rate": 7.975115980510185e-07,
"loss": 0.8198,
"step": 270
},
{
"epoch": 0.9306260575296108,
"grad_norm": 3.497277962331386,
"learning_rate": 7.902499760007867e-07,
"loss": 0.8181,
"step": 275
},
{
"epoch": 0.9475465313028765,
"grad_norm": 3.5043433150139336,
"learning_rate": 7.828948167911073e-07,
"loss": 0.8151,
"step": 280
},
{
"epoch": 0.9644670050761421,
"grad_norm": 3.5662591692739034,
"learning_rate": 7.754484907260512e-07,
"loss": 0.8192,
"step": 285
},
{
"epoch": 0.9813874788494078,
"grad_norm": 3.741586297285277,
"learning_rate": 7.679133974894982e-07,
"loss": 0.7975,
"step": 290
},
{
"epoch": 0.9983079526226735,
"grad_norm": 3.2550447105418057,
"learning_rate": 7.602919653718043e-07,
"loss": 0.7885,
"step": 295
},
{
"epoch": 1.015228426395939,
"grad_norm": 3.3766404706055613,
"learning_rate": 7.525866504872506e-07,
"loss": 0.7651,
"step": 300
},
{
"epoch": 1.015228426395939,
"eval_loss": 0.8148965239524841,
"eval_runtime": 146.61,
"eval_samples_per_second": 57.295,
"eval_steps_per_second": 0.9,
"step": 300
},
{
"epoch": 1.0321489001692048,
"grad_norm": 3.5183190070892314,
"learning_rate": 7.447999359825262e-07,
"loss": 0.7393,
"step": 305
},
{
"epoch": 1.0490693739424704,
"grad_norm": 3.7614891711473657,
"learning_rate": 7.369343312364993e-07,
"loss": 0.7621,
"step": 310
},
{
"epoch": 1.0659898477157361,
"grad_norm": 3.5318777711133125,
"learning_rate": 7.289923710515338e-07,
"loss": 0.7546,
"step": 315
},
{
"epoch": 1.0829103214890017,
"grad_norm": 3.5586679876971754,
"learning_rate": 7.209766148366134e-07,
"loss": 0.759,
"step": 320
},
{
"epoch": 1.0998307952622675,
"grad_norm": 3.3557844891140305,
"learning_rate": 7.128896457825363e-07,
"loss": 0.7445,
"step": 325
},
{
"epoch": 1.116751269035533,
"grad_norm": 3.439711853714508,
"learning_rate": 7.047340700294453e-07,
"loss": 0.7406,
"step": 330
},
{
"epoch": 1.1336717428087986,
"grad_norm": 3.522824978614251,
"learning_rate": 6.965125158269618e-07,
"loss": 0.7368,
"step": 335
},
{
"epoch": 1.1505922165820643,
"grad_norm": 4.008601044386287,
"learning_rate": 6.882276326871959e-07,
"loss": 0.7578,
"step": 340
},
{
"epoch": 1.16751269035533,
"grad_norm": 3.6557873733426955,
"learning_rate": 6.798820905309035e-07,
"loss": 0.7332,
"step": 345
},
{
"epoch": 1.1844331641285957,
"grad_norm": 3.5152732593214515,
"learning_rate": 6.714785788270657e-07,
"loss": 0.7388,
"step": 350
},
{
"epoch": 1.1844331641285957,
"eval_loss": 0.8107805252075195,
"eval_runtime": 146.5199,
"eval_samples_per_second": 57.33,
"eval_steps_per_second": 0.901,
"step": 350
},
{
"epoch": 1.2013536379018612,
"grad_norm": 3.7338182802228093,
"learning_rate": 6.630198057261709e-07,
"loss": 0.7406,
"step": 355
},
{
"epoch": 1.218274111675127,
"grad_norm": 3.5135812697699724,
"learning_rate": 6.545084971874736e-07,
"loss": 0.7421,
"step": 360
},
{
"epoch": 1.2351945854483926,
"grad_norm": 3.508021675469905,
"learning_rate": 6.459473961005168e-07,
"loss": 0.7755,
"step": 365
},
{
"epoch": 1.252115059221658,
"grad_norm": 3.5287017860167196,
"learning_rate": 6.373392614011951e-07,
"loss": 0.7408,
"step": 370
},
{
"epoch": 1.2690355329949239,
"grad_norm": 3.6233235029794093,
"learning_rate": 6.286868671826511e-07,
"loss": 0.751,
"step": 375
},
{
"epoch": 1.2859560067681894,
"grad_norm": 3.5669498367227304,
"learning_rate": 6.199930018012829e-07,
"loss": 0.7276,
"step": 380
},
{
"epoch": 1.3028764805414552,
"grad_norm": 3.7287000280408176,
"learning_rate": 6.112604669781572e-07,
"loss": 0.7278,
"step": 385
},
{
"epoch": 1.3197969543147208,
"grad_norm": 3.824405237133237,
"learning_rate": 6.024920768961152e-07,
"loss": 0.743,
"step": 390
},
{
"epoch": 1.3367174280879865,
"grad_norm": 3.5197677626280965,
"learning_rate": 5.936906572928624e-07,
"loss": 0.7159,
"step": 395
},
{
"epoch": 1.353637901861252,
"grad_norm": 3.759524343808812,
"learning_rate": 5.848590445503344e-07,
"loss": 0.7429,
"step": 400
},
{
"epoch": 1.353637901861252,
"eval_loss": 0.805133044719696,
"eval_runtime": 146.7971,
"eval_samples_per_second": 57.222,
"eval_steps_per_second": 0.899,
"step": 400
},
{
"epoch": 1.3705583756345177,
"grad_norm": 3.797267695279564,
"learning_rate": 5.760000847806337e-07,
"loss": 0.7464,
"step": 405
},
{
"epoch": 1.3874788494077834,
"grad_norm": 3.439223330784389,
"learning_rate": 5.671166329088277e-07,
"loss": 0.725,
"step": 410
},
{
"epoch": 1.404399323181049,
"grad_norm": 3.6761682639396653,
"learning_rate": 5.582115517529114e-07,
"loss": 0.7311,
"step": 415
},
{
"epoch": 1.4213197969543148,
"grad_norm": 3.571768390407566,
"learning_rate": 5.492877111012218e-07,
"loss": 0.7393,
"step": 420
},
{
"epoch": 1.4382402707275803,
"grad_norm": 3.8046958424761623,
"learning_rate": 5.403479867876087e-07,
"loss": 0.758,
"step": 425
},
{
"epoch": 1.455160744500846,
"grad_norm": 3.552061598209118,
"learning_rate": 5.313952597646567e-07,
"loss": 0.741,
"step": 430
},
{
"epoch": 1.4720812182741116,
"grad_norm": 3.5137582048526546,
"learning_rate": 5.224324151752575e-07,
"loss": 0.736,
"step": 435
},
{
"epoch": 1.4890016920473772,
"grad_norm": 3.6806640730520046,
"learning_rate": 5.134623414228315e-07,
"loss": 0.7414,
"step": 440
},
{
"epoch": 1.505922165820643,
"grad_norm": 3.7306988391241203,
"learning_rate": 5.044879292404989e-07,
"loss": 0.7578,
"step": 445
},
{
"epoch": 1.5228426395939088,
"grad_norm": 3.5044826704791543,
"learning_rate": 4.95512070759501e-07,
"loss": 0.7481,
"step": 450
},
{
"epoch": 1.5228426395939088,
"eval_loss": 0.8002220392227173,
"eval_runtime": 146.6261,
"eval_samples_per_second": 57.289,
"eval_steps_per_second": 0.9,
"step": 450
},
{
"epoch": 1.5397631133671743,
"grad_norm": 3.5593238876932416,
"learning_rate": 4.865376585771687e-07,
"loss": 0.741,
"step": 455
},
{
"epoch": 1.5566835871404399,
"grad_norm": 3.9021045537145174,
"learning_rate": 4.775675848247427e-07,
"loss": 0.7462,
"step": 460
},
{
"epoch": 1.5736040609137056,
"grad_norm": 3.603020142861588,
"learning_rate": 4.686047402353433e-07,
"loss": 0.7344,
"step": 465
},
{
"epoch": 1.5905245346869712,
"grad_norm": 3.5798855947417247,
"learning_rate": 4.596520132123914e-07,
"loss": 0.7246,
"step": 470
},
{
"epoch": 1.6074450084602367,
"grad_norm": 3.392440988553216,
"learning_rate": 4.507122888987782e-07,
"loss": 0.7304,
"step": 475
},
{
"epoch": 1.6243654822335025,
"grad_norm": 3.7346005543444307,
"learning_rate": 4.417884482470886e-07,
"loss": 0.7329,
"step": 480
},
{
"epoch": 1.6412859560067683,
"grad_norm": 3.929271128512869,
"learning_rate": 4.328833670911724e-07,
"loss": 0.7529,
"step": 485
},
{
"epoch": 1.6582064297800339,
"grad_norm": 3.5171536414776163,
"learning_rate": 4.239999152193664e-07,
"loss": 0.7531,
"step": 490
},
{
"epoch": 1.6751269035532994,
"grad_norm": 3.574806818948794,
"learning_rate": 4.1514095544966557e-07,
"loss": 0.7418,
"step": 495
},
{
"epoch": 1.6920473773265652,
"grad_norm": 3.5129274484405486,
"learning_rate": 4.0630934270713755e-07,
"loss": 0.7308,
"step": 500
},
{
"epoch": 1.6920473773265652,
"eval_loss": 0.795360267162323,
"eval_runtime": 146.6381,
"eval_samples_per_second": 57.284,
"eval_steps_per_second": 0.9,
"step": 500
},
{
"epoch": 1.708967851099831,
"grad_norm": 3.483481666306759,
"learning_rate": 3.9750792310388483e-07,
"loss": 0.7311,
"step": 505
},
{
"epoch": 1.7258883248730963,
"grad_norm": 3.5163371793641387,
"learning_rate": 3.8873953302184283e-07,
"loss": 0.7268,
"step": 510
},
{
"epoch": 1.742808798646362,
"grad_norm": 3.746899781553285,
"learning_rate": 3.80006998198717e-07,
"loss": 0.7471,
"step": 515
},
{
"epoch": 1.7597292724196278,
"grad_norm": 3.6600329176410735,
"learning_rate": 3.713131328173489e-07,
"loss": 0.7426,
"step": 520
},
{
"epoch": 1.7766497461928934,
"grad_norm": 3.715103932641678,
"learning_rate": 3.62660738598805e-07,
"loss": 0.7452,
"step": 525
},
{
"epoch": 1.793570219966159,
"grad_norm": 3.6219953866879036,
"learning_rate": 3.5405260389948333e-07,
"loss": 0.7447,
"step": 530
},
{
"epoch": 1.8104906937394247,
"grad_norm": 3.5483251792927866,
"learning_rate": 3.454915028125263e-07,
"loss": 0.7219,
"step": 535
},
{
"epoch": 1.8274111675126905,
"grad_norm": 3.687667827428392,
"learning_rate": 3.369801942738291e-07,
"loss": 0.7297,
"step": 540
},
{
"epoch": 1.844331641285956,
"grad_norm": 3.538997656633232,
"learning_rate": 3.285214211729343e-07,
"loss": 0.7498,
"step": 545
},
{
"epoch": 1.8612521150592216,
"grad_norm": 3.763523903194986,
"learning_rate": 3.2011790946909666e-07,
"loss": 0.7306,
"step": 550
},
{
"epoch": 1.8612521150592216,
"eval_loss": 0.7919500470161438,
"eval_runtime": 146.647,
"eval_samples_per_second": 57.28,
"eval_steps_per_second": 0.9,
"step": 550
},
{
"epoch": 1.8781725888324874,
"grad_norm": 4.154646770046085,
"learning_rate": 3.11772367312804e-07,
"loss": 0.7364,
"step": 555
},
{
"epoch": 1.895093062605753,
"grad_norm": 3.6964867615997874,
"learning_rate": 3.034874841730382e-07,
"loss": 0.7357,
"step": 560
},
{
"epoch": 1.9120135363790185,
"grad_norm": 3.6098682228440024,
"learning_rate": 2.9526592997055483e-07,
"loss": 0.7435,
"step": 565
},
{
"epoch": 1.9289340101522843,
"grad_norm": 3.5510859742021514,
"learning_rate": 2.8711035421746363e-07,
"loss": 0.7401,
"step": 570
},
{
"epoch": 1.94585448392555,
"grad_norm": 3.5339412485916477,
"learning_rate": 2.7902338516338674e-07,
"loss": 0.7196,
"step": 575
},
{
"epoch": 1.9627749576988156,
"grad_norm": 3.654769871220945,
"learning_rate": 2.7100762894846627e-07,
"loss": 0.7427,
"step": 580
},
{
"epoch": 1.9796954314720812,
"grad_norm": 3.5585570542699485,
"learning_rate": 2.6306566876350067e-07,
"loss": 0.7549,
"step": 585
},
{
"epoch": 1.996615905245347,
"grad_norm": 3.5792060121886804,
"learning_rate": 2.5520006401747395e-07,
"loss": 0.7306,
"step": 590
},
{
"epoch": 2.0135363790186127,
"grad_norm": 3.629067553958508,
"learning_rate": 2.474133495127494e-07,
"loss": 0.7062,
"step": 595
},
{
"epoch": 2.030456852791878,
"grad_norm": 3.646688738000953,
"learning_rate": 2.3970803462819583e-07,
"loss": 0.7065,
"step": 600
},
{
"epoch": 2.030456852791878,
"eval_loss": 0.7942918539047241,
"eval_runtime": 304.0198,
"eval_samples_per_second": 27.63,
"eval_steps_per_second": 0.434,
"step": 600
},
{
"epoch": 2.047377326565144,
"grad_norm": 4.052211962047984,
"learning_rate": 2.3208660251050156e-07,
"loss": 0.675,
"step": 605
},
{
"epoch": 2.0642978003384096,
"grad_norm": 3.9688880359901444,
"learning_rate": 2.2455150927394878e-07,
"loss": 0.6934,
"step": 610
},
{
"epoch": 2.081218274111675,
"grad_norm": 3.8191830448574575,
"learning_rate": 2.1710518320889276e-07,
"loss": 0.695,
"step": 615
},
{
"epoch": 2.0981387478849407,
"grad_norm": 3.9122017489827488,
"learning_rate": 2.097500239992132e-07,
"loss": 0.6909,
"step": 620
},
{
"epoch": 2.1150592216582065,
"grad_norm": 4.035937936283847,
"learning_rate": 2.0248840194898155e-07,
"loss": 0.6869,
"step": 625
},
{
"epoch": 2.1319796954314723,
"grad_norm": 4.0592719997150875,
"learning_rate": 1.9532265721859597e-07,
"loss": 0.6758,
"step": 630
},
{
"epoch": 2.1489001692047376,
"grad_norm": 3.99048820587498,
"learning_rate": 1.8825509907063326e-07,
"loss": 0.6717,
"step": 635
},
{
"epoch": 2.1658206429780034,
"grad_norm": 3.6085759994280413,
"learning_rate": 1.812880051256551e-07,
"loss": 0.7084,
"step": 640
},
{
"epoch": 2.182741116751269,
"grad_norm": 3.8339000315450633,
"learning_rate": 1.744236206282132e-07,
"loss": 0.6795,
"step": 645
},
{
"epoch": 2.199661590524535,
"grad_norm": 3.8042578709482937,
"learning_rate": 1.6766415772328728e-07,
"loss": 0.695,
"step": 650
},
{
"epoch": 2.199661590524535,
"eval_loss": 0.7947296500205994,
"eval_runtime": 146.6011,
"eval_samples_per_second": 57.298,
"eval_steps_per_second": 0.9,
"step": 650
},
{
"epoch": 2.2165820642978002,
"grad_norm": 3.6822683929855318,
"learning_rate": 1.6101179474338966e-07,
"loss": 0.6637,
"step": 655
},
{
"epoch": 2.233502538071066,
"grad_norm": 3.8036837748646746,
"learning_rate": 1.5446867550656767e-07,
"loss": 0.6846,
"step": 660
},
{
"epoch": 2.250423011844332,
"grad_norm": 3.7154882715692747,
"learning_rate": 1.4803690862552753e-07,
"loss": 0.6761,
"step": 665
},
{
"epoch": 2.267343485617597,
"grad_norm": 3.8188043343483167,
"learning_rate": 1.4171856682810384e-07,
"loss": 0.6834,
"step": 670
},
{
"epoch": 2.284263959390863,
"grad_norm": 3.7763683799555494,
"learning_rate": 1.3551568628929432e-07,
"loss": 0.682,
"step": 675
},
{
"epoch": 2.3011844331641287,
"grad_norm": 3.7184968099032742,
"learning_rate": 1.2943026597507267e-07,
"loss": 0.6758,
"step": 680
},
{
"epoch": 2.3181049069373945,
"grad_norm": 3.6752732128517978,
"learning_rate": 1.2346426699819456e-07,
"loss": 0.6778,
"step": 685
},
{
"epoch": 2.33502538071066,
"grad_norm": 3.914291257516614,
"learning_rate": 1.176196119862008e-07,
"loss": 0.6915,
"step": 690
},
{
"epoch": 2.3519458544839256,
"grad_norm": 3.6920364954305307,
"learning_rate": 1.1189818446182358e-07,
"loss": 0.6858,
"step": 695
},
{
"epoch": 2.3688663282571913,
"grad_norm": 3.848115462041246,
"learning_rate": 1.0630182823599399e-07,
"loss": 0.7013,
"step": 700
},
{
"epoch": 2.3688663282571913,
"eval_loss": 0.7938902378082275,
"eval_runtime": 146.616,
"eval_samples_per_second": 57.292,
"eval_steps_per_second": 0.9,
"step": 700
},
{
"epoch": 2.3857868020304567,
"grad_norm": 3.7700574896430665,
"learning_rate": 1.0083234681364932e-07,
"loss": 0.6637,
"step": 705
},
{
"epoch": 2.4027072758037225,
"grad_norm": 3.662805050537354,
"learning_rate": 9.549150281252632e-08,
"loss": 0.6885,
"step": 710
},
{
"epoch": 2.4196277495769882,
"grad_norm": 3.7513042616130616,
"learning_rate": 9.028101739513405e-08,
"loss": 0.6949,
"step": 715
},
{
"epoch": 2.436548223350254,
"grad_norm": 3.5781011423869193,
"learning_rate": 8.520256971408452e-08,
"loss": 0.6796,
"step": 720
},
{
"epoch": 2.4534686971235193,
"grad_norm": 3.8550128332718465,
"learning_rate": 8.025779637096137e-08,
"loss": 0.6724,
"step": 725
},
{
"epoch": 2.470389170896785,
"grad_norm": 3.908604741906314,
"learning_rate": 7.544829088890325e-08,
"loss": 0.6789,
"step": 730
},
{
"epoch": 2.487309644670051,
"grad_norm": 3.815438363979128,
"learning_rate": 7.077560319906694e-08,
"loss": 0.6878,
"step": 735
},
{
"epoch": 2.504230118443316,
"grad_norm": 3.678241183457789,
"learning_rate": 6.624123914114122e-08,
"loss": 0.6758,
"step": 740
},
{
"epoch": 2.521150592216582,
"grad_norm": 3.766677918686516,
"learning_rate": 6.184665997806831e-08,
"loss": 0.6743,
"step": 745
},
{
"epoch": 2.5380710659898478,
"grad_norm": 3.7999340905324974,
"learning_rate": 5.759328192513074e-08,
"loss": 0.6743,
"step": 750
},
{
"epoch": 2.5380710659898478,
"eval_loss": 0.7932254672050476,
"eval_runtime": 146.605,
"eval_samples_per_second": 57.297,
"eval_steps_per_second": 0.9,
"step": 750
},
{
"epoch": 2.5549915397631136,
"grad_norm": 3.865893271080869,
"learning_rate": 5.348247569355735e-08,
"loss": 0.6872,
"step": 755
},
{
"epoch": 2.571912013536379,
"grad_norm": 4.144060563570108,
"learning_rate": 4.951556604879048e-08,
"loss": 0.694,
"step": 760
},
{
"epoch": 2.5888324873096447,
"grad_norm": 3.780680669356305,
"learning_rate": 4.569383138356275e-08,
"loss": 0.6826,
"step": 765
},
{
"epoch": 2.6057529610829104,
"grad_norm": 4.114587718099576,
"learning_rate": 4.201850330591677e-08,
"loss": 0.68,
"step": 770
},
{
"epoch": 2.6226734348561758,
"grad_norm": 3.4623751046204285,
"learning_rate": 3.8490766242301353e-08,
"loss": 0.6658,
"step": 775
},
{
"epoch": 2.6395939086294415,
"grad_norm": 3.9196860968662137,
"learning_rate": 3.5111757055874326e-08,
"loss": 0.6898,
"step": 780
},
{
"epoch": 2.6565143824027073,
"grad_norm": 3.820036546151991,
"learning_rate": 3.188256468013139e-08,
"loss": 0.6732,
"step": 785
},
{
"epoch": 2.673434856175973,
"grad_norm": 4.231574698760902,
"learning_rate": 2.8804229767982636e-08,
"loss": 0.6728,
"step": 790
},
{
"epoch": 2.6903553299492384,
"grad_norm": 4.115986044491551,
"learning_rate": 2.587774435638679e-08,
"loss": 0.6755,
"step": 795
},
{
"epoch": 2.707275803722504,
"grad_norm": 3.724220098163462,
"learning_rate": 2.3104051546654013e-08,
"loss": 0.6778,
"step": 800
},
{
"epoch": 2.707275803722504,
"eval_loss": 0.7928686141967773,
"eval_runtime": 146.5361,
"eval_samples_per_second": 57.324,
"eval_steps_per_second": 0.901,
"step": 800
},
{
"epoch": 2.72419627749577,
"grad_norm": 4.055902737307998,
"learning_rate": 2.048404520051722e-08,
"loss": 0.6846,
"step": 805
},
{
"epoch": 2.7411167512690353,
"grad_norm": 3.761429857208659,
"learning_rate": 1.8018569652073378e-08,
"loss": 0.6865,
"step": 810
},
{
"epoch": 2.758037225042301,
"grad_norm": 3.828128027906773,
"learning_rate": 1.570841943568446e-08,
"loss": 0.6712,
"step": 815
},
{
"epoch": 2.774957698815567,
"grad_norm": 3.765063878680342,
"learning_rate": 1.3554339029927531e-08,
"loss": 0.6816,
"step": 820
},
{
"epoch": 2.7918781725888326,
"grad_norm": 3.827232987418165,
"learning_rate": 1.1557022617676216e-08,
"loss": 0.6789,
"step": 825
},
{
"epoch": 2.808798646362098,
"grad_norm": 3.9934159183359106,
"learning_rate": 9.717113862389992e-09,
"loss": 0.6961,
"step": 830
},
{
"epoch": 2.8257191201353637,
"grad_norm": 3.905375697814281,
"learning_rate": 8.035205700685165e-09,
"loss": 0.6794,
"step": 835
},
{
"epoch": 2.8426395939086295,
"grad_norm": 3.8176339677184687,
"learning_rate": 6.511840151252168e-09,
"loss": 0.6858,
"step": 840
},
{
"epoch": 2.859560067681895,
"grad_norm": 3.669045035086845,
"learning_rate": 5.147508140182555e-09,
"loss": 0.6709,
"step": 845
},
{
"epoch": 2.8764805414551606,
"grad_norm": 3.7873007524702644,
"learning_rate": 3.9426493427611175e-09,
"loss": 0.6951,
"step": 850
},
{
"epoch": 2.8764805414551606,
"eval_loss": 0.7926760911941528,
"eval_runtime": 146.7334,
"eval_samples_per_second": 57.247,
"eval_steps_per_second": 0.9,
"step": 850
},
{
"epoch": 2.8934010152284264,
"grad_norm": 3.6861972611151548,
"learning_rate": 2.897652041774279e-09,
"loss": 0.6878,
"step": 855
},
{
"epoch": 2.910321489001692,
"grad_norm": 3.6981706590030003,
"learning_rate": 2.0128530023804656e-09,
"loss": 0.6979,
"step": 860
},
{
"epoch": 2.927241962774958,
"grad_norm": 3.788952889279561,
"learning_rate": 1.2885373635829754e-09,
"loss": 0.6897,
"step": 865
},
{
"epoch": 2.9441624365482233,
"grad_norm": 3.818071711218172,
"learning_rate": 7.249385463395374e-10,
"loss": 0.6883,
"step": 870
},
{
"epoch": 2.961082910321489,
"grad_norm": 3.612189636258872,
"learning_rate": 3.22238178339318e-10,
"loss": 0.6879,
"step": 875
},
{
"epoch": 2.9780033840947544,
"grad_norm": 3.6807850492378975,
"learning_rate": 8.056603547090812e-11,
"loss": 0.6952,
"step": 880
},
{
"epoch": 2.99492385786802,
"grad_norm": 3.843102017586158,
"learning_rate": 0.0,
"loss": 0.6819,
"step": 885
},
{
"epoch": 2.99492385786802,
"step": 885,
"total_flos": 5218127163949056.0,
"train_loss": 0.776407975396194,
"train_runtime": 14269.6597,
"train_samples_per_second": 15.893,
"train_steps_per_second": 0.062
}
],
"logging_steps": 5,
"max_steps": 885,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5218127163949056.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}