{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.5960942634494413,
"eval_steps": 5000,
"global_step": 17000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009388789784996713,
"grad_norm": 72.80598449707031,
"learning_rate": 9.103707179727828e-07,
"loss": 16.0755,
"step": 100
},
{
"epoch": 0.018777579569993427,
"grad_norm": 72.60921478271484,
"learning_rate": 1.8301267010793056e-06,
"loss": 13.0643,
"step": 200
},
{
"epoch": 0.02816636935499014,
"grad_norm": 10.017908096313477,
"learning_rate": 2.7592679493195683e-06,
"loss": 9.3474,
"step": 300
},
{
"epoch": 0.03755515913998685,
"grad_norm": 5.694988250732422,
"learning_rate": 3.6977944626935713e-06,
"loss": 8.2606,
"step": 400
},
{
"epoch": 0.04694394892498357,
"grad_norm": 4.844100475311279,
"learning_rate": 4.6363209760675744e-06,
"loss": 8.084,
"step": 500
},
{
"epoch": 0.05633273870998028,
"grad_norm": 3.6125142574310303,
"learning_rate": 5.574847489441577e-06,
"loss": 8.0581,
"step": 600
},
{
"epoch": 0.06572152849497699,
"grad_norm": 3.166046380996704,
"learning_rate": 6.51337400281558e-06,
"loss": 8.0175,
"step": 700
},
{
"epoch": 0.0751103182799737,
"grad_norm": 2.600433111190796,
"learning_rate": 7.451900516189583e-06,
"loss": 8.0285,
"step": 800
},
{
"epoch": 0.08449910806497042,
"grad_norm": 2.3805315494537354,
"learning_rate": 8.390427029563585e-06,
"loss": 8.0024,
"step": 900
},
{
"epoch": 0.09388789784996714,
"grad_norm": 12.382240295410156,
"learning_rate": 9.328953542937589e-06,
"loss": 8.0161,
"step": 1000
},
{
"epoch": 0.10327668763496385,
"grad_norm": 2.7355728149414062,
"learning_rate": 1.0267480056311592e-05,
"loss": 7.9941,
"step": 1100
},
{
"epoch": 0.11266547741996057,
"grad_norm": 2.0243470668792725,
"learning_rate": 1.1206006569685594e-05,
"loss": 8.0233,
"step": 1200
},
{
"epoch": 0.12205426720495728,
"grad_norm": 1.9162158966064453,
"learning_rate": 1.2144533083059597e-05,
"loss": 8.0141,
"step": 1300
},
{
"epoch": 0.13144305698995398,
"grad_norm": 11.409939765930176,
"learning_rate": 1.3083059596433601e-05,
"loss": 7.9644,
"step": 1400
},
{
"epoch": 0.1408318467749507,
"grad_norm": 1.712424635887146,
"learning_rate": 1.4021586109807603e-05,
"loss": 8.0311,
"step": 1500
},
{
"epoch": 0.1502206365599474,
"grad_norm": 2.4589834213256836,
"learning_rate": 1.4960112623181606e-05,
"loss": 8.0306,
"step": 1600
},
{
"epoch": 0.15960942634494413,
"grad_norm": 1.7343533039093018,
"learning_rate": 1.589863913655561e-05,
"loss": 7.989,
"step": 1700
},
{
"epoch": 0.16899821612994084,
"grad_norm": 2.0726826190948486,
"learning_rate": 1.6837165649929613e-05,
"loss": 8.0034,
"step": 1800
},
{
"epoch": 0.17838700591493756,
"grad_norm": 1.7758458852767944,
"learning_rate": 1.7775692163303613e-05,
"loss": 8.0107,
"step": 1900
},
{
"epoch": 0.18777579569993427,
"grad_norm": 3.3816475868225098,
"learning_rate": 1.8714218676677617e-05,
"loss": 7.9737,
"step": 2000
},
{
"epoch": 0.197164585484931,
"grad_norm": 1.8136950731277466,
"learning_rate": 1.965274519005162e-05,
"loss": 7.9827,
"step": 2100
},
{
"epoch": 0.2065533752699277,
"grad_norm": 1.7819303274154663,
"learning_rate": 1.9934275728965626e-05,
"loss": 8.0389,
"step": 2200
},
{
"epoch": 0.21594216505492442,
"grad_norm": 2.269160509109497,
"learning_rate": 1.9829951489228525e-05,
"loss": 7.973,
"step": 2300
},
{
"epoch": 0.22533095483992113,
"grad_norm": 3.3508036136627197,
"learning_rate": 1.972562724949142e-05,
"loss": 7.9669,
"step": 2400
},
{
"epoch": 0.23471974462491785,
"grad_norm": 1.674142599105835,
"learning_rate": 1.962130300975432e-05,
"loss": 8.0296,
"step": 2500
},
{
"epoch": 0.24410853440991456,
"grad_norm": 1.454300880432129,
"learning_rate": 1.9516978770017215e-05,
"loss": 7.9984,
"step": 2600
},
{
"epoch": 0.2534973241949113,
"grad_norm": 2.2951695919036865,
"learning_rate": 1.9412654530280113e-05,
"loss": 7.9772,
"step": 2700
},
{
"epoch": 0.26288611397990796,
"grad_norm": 6.295051574707031,
"learning_rate": 1.930833029054301e-05,
"loss": 7.9838,
"step": 2800
},
{
"epoch": 0.2722749037649047,
"grad_norm": 1.8874555826187134,
"learning_rate": 1.9204006050805904e-05,
"loss": 7.9816,
"step": 2900
},
{
"epoch": 0.2816636935499014,
"grad_norm": 20.835277557373047,
"learning_rate": 1.9099681811068803e-05,
"loss": 8.0021,
"step": 3000
},
{
"epoch": 0.29105248333489814,
"grad_norm": 2.1683876514434814,
"learning_rate": 1.8995357571331702e-05,
"loss": 7.9715,
"step": 3100
},
{
"epoch": 0.3004412731198948,
"grad_norm": 1.6533387899398804,
"learning_rate": 1.8891033331594598e-05,
"loss": 7.9809,
"step": 3200
},
{
"epoch": 0.30983006290489157,
"grad_norm": 4.595189094543457,
"learning_rate": 1.8786709091857496e-05,
"loss": 7.9849,
"step": 3300
},
{
"epoch": 0.31921885268988826,
"grad_norm": 1.994147539138794,
"learning_rate": 1.8682384852120392e-05,
"loss": 7.9463,
"step": 3400
},
{
"epoch": 0.328607642474885,
"grad_norm": 1.961474895477295,
"learning_rate": 1.8578060612383287e-05,
"loss": 8.0067,
"step": 3500
},
{
"epoch": 0.3379964322598817,
"grad_norm": 24.005535125732422,
"learning_rate": 1.8473736372646186e-05,
"loss": 7.9431,
"step": 3600
},
{
"epoch": 0.34738522204487843,
"grad_norm": 1.9433845281600952,
"learning_rate": 1.8369412132909085e-05,
"loss": 7.9877,
"step": 3700
},
{
"epoch": 0.3567740118298751,
"grad_norm": 10.296500205993652,
"learning_rate": 1.826508789317198e-05,
"loss": 7.9494,
"step": 3800
},
{
"epoch": 0.36616280161487186,
"grad_norm": 2.194976568222046,
"learning_rate": 1.8160763653434876e-05,
"loss": 7.9466,
"step": 3900
},
{
"epoch": 0.37555159139986855,
"grad_norm": 1.5201098918914795,
"learning_rate": 1.8056439413697775e-05,
"loss": 7.9708,
"step": 4000
},
{
"epoch": 0.3849403811848653,
"grad_norm": 2.9077212810516357,
"learning_rate": 1.795211517396067e-05,
"loss": 7.9525,
"step": 4100
},
{
"epoch": 0.394329170969862,
"grad_norm": 2.041530132293701,
"learning_rate": 1.784779093422357e-05,
"loss": 7.9322,
"step": 4200
},
{
"epoch": 0.4037179607548587,
"grad_norm": 2.0275838375091553,
"learning_rate": 1.7743466694486468e-05,
"loss": 7.9415,
"step": 4300
},
{
"epoch": 0.4131067505398554,
"grad_norm": 2.000778913497925,
"learning_rate": 1.7639142454749364e-05,
"loss": 7.9932,
"step": 4400
},
{
"epoch": 0.42249554032485215,
"grad_norm": 2.2984609603881836,
"learning_rate": 1.753481821501226e-05,
"loss": 7.9481,
"step": 4500
},
{
"epoch": 0.43188433010984884,
"grad_norm": 1.0885875225067139,
"learning_rate": 1.7430493975275155e-05,
"loss": 7.976,
"step": 4600
},
{
"epoch": 0.4412731198948456,
"grad_norm": 4.441020488739014,
"learning_rate": 1.7326169735538053e-05,
"loss": 7.971,
"step": 4700
},
{
"epoch": 0.45066190967984227,
"grad_norm": 2.2056221961975098,
"learning_rate": 1.7221845495800952e-05,
"loss": 7.9647,
"step": 4800
},
{
"epoch": 0.460050699464839,
"grad_norm": 2.1192028522491455,
"learning_rate": 1.7117521256063848e-05,
"loss": 7.9217,
"step": 4900
},
{
"epoch": 0.4694394892498357,
"grad_norm": 14.011516571044922,
"learning_rate": 1.7013197016326747e-05,
"loss": 7.9374,
"step": 5000
},
{
"epoch": 0.4694394892498357,
"eval_loss": 7.951793193817139,
"eval_runtime": 900.8057,
"eval_samples_per_second": 378.358,
"eval_steps_per_second": 2.956,
"step": 5000
},
{
"epoch": 0.4788282790348324,
"grad_norm": 39.19038009643555,
"learning_rate": 1.6908872776589642e-05,
"loss": 7.9026,
"step": 5100
},
{
"epoch": 0.4882170688198291,
"grad_norm": 1.812458872795105,
"learning_rate": 1.6804548536852537e-05,
"loss": 7.9304,
"step": 5200
},
{
"epoch": 0.4976058586048258,
"grad_norm": 70.96247863769531,
"learning_rate": 1.6700224297115436e-05,
"loss": 7.9148,
"step": 5300
},
{
"epoch": 0.5069946483898226,
"grad_norm": 1.6605011224746704,
"learning_rate": 1.6595900057378335e-05,
"loss": 7.9538,
"step": 5400
},
{
"epoch": 0.5163834381748192,
"grad_norm": 2.0463483333587646,
"learning_rate": 1.649157581764123e-05,
"loss": 8.0002,
"step": 5500
},
{
"epoch": 0.5257722279598159,
"grad_norm": 3.8601722717285156,
"learning_rate": 1.6387251577904126e-05,
"loss": 7.9571,
"step": 5600
},
{
"epoch": 0.5351610177448127,
"grad_norm": 2.184122323989868,
"learning_rate": 1.6282927338167025e-05,
"loss": 7.932,
"step": 5700
},
{
"epoch": 0.5445498075298094,
"grad_norm": 2.165367603302002,
"learning_rate": 1.617860309842992e-05,
"loss": 7.9047,
"step": 5800
},
{
"epoch": 0.5539385973148061,
"grad_norm": 1.5312166213989258,
"learning_rate": 1.607427885869282e-05,
"loss": 7.9353,
"step": 5900
},
{
"epoch": 0.5633273870998028,
"grad_norm": 21.75490379333496,
"learning_rate": 1.5969954618955715e-05,
"loss": 7.9203,
"step": 6000
},
{
"epoch": 0.5727161768847996,
"grad_norm": 1.8674250841140747,
"learning_rate": 1.5865630379218614e-05,
"loss": 7.8967,
"step": 6100
},
{
"epoch": 0.5821049666697963,
"grad_norm": 49.87809371948242,
"learning_rate": 1.576130613948151e-05,
"loss": 7.9414,
"step": 6200
},
{
"epoch": 0.591493756454793,
"grad_norm": 54.42366409301758,
"learning_rate": 1.5658025142141778e-05,
"loss": 7.9631,
"step": 6300
},
{
"epoch": 0.6008825462397896,
"grad_norm": 37.58320236206055,
"learning_rate": 1.5554744144802047e-05,
"loss": 7.9606,
"step": 6400
},
{
"epoch": 0.6102713360247864,
"grad_norm": 3.1502482891082764,
"learning_rate": 1.5450419905064945e-05,
"loss": 7.9377,
"step": 6500
},
{
"epoch": 0.6196601258097831,
"grad_norm": 2.5369224548339844,
"learning_rate": 1.534609566532784e-05,
"loss": 7.9108,
"step": 6600
},
{
"epoch": 0.6290489155947798,
"grad_norm": 2.5891005992889404,
"learning_rate": 1.5241771425590736e-05,
"loss": 7.9225,
"step": 6700
},
{
"epoch": 0.6384377053797765,
"grad_norm": 1.7794080972671509,
"learning_rate": 1.5137447185853635e-05,
"loss": 7.9154,
"step": 6800
},
{
"epoch": 0.6478264951647733,
"grad_norm": 2.324805974960327,
"learning_rate": 1.5033122946116532e-05,
"loss": 7.9191,
"step": 6900
},
{
"epoch": 0.65721528494977,
"grad_norm": 2.601715564727783,
"learning_rate": 1.4928798706379428e-05,
"loss": 7.8903,
"step": 7000
},
{
"epoch": 0.6666040747347667,
"grad_norm": 2.4438092708587646,
"learning_rate": 1.4824474466642325e-05,
"loss": 7.9213,
"step": 7100
},
{
"epoch": 0.6759928645197634,
"grad_norm": 8.118125915527344,
"learning_rate": 1.4720150226905224e-05,
"loss": 7.9202,
"step": 7200
},
{
"epoch": 0.6853816543047602,
"grad_norm": 3.908555746078491,
"learning_rate": 1.461582598716812e-05,
"loss": 7.8998,
"step": 7300
},
{
"epoch": 0.6947704440897569,
"grad_norm": 2.72293758392334,
"learning_rate": 1.4511501747431017e-05,
"loss": 7.9153,
"step": 7400
},
{
"epoch": 0.7041592338747535,
"grad_norm": 3.108797073364258,
"learning_rate": 1.4407177507693915e-05,
"loss": 7.9037,
"step": 7500
},
{
"epoch": 0.7135480236597502,
"grad_norm": 2.6256439685821533,
"learning_rate": 1.430285326795681e-05,
"loss": 7.9146,
"step": 7600
},
{
"epoch": 0.7229368134447469,
"grad_norm": 3.5525624752044678,
"learning_rate": 1.4198529028219708e-05,
"loss": 7.8972,
"step": 7700
},
{
"epoch": 0.7323256032297437,
"grad_norm": 2.6983673572540283,
"learning_rate": 1.4094204788482607e-05,
"loss": 7.9374,
"step": 7800
},
{
"epoch": 0.7417143930147404,
"grad_norm": 1.545486569404602,
"learning_rate": 1.3989880548745502e-05,
"loss": 7.8647,
"step": 7900
},
{
"epoch": 0.7511031827997371,
"grad_norm": 2.5116941928863525,
"learning_rate": 1.38855563090084e-05,
"loss": 7.8915,
"step": 8000
},
{
"epoch": 0.7604919725847338,
"grad_norm": 1.8576518297195435,
"learning_rate": 1.3781232069271295e-05,
"loss": 7.8846,
"step": 8100
},
{
"epoch": 0.7698807623697306,
"grad_norm": 3.3226571083068848,
"learning_rate": 1.3676907829534194e-05,
"loss": 7.8988,
"step": 8200
},
{
"epoch": 0.7792695521547273,
"grad_norm": 2.946324586868286,
"learning_rate": 1.3572583589797091e-05,
"loss": 7.8702,
"step": 8300
},
{
"epoch": 0.788658341939724,
"grad_norm": 2.5089969635009766,
"learning_rate": 1.3468259350059986e-05,
"loss": 7.923,
"step": 8400
},
{
"epoch": 0.7980471317247206,
"grad_norm": 2.2807912826538086,
"learning_rate": 1.3363935110322885e-05,
"loss": 7.891,
"step": 8500
},
{
"epoch": 0.8074359215097174,
"grad_norm": 2.5889735221862793,
"learning_rate": 1.3259610870585782e-05,
"loss": 7.8832,
"step": 8600
},
{
"epoch": 0.8168247112947141,
"grad_norm": 2.8306784629821777,
"learning_rate": 1.3155286630848678e-05,
"loss": 7.8726,
"step": 8700
},
{
"epoch": 0.8262135010797108,
"grad_norm": 2.626786231994629,
"learning_rate": 1.3050962391111577e-05,
"loss": 7.8813,
"step": 8800
},
{
"epoch": 0.8356022908647075,
"grad_norm": 3.195319414138794,
"learning_rate": 1.2946638151374474e-05,
"loss": 7.8986,
"step": 8900
},
{
"epoch": 0.8449910806497043,
"grad_norm": 5.254043102264404,
"learning_rate": 1.284231391163737e-05,
"loss": 7.8743,
"step": 9000
},
{
"epoch": 0.854379870434701,
"grad_norm": 2.9493279457092285,
"learning_rate": 1.2737989671900267e-05,
"loss": 7.8791,
"step": 9100
},
{
"epoch": 0.8637686602196977,
"grad_norm": 3.130415439605713,
"learning_rate": 1.2633665432163165e-05,
"loss": 7.8783,
"step": 9200
},
{
"epoch": 0.8731574500046944,
"grad_norm": 4.030152797698975,
"learning_rate": 1.2529341192426061e-05,
"loss": 7.8528,
"step": 9300
},
{
"epoch": 0.8825462397896912,
"grad_norm": 2.9882099628448486,
"learning_rate": 1.2425016952688958e-05,
"loss": 7.8864,
"step": 9400
},
{
"epoch": 0.8919350295746878,
"grad_norm": 3.802172899246216,
"learning_rate": 1.2320692712951855e-05,
"loss": 7.8989,
"step": 9500
},
{
"epoch": 0.9013238193596845,
"grad_norm": 2.724433183670044,
"learning_rate": 1.2216368473214752e-05,
"loss": 7.8617,
"step": 9600
},
{
"epoch": 0.9107126091446812,
"grad_norm": 2.459376573562622,
"learning_rate": 1.211204423347765e-05,
"loss": 7.8371,
"step": 9700
},
{
"epoch": 0.920101398929678,
"grad_norm": 4.715926647186279,
"learning_rate": 1.2007719993740547e-05,
"loss": 7.8566,
"step": 9800
},
{
"epoch": 0.9294901887146747,
"grad_norm": 2.6845057010650635,
"learning_rate": 1.1903395754003444e-05,
"loss": 7.8776,
"step": 9900
},
{
"epoch": 0.9388789784996714,
"grad_norm": 2.62907075881958,
"learning_rate": 1.1799071514266341e-05,
"loss": 7.8558,
"step": 10000
},
{
"epoch": 0.9388789784996714,
"eval_loss": 7.849188327789307,
"eval_runtime": 1155.7489,
"eval_samples_per_second": 294.897,
"eval_steps_per_second": 2.304,
"step": 10000
},
{
"epoch": 0.9482677682846681,
"grad_norm": 4.570381164550781,
"learning_rate": 1.1694747274529237e-05,
"loss": 7.848,
"step": 10100
},
{
"epoch": 0.9576565580696648,
"grad_norm": 21.764062881469727,
"learning_rate": 1.1590423034792135e-05,
"loss": 7.8227,
"step": 10200
},
{
"epoch": 0.9670453478546616,
"grad_norm": 18.442140579223633,
"learning_rate": 1.1486098795055033e-05,
"loss": 7.8311,
"step": 10300
},
{
"epoch": 0.9764341376396583,
"grad_norm": 4.737902641296387,
"learning_rate": 1.1381774555317928e-05,
"loss": 7.8437,
"step": 10400
},
{
"epoch": 0.9858229274246549,
"grad_norm": 3.0295650959014893,
"learning_rate": 1.1277450315580827e-05,
"loss": 7.8454,
"step": 10500
},
{
"epoch": 0.9952117172096516,
"grad_norm": 3.0269651412963867,
"learning_rate": 1.1173126075843724e-05,
"loss": 7.8362,
"step": 10600
},
{
"epoch": 1.0046005069946484,
"grad_norm": 4.033662796020508,
"learning_rate": 1.1069845078503991e-05,
"loss": 7.8681,
"step": 10700
},
{
"epoch": 1.013989296779645,
"grad_norm": 3.5319488048553467,
"learning_rate": 1.0965520838766888e-05,
"loss": 7.8745,
"step": 10800
},
{
"epoch": 1.0233780865646418,
"grad_norm": 2.7731130123138428,
"learning_rate": 1.0861196599029787e-05,
"loss": 7.8339,
"step": 10900
},
{
"epoch": 1.0327668763496385,
"grad_norm": 4.000971794128418,
"learning_rate": 1.0756872359292683e-05,
"loss": 7.8458,
"step": 11000
},
{
"epoch": 1.0421556661346352,
"grad_norm": 15.05604362487793,
"learning_rate": 1.065254811955558e-05,
"loss": 7.8493,
"step": 11100
},
{
"epoch": 1.0515444559196319,
"grad_norm": 4.498584747314453,
"learning_rate": 1.0548223879818477e-05,
"loss": 7.8317,
"step": 11200
},
{
"epoch": 1.0609332457046288,
"grad_norm": 2.8218085765838623,
"learning_rate": 1.0443899640081374e-05,
"loss": 7.841,
"step": 11300
},
{
"epoch": 1.0703220354896255,
"grad_norm": 3.627685785293579,
"learning_rate": 1.0339575400344271e-05,
"loss": 7.8292,
"step": 11400
},
{
"epoch": 1.0797108252746221,
"grad_norm": 4.804520606994629,
"learning_rate": 1.0235251160607167e-05,
"loss": 7.8121,
"step": 11500
},
{
"epoch": 1.0890996150596188,
"grad_norm": 15.256156921386719,
"learning_rate": 1.0130926920870066e-05,
"loss": 7.8165,
"step": 11600
},
{
"epoch": 1.0984884048446155,
"grad_norm": 3.684401273727417,
"learning_rate": 1.0026602681132963e-05,
"loss": 7.8259,
"step": 11700
},
{
"epoch": 1.1078771946296122,
"grad_norm": 3.7146763801574707,
"learning_rate": 9.92227844139586e-06,
"loss": 7.8303,
"step": 11800
},
{
"epoch": 1.117265984414609,
"grad_norm": 3.4437708854675293,
"learning_rate": 9.817954201658755e-06,
"loss": 7.809,
"step": 11900
},
{
"epoch": 1.1266547741996056,
"grad_norm": 4.232120990753174,
"learning_rate": 9.713629961921654e-06,
"loss": 7.818,
"step": 12000
},
{
"epoch": 1.1360435639846025,
"grad_norm": 3.4967739582061768,
"learning_rate": 9.60930572218455e-06,
"loss": 7.8071,
"step": 12100
},
{
"epoch": 1.1454323537695992,
"grad_norm": 10.542444229125977,
"learning_rate": 9.504981482447447e-06,
"loss": 7.801,
"step": 12200
},
{
"epoch": 1.1548211435545959,
"grad_norm": 3.744981527328491,
"learning_rate": 9.400657242710344e-06,
"loss": 7.8123,
"step": 12300
},
{
"epoch": 1.1642099333395926,
"grad_norm": 3.3549323081970215,
"learning_rate": 9.296333002973241e-06,
"loss": 7.8203,
"step": 12400
},
{
"epoch": 1.1735987231245892,
"grad_norm": 5.337845325469971,
"learning_rate": 9.192008763236138e-06,
"loss": 7.8609,
"step": 12500
},
{
"epoch": 1.182987512909586,
"grad_norm": 3.204465627670288,
"learning_rate": 9.087684523499036e-06,
"loss": 7.7782,
"step": 12600
},
{
"epoch": 1.1923763026945826,
"grad_norm": 4.669897079467773,
"learning_rate": 8.983360283761933e-06,
"loss": 7.8092,
"step": 12700
},
{
"epoch": 1.2017650924795793,
"grad_norm": 3.1824800968170166,
"learning_rate": 8.87903604402483e-06,
"loss": 7.815,
"step": 12800
},
{
"epoch": 1.211153882264576,
"grad_norm": 3.6459527015686035,
"learning_rate": 8.774711804287727e-06,
"loss": 7.8196,
"step": 12900
},
{
"epoch": 1.220542672049573,
"grad_norm": 3.732983112335205,
"learning_rate": 8.670387564550624e-06,
"loss": 7.8206,
"step": 13000
},
{
"epoch": 1.2299314618345696,
"grad_norm": 4.478656768798828,
"learning_rate": 8.566063324813521e-06,
"loss": 7.8022,
"step": 13100
},
{
"epoch": 1.2393202516195663,
"grad_norm": 3.7781801223754883,
"learning_rate": 8.461739085076418e-06,
"loss": 7.8043,
"step": 13200
},
{
"epoch": 1.248709041404563,
"grad_norm": 5.932605743408203,
"learning_rate": 8.357414845339316e-06,
"loss": 7.7823,
"step": 13300
},
{
"epoch": 1.2580978311895596,
"grad_norm": 3.8288068771362305,
"learning_rate": 8.254133847999584e-06,
"loss": 7.8061,
"step": 13400
},
{
"epoch": 1.2674866209745563,
"grad_norm": 4.60470724105835,
"learning_rate": 8.14980960826248e-06,
"loss": 7.8016,
"step": 13500
},
{
"epoch": 1.276875410759553,
"grad_norm": 5.450839996337891,
"learning_rate": 8.045485368525377e-06,
"loss": 7.8076,
"step": 13600
},
{
"epoch": 1.28626420054455,
"grad_norm": 7.866298198699951,
"learning_rate": 7.941161128788276e-06,
"loss": 7.7996,
"step": 13700
},
{
"epoch": 1.2956529903295464,
"grad_norm": 3.059967041015625,
"learning_rate": 7.836836889051171e-06,
"loss": 7.8035,
"step": 13800
},
{
"epoch": 1.3050417801145433,
"grad_norm": 3.5380911827087402,
"learning_rate": 7.732512649314069e-06,
"loss": 7.8092,
"step": 13900
},
{
"epoch": 1.31443056989954,
"grad_norm": 4.589097499847412,
"learning_rate": 7.628188409576966e-06,
"loss": 7.7902,
"step": 14000
},
{
"epoch": 1.3238193596845367,
"grad_norm": 6.932407855987549,
"learning_rate": 7.523864169839863e-06,
"loss": 7.8114,
"step": 14100
},
{
"epoch": 1.3332081494695334,
"grad_norm": 3.5786869525909424,
"learning_rate": 7.41953993010276e-06,
"loss": 7.8112,
"step": 14200
},
{
"epoch": 1.34259693925453,
"grad_norm": 4.283187389373779,
"learning_rate": 7.315215690365657e-06,
"loss": 7.8036,
"step": 14300
},
{
"epoch": 1.3519857290395267,
"grad_norm": 14.625285148620605,
"learning_rate": 7.210891450628554e-06,
"loss": 7.8178,
"step": 14400
},
{
"epoch": 1.3613745188245234,
"grad_norm": 3.5072567462921143,
"learning_rate": 7.106567210891451e-06,
"loss": 7.8391,
"step": 14500
},
{
"epoch": 1.3707633086095203,
"grad_norm": 4.140475749969482,
"learning_rate": 7.002242971154349e-06,
"loss": 7.8151,
"step": 14600
},
{
"epoch": 1.380152098394517,
"grad_norm": 6.985396385192871,
"learning_rate": 6.897918731417246e-06,
"loss": 7.7957,
"step": 14700
},
{
"epoch": 1.3895408881795137,
"grad_norm": 3.8024065494537354,
"learning_rate": 6.793594491680142e-06,
"loss": 7.7833,
"step": 14800
},
{
"epoch": 1.3989296779645104,
"grad_norm": 4.183823585510254,
"learning_rate": 6.689270251943039e-06,
"loss": 7.8049,
"step": 14900
},
{
"epoch": 1.408318467749507,
"grad_norm": 3.431105375289917,
"learning_rate": 6.5849460122059365e-06,
"loss": 7.8163,
"step": 15000
},
{
"epoch": 1.408318467749507,
"eval_loss": 7.807833671569824,
"eval_runtime": 335.7694,
"eval_samples_per_second": 1015.063,
"eval_steps_per_second": 7.931,
"step": 15000
},
{
"epoch": 1.4177072575345038,
"grad_norm": 8.183846473693848,
"learning_rate": 6.480621772468834e-06,
"loss": 7.7864,
"step": 15100
},
{
"epoch": 1.4270960473195005,
"grad_norm": 11.66592788696289,
"learning_rate": 6.376297532731731e-06,
"loss": 7.8241,
"step": 15200
},
{
"epoch": 1.4364848371044971,
"grad_norm": 9.620813369750977,
"learning_rate": 6.271973292994628e-06,
"loss": 7.7694,
"step": 15300
},
{
"epoch": 1.4458736268894938,
"grad_norm": 4.250575065612793,
"learning_rate": 6.167649053257525e-06,
"loss": 7.7784,
"step": 15400
},
{
"epoch": 1.4552624166744907,
"grad_norm": 3.8679049015045166,
"learning_rate": 6.0633248135204214e-06,
"loss": 7.7628,
"step": 15500
},
{
"epoch": 1.4646512064594874,
"grad_norm": 4.648382186889648,
"learning_rate": 5.959000573783319e-06,
"loss": 7.8044,
"step": 15600
},
{
"epoch": 1.4740399962444841,
"grad_norm": 4.5424113273620605,
"learning_rate": 5.854676334046217e-06,
"loss": 7.7871,
"step": 15700
},
{
"epoch": 1.4834287860294808,
"grad_norm": 4.026553630828857,
"learning_rate": 5.750352094309113e-06,
"loss": 7.809,
"step": 15800
},
{
"epoch": 1.4928175758144775,
"grad_norm": 6.175102233886719,
"learning_rate": 5.647071096969381e-06,
"loss": 7.7955,
"step": 15900
},
{
"epoch": 1.5022063655994742,
"grad_norm": 4.672608375549316,
"learning_rate": 5.542746857232279e-06,
"loss": 7.8056,
"step": 16000
},
{
"epoch": 1.5115951553844709,
"grad_norm": 7.012312412261963,
"learning_rate": 5.438422617495176e-06,
"loss": 7.774,
"step": 16100
},
{
"epoch": 1.5209839451694678,
"grad_norm": 5.2042131423950195,
"learning_rate": 5.334098377758072e-06,
"loss": 7.7874,
"step": 16200
},
{
"epoch": 1.5303727349544642,
"grad_norm": 3.745805501937866,
"learning_rate": 5.22977413802097e-06,
"loss": 7.7918,
"step": 16300
},
{
"epoch": 1.5397615247394612,
"grad_norm": 4.060446262359619,
"learning_rate": 5.125449898283867e-06,
"loss": 7.7787,
"step": 16400
},
{
"epoch": 1.5491503145244578,
"grad_norm": 21.851919174194336,
"learning_rate": 5.021125658546764e-06,
"loss": 7.7881,
"step": 16500
},
{
"epoch": 1.5585391043094545,
"grad_norm": 4.261013507843018,
"learning_rate": 4.916801418809661e-06,
"loss": 7.7723,
"step": 16600
},
{
"epoch": 1.5679278940944512,
"grad_norm": 3.9473931789398193,
"learning_rate": 4.812477179072558e-06,
"loss": 7.7809,
"step": 16700
},
{
"epoch": 1.577316683879448,
"grad_norm": 6.088964939117432,
"learning_rate": 4.709196181732826e-06,
"loss": 7.8096,
"step": 16800
},
{
"epoch": 1.5867054736644448,
"grad_norm": 7.912614822387695,
"learning_rate": 4.604871941995723e-06,
"loss": 7.7559,
"step": 16900
},
{
"epoch": 1.5960942634494413,
"grad_norm": 7.268245697021484,
"learning_rate": 4.50054770225862e-06,
"loss": 7.8063,
"step": 17000
}
],
"logging_steps": 100,
"max_steps": 21302,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}