{
"best_metric": 1.3470451831817627,
"best_model_checkpoint": "./output/checkpoint-4200",
"epoch": 0.11244377811094453,
"eval_steps": 150,
"global_step": 4200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002677232812165346,
"grad_norm": 12.16030502319336,
"learning_rate": 4.4e-06,
"loss": 1.5442,
"step": 10
},
{
"epoch": 0.0005354465624330692,
"grad_norm": 12.028721809387207,
"learning_rate": 8.8e-06,
"loss": 1.5341,
"step": 20
},
{
"epoch": 0.0008031698436496038,
"grad_norm": 9.741438865661621,
"learning_rate": 1.3199999999999999e-05,
"loss": 1.5258,
"step": 30
},
{
"epoch": 0.0010708931248661383,
"grad_norm": 11.791377067565918,
"learning_rate": 1.76e-05,
"loss": 1.4766,
"step": 40
},
{
"epoch": 0.0013386164060826729,
"grad_norm": 10.31489372253418,
"learning_rate": 2.2e-05,
"loss": 1.4898,
"step": 50
},
{
"epoch": 0.0016063396872992076,
"grad_norm": 11.65046501159668,
"learning_rate": 2.6399999999999998e-05,
"loss": 1.4536,
"step": 60
},
{
"epoch": 0.0018740629685157421,
"grad_norm": 11.001107215881348,
"learning_rate": 3.0799999999999996e-05,
"loss": 1.4933,
"step": 70
},
{
"epoch": 0.0021417862497322766,
"grad_norm": 10.670427322387695,
"learning_rate": 3.52e-05,
"loss": 1.4816,
"step": 80
},
{
"epoch": 0.002409509530948811,
"grad_norm": 11.35387134552002,
"learning_rate": 3.96e-05,
"loss": 1.4636,
"step": 90
},
{
"epoch": 0.0026772328121653457,
"grad_norm": 10.275943756103516,
"learning_rate": 4.4e-05,
"loss": 1.4996,
"step": 100
},
{
"epoch": 0.0029449560933818807,
"grad_norm": 9.64588451385498,
"learning_rate": 4.399954783308405e-05,
"loss": 1.5114,
"step": 110
},
{
"epoch": 0.003212679374598415,
"grad_norm": 9.566597938537598,
"learning_rate": 4.399819135092302e-05,
"loss": 1.542,
"step": 120
},
{
"epoch": 0.0034804026558149497,
"grad_norm": 10.304100036621094,
"learning_rate": 4.399593060927658e-05,
"loss": 1.4911,
"step": 130
},
{
"epoch": 0.0037481259370314842,
"grad_norm": 8.622906684875488,
"learning_rate": 4.3992765701074955e-05,
"loss": 1.4655,
"step": 140
},
{
"epoch": 0.004015849218248019,
"grad_norm": 9.855925559997559,
"learning_rate": 4.398869675641513e-05,
"loss": 1.5424,
"step": 150
},
{
"epoch": 0.004015849218248019,
"eval_loss": 1.5061633586883545,
"eval_runtime": 76.7585,
"eval_samples_per_second": 6.514,
"eval_steps_per_second": 6.514,
"step": 150
},
{
"epoch": 0.004283572499464553,
"grad_norm": 9.257708549499512,
"learning_rate": 4.398372394255549e-05,
"loss": 1.4863,
"step": 160
},
{
"epoch": 0.004551295780681088,
"grad_norm": 7.029283046722412,
"learning_rate": 4.397784746390892e-05,
"loss": 1.504,
"step": 170
},
{
"epoch": 0.004819019061897622,
"grad_norm": 8.059552192687988,
"learning_rate": 4.3971067562034454e-05,
"loss": 1.4734,
"step": 180
},
{
"epoch": 0.005086742343114157,
"grad_norm": 8.756830215454102,
"learning_rate": 4.39633845156273e-05,
"loss": 1.5308,
"step": 190
},
{
"epoch": 0.005354465624330691,
"grad_norm": 8.60611629486084,
"learning_rate": 4.39547986405074e-05,
"loss": 1.4985,
"step": 200
},
{
"epoch": 0.005622188905547227,
"grad_norm": 8.613848686218262,
"learning_rate": 4.3945310289606455e-05,
"loss": 1.493,
"step": 210
},
{
"epoch": 0.005889912186763761,
"grad_norm": 7.788877010345459,
"learning_rate": 4.39349198529534e-05,
"loss": 1.492,
"step": 220
},
{
"epoch": 0.006157635467980296,
"grad_norm": 7.323453903198242,
"learning_rate": 4.39236277576584e-05,
"loss": 1.5138,
"step": 230
},
{
"epoch": 0.00642535874919683,
"grad_norm": 7.350474834442139,
"learning_rate": 4.391143446789526e-05,
"loss": 1.5314,
"step": 240
},
{
"epoch": 0.006693082030413365,
"grad_norm": 7.383894443511963,
"learning_rate": 4.389834048488236e-05,
"loss": 1.5231,
"step": 250
},
{
"epoch": 0.006960805311629899,
"grad_norm": 7.436572551727295,
"learning_rate": 4.388434634686206e-05,
"loss": 1.5264,
"step": 260
},
{
"epoch": 0.007228528592846434,
"grad_norm": 7.9078779220581055,
"learning_rate": 4.386945262907856e-05,
"loss": 1.4744,
"step": 270
},
{
"epoch": 0.0074962518740629685,
"grad_norm": 7.032011985778809,
"learning_rate": 4.3853659943754275e-05,
"loss": 1.4156,
"step": 280
},
{
"epoch": 0.007763975155279503,
"grad_norm": 7.743986129760742,
"learning_rate": 4.383696894006463e-05,
"loss": 1.5338,
"step": 290
},
{
"epoch": 0.008031698436496038,
"grad_norm": 7.996994495391846,
"learning_rate": 4.381938030411141e-05,
"loss": 1.4912,
"step": 300
},
{
"epoch": 0.008031698436496038,
"eval_loss": 1.5058677196502686,
"eval_runtime": 76.63,
"eval_samples_per_second": 6.525,
"eval_steps_per_second": 6.525,
"step": 300
},
{
"epoch": 0.008299421717712573,
"grad_norm": 6.838953971862793,
"learning_rate": 4.380089475889457e-05,
"loss": 1.4854,
"step": 310
},
{
"epoch": 0.008567144998929107,
"grad_norm": 7.260242462158203,
"learning_rate": 4.378151306428244e-05,
"loss": 1.5401,
"step": 320
},
{
"epoch": 0.008834868280145642,
"grad_norm": 6.737756729125977,
"learning_rate": 4.3761236016980594e-05,
"loss": 1.5013,
"step": 330
},
{
"epoch": 0.009102591561362176,
"grad_norm": 6.963688373565674,
"learning_rate": 4.3740064450499026e-05,
"loss": 1.4989,
"step": 340
},
{
"epoch": 0.009370314842578711,
"grad_norm": 7.6938557624816895,
"learning_rate": 4.37179992351179e-05,
"loss": 1.5317,
"step": 350
},
{
"epoch": 0.009638038123795245,
"grad_norm": 6.700031757354736,
"learning_rate": 4.3695041277851804e-05,
"loss": 1.405,
"step": 360
},
{
"epoch": 0.00990576140501178,
"grad_norm": 6.925078868865967,
"learning_rate": 4.367119152241245e-05,
"loss": 1.4966,
"step": 370
},
{
"epoch": 0.010173484686228314,
"grad_norm": 6.87849235534668,
"learning_rate": 4.364645094916985e-05,
"loss": 1.4933,
"step": 380
},
{
"epoch": 0.01044120796744485,
"grad_norm": 8.074585914611816,
"learning_rate": 4.3620820575112083e-05,
"loss": 1.4782,
"step": 390
},
{
"epoch": 0.010708931248661383,
"grad_norm": 7.07144832611084,
"learning_rate": 4.359430145380344e-05,
"loss": 1.4871,
"step": 400
},
{
"epoch": 0.010976654529877918,
"grad_norm": 7.2088117599487305,
"learning_rate": 4.356689467534112e-05,
"loss": 1.4855,
"step": 410
},
{
"epoch": 0.011244377811094454,
"grad_norm": 7.868666648864746,
"learning_rate": 4.353860136631044e-05,
"loss": 1.5246,
"step": 420
},
{
"epoch": 0.011512101092310987,
"grad_norm": 7.853616237640381,
"learning_rate": 4.350942268973854e-05,
"loss": 1.5302,
"step": 430
},
{
"epoch": 0.011779824373527523,
"grad_norm": 7.353667259216309,
"learning_rate": 4.347935984504649e-05,
"loss": 1.4305,
"step": 440
},
{
"epoch": 0.012047547654744056,
"grad_norm": 6.460302352905273,
"learning_rate": 4.344841406800012e-05,
"loss": 1.4506,
"step": 450
},
{
"epoch": 0.012047547654744056,
"eval_loss": 1.4909805059432983,
"eval_runtime": 76.606,
"eval_samples_per_second": 6.527,
"eval_steps_per_second": 6.527,
"step": 450
},
{
"epoch": 0.012315270935960592,
"grad_norm": 7.3225226402282715,
"learning_rate": 4.34165866306591e-05,
"loss": 1.461,
"step": 460
},
{
"epoch": 0.012582994217177125,
"grad_norm": 6.5725297927856445,
"learning_rate": 4.3383878841324734e-05,
"loss": 1.4007,
"step": 470
},
{
"epoch": 0.01285071749839366,
"grad_norm": 7.1205315589904785,
"learning_rate": 4.3350292044486125e-05,
"loss": 1.557,
"step": 480
},
{
"epoch": 0.013118440779610194,
"grad_norm": 6.783862113952637,
"learning_rate": 4.331582762076494e-05,
"loss": 1.5214,
"step": 490
},
{
"epoch": 0.01338616406082673,
"grad_norm": 6.475296974182129,
"learning_rate": 4.328048698685865e-05,
"loss": 1.4874,
"step": 500
},
{
"epoch": 0.013653887342043263,
"grad_norm": 7.158287525177002,
"learning_rate": 4.32442715954823e-05,
"loss": 1.4794,
"step": 510
},
{
"epoch": 0.013921610623259799,
"grad_norm": 5.706000804901123,
"learning_rate": 4.320718293530877e-05,
"loss": 1.4921,
"step": 520
},
{
"epoch": 0.014189333904476333,
"grad_norm": 7.483499050140381,
"learning_rate": 4.3169222530907634e-05,
"loss": 1.4899,
"step": 530
},
{
"epoch": 0.014457057185692868,
"grad_norm": 6.9520182609558105,
"learning_rate": 4.313039194268243e-05,
"loss": 1.4908,
"step": 540
},
{
"epoch": 0.014724780466909402,
"grad_norm": 6.7435784339904785,
"learning_rate": 4.309069276680653e-05,
"loss": 1.45,
"step": 550
},
{
"epoch": 0.014992503748125937,
"grad_norm": 7.162035942077637,
"learning_rate": 4.305012663515759e-05,
"loss": 1.4702,
"step": 560
},
{
"epoch": 0.015260227029342472,
"grad_norm": 6.717561721801758,
"learning_rate": 4.300869521525039e-05,
"loss": 1.5131,
"step": 570
},
{
"epoch": 0.015527950310559006,
"grad_norm": 6.205082893371582,
"learning_rate": 4.296640021016832e-05,
"loss": 1.4342,
"step": 580
},
{
"epoch": 0.01579567359177554,
"grad_norm": 6.673497676849365,
"learning_rate": 4.292324335849338e-05,
"loss": 1.4917,
"step": 590
},
{
"epoch": 0.016063396872992075,
"grad_norm": 6.291605472564697,
"learning_rate": 4.287922643423471e-05,
"loss": 1.5018,
"step": 600
},
{
"epoch": 0.016063396872992075,
"eval_loss": 1.476717472076416,
"eval_runtime": 76.7435,
"eval_samples_per_second": 6.515,
"eval_steps_per_second": 6.515,
"step": 600
},
{
"epoch": 0.01633112015420861,
"grad_norm": 6.723529815673828,
"learning_rate": 4.283435124675567e-05,
"loss": 1.4652,
"step": 610
},
{
"epoch": 0.016598843435425146,
"grad_norm": 6.2483062744140625,
"learning_rate": 4.278861964069944e-05,
"loss": 1.5094,
"step": 620
},
{
"epoch": 0.01686656671664168,
"grad_norm": 6.773886203765869,
"learning_rate": 4.274203349591324e-05,
"loss": 1.4771,
"step": 630
},
{
"epoch": 0.017134289997858213,
"grad_norm": 7.28003454208374,
"learning_rate": 4.269459472737102e-05,
"loss": 1.436,
"step": 640
},
{
"epoch": 0.017402013279074747,
"grad_norm": 6.813667297363281,
"learning_rate": 4.264630528509473e-05,
"loss": 1.4094,
"step": 650
},
{
"epoch": 0.017669736560291284,
"grad_norm": 7.017402172088623,
"learning_rate": 4.259716715407422e-05,
"loss": 1.5255,
"step": 660
},
{
"epoch": 0.017937459841507818,
"grad_norm": 7.313465595245361,
"learning_rate": 4.254718235418559e-05,
"loss": 1.4647,
"step": 670
},
{
"epoch": 0.01820518312272435,
"grad_norm": 6.323640823364258,
"learning_rate": 4.249635294010819e-05,
"loss": 1.4799,
"step": 680
},
{
"epoch": 0.01847290640394089,
"grad_norm": 7.1620588302612305,
"learning_rate": 4.244468100124014e-05,
"loss": 1.4344,
"step": 690
},
{
"epoch": 0.018740629685157422,
"grad_norm": 6.160943508148193,
"learning_rate": 4.239216866161248e-05,
"loss": 1.516,
"step": 700
},
{
"epoch": 0.019008352966373956,
"grad_norm": 6.571516036987305,
"learning_rate": 4.233881807980179e-05,
"loss": 1.5133,
"step": 710
},
{
"epoch": 0.01927607624759049,
"grad_norm": 5.696547031402588,
"learning_rate": 4.228463144884155e-05,
"loss": 1.4318,
"step": 720
},
{
"epoch": 0.019543799528807027,
"grad_norm": 6.653096675872803,
"learning_rate": 4.2229610996131915e-05,
"loss": 1.461,
"step": 730
},
{
"epoch": 0.01981152281002356,
"grad_norm": 6.497095584869385,
"learning_rate": 4.217375898334819e-05,
"loss": 1.4359,
"step": 740
},
{
"epoch": 0.020079246091240094,
"grad_norm": 6.566861629486084,
"learning_rate": 4.211707770634788e-05,
"loss": 1.445,
"step": 750
},
{
"epoch": 0.020079246091240094,
"eval_loss": 1.4696097373962402,
"eval_runtime": 76.6203,
"eval_samples_per_second": 6.526,
"eval_steps_per_second": 6.526,
"step": 750
},
{
"epoch": 0.020346969372456628,
"grad_norm": 6.802274703979492,
"learning_rate": 4.205956949507625e-05,
"loss": 1.4485,
"step": 760
},
{
"epoch": 0.020614692653673165,
"grad_norm": 6.832027912139893,
"learning_rate": 4.200123671347065e-05,
"loss": 1.5034,
"step": 770
},
{
"epoch": 0.0208824159348897,
"grad_norm": 7.041072368621826,
"learning_rate": 4.1942081759363236e-05,
"loss": 1.5225,
"step": 780
},
{
"epoch": 0.021150139216106232,
"grad_norm": 6.773481369018555,
"learning_rate": 4.1882107064382496e-05,
"loss": 1.4718,
"step": 790
},
{
"epoch": 0.021417862497322766,
"grad_norm": 6.902709007263184,
"learning_rate": 4.1821315093853216e-05,
"loss": 1.4562,
"step": 800
},
{
"epoch": 0.021685585778539303,
"grad_norm": 5.957550048828125,
"learning_rate": 4.1759708346695215e-05,
"loss": 1.4798,
"step": 810
},
{
"epoch": 0.021953309059755836,
"grad_norm": 6.139000415802002,
"learning_rate": 4.1697289355320565e-05,
"loss": 1.5084,
"step": 820
},
{
"epoch": 0.02222103234097237,
"grad_norm": 6.227492332458496,
"learning_rate": 4.1634060685529527e-05,
"loss": 1.4597,
"step": 830
},
{
"epoch": 0.022488755622188907,
"grad_norm": 6.531513214111328,
"learning_rate": 4.157002493640506e-05,
"loss": 1.4326,
"step": 840
},
{
"epoch": 0.02275647890340544,
"grad_norm": 6.589105606079102,
"learning_rate": 4.1505184740206006e-05,
"loss": 1.431,
"step": 850
},
{
"epoch": 0.023024202184621975,
"grad_norm": 6.289806842803955,
"learning_rate": 4.143954276225886e-05,
"loss": 1.5167,
"step": 860
},
{
"epoch": 0.023291925465838508,
"grad_norm": 6.444394111633301,
"learning_rate": 4.1373101700848235e-05,
"loss": 1.4948,
"step": 870
},
{
"epoch": 0.023559648747055045,
"grad_norm": 6.666561126708984,
"learning_rate": 4.1305864287105946e-05,
"loss": 1.4879,
"step": 880
},
{
"epoch": 0.02382737202827158,
"grad_norm": 6.169593811035156,
"learning_rate": 4.12378332848987e-05,
"loss": 1.5557,
"step": 890
},
{
"epoch": 0.024095095309488113,
"grad_norm": 6.8753743171691895,
"learning_rate": 4.116901149071457e-05,
"loss": 1.4446,
"step": 900
},
{
"epoch": 0.024095095309488113,
"eval_loss": 1.4628037214279175,
"eval_runtime": 76.661,
"eval_samples_per_second": 6.522,
"eval_steps_per_second": 6.522,
"step": 900
},
{
"epoch": 0.024362818590704646,
"grad_norm": 6.948112964630127,
"learning_rate": 4.1099401733547925e-05,
"loss": 1.4916,
"step": 910
},
{
"epoch": 0.024630541871921183,
"grad_norm": 6.508751392364502,
"learning_rate": 4.102900687478326e-05,
"loss": 1.4659,
"step": 920
},
{
"epoch": 0.024898265153137717,
"grad_norm": 6.732906818389893,
"learning_rate": 4.095782980807749e-05,
"loss": 1.4834,
"step": 930
},
{
"epoch": 0.02516598843435425,
"grad_norm": 6.261349678039551,
"learning_rate": 4.088587345924105e-05,
"loss": 1.4585,
"step": 940
},
{
"epoch": 0.025433711715570784,
"grad_norm": 5.926994323730469,
"learning_rate": 4.081314078611762e-05,
"loss": 1.4887,
"step": 950
},
{
"epoch": 0.02570143499678732,
"grad_norm": 6.746396064758301,
"learning_rate": 4.073963477846249e-05,
"loss": 1.4561,
"step": 960
},
{
"epoch": 0.025969158278003855,
"grad_norm": 6.503916263580322,
"learning_rate": 4.066535845781975e-05,
"loss": 1.5013,
"step": 970
},
{
"epoch": 0.02623688155922039,
"grad_norm": 6.83921480178833,
"learning_rate": 4.059031487739803e-05,
"loss": 1.517,
"step": 980
},
{
"epoch": 0.026504604840436926,
"grad_norm": 6.210860252380371,
"learning_rate": 4.051450712194497e-05,
"loss": 1.4849,
"step": 990
},
{
"epoch": 0.02677232812165346,
"grad_norm": 6.381270885467529,
"learning_rate": 4.043793830762049e-05,
"loss": 1.4685,
"step": 1000
},
{
"epoch": 0.027040051402869993,
"grad_norm": 6.763075351715088,
"learning_rate": 4.036061158186866e-05,
"loss": 1.5412,
"step": 1010
},
{
"epoch": 0.027307774684086527,
"grad_norm": 6.492913722991943,
"learning_rate": 4.028253012328828e-05,
"loss": 1.4398,
"step": 1020
},
{
"epoch": 0.027575497965303064,
"grad_norm": 6.383675575256348,
"learning_rate": 4.0203697141502323e-05,
"loss": 1.4514,
"step": 1030
},
{
"epoch": 0.027843221246519598,
"grad_norm": 6.685030937194824,
"learning_rate": 4.0124115877025874e-05,
"loss": 1.4688,
"step": 1040
},
{
"epoch": 0.02811094452773613,
"grad_norm": 6.681553840637207,
"learning_rate": 4.004378960113303e-05,
"loss": 1.4862,
"step": 1050
},
{
"epoch": 0.02811094452773613,
"eval_loss": 1.4577986001968384,
"eval_runtime": 76.6839,
"eval_samples_per_second": 6.52,
"eval_steps_per_second": 6.52,
"step": 1050
},
{
"epoch": 0.028378667808952665,
"grad_norm": 6.192584037780762,
"learning_rate": 3.996272161572237e-05,
"loss": 1.4383,
"step": 1060
},
{
"epoch": 0.028646391090169202,
"grad_norm": 6.383575439453125,
"learning_rate": 3.988091525318126e-05,
"loss": 1.4015,
"step": 1070
},
{
"epoch": 0.028914114371385736,
"grad_norm": 7.042336940765381,
"learning_rate": 3.979837387624884e-05,
"loss": 1.468,
"step": 1080
},
{
"epoch": 0.02918183765260227,
"grad_norm": 6.60390567779541,
"learning_rate": 3.971510087787784e-05,
"loss": 1.4932,
"step": 1090
},
{
"epoch": 0.029449560933818803,
"grad_norm": 6.5373029708862305,
"learning_rate": 3.9631099681095044e-05,
"loss": 1.4381,
"step": 1100
},
{
"epoch": 0.02971728421503534,
"grad_norm": 7.118472099304199,
"learning_rate": 3.954637373886066e-05,
"loss": 1.4057,
"step": 1110
},
{
"epoch": 0.029985007496251874,
"grad_norm": 7.386786460876465,
"learning_rate": 3.9460926533926315e-05,
"loss": 1.4978,
"step": 1120
},
{
"epoch": 0.030252730777468408,
"grad_norm": 6.418981552124023,
"learning_rate": 3.937476157869193e-05,
"loss": 1.4897,
"step": 1130
},
{
"epoch": 0.030520454058684945,
"grad_norm": 5.941694259643555,
"learning_rate": 3.9287882415061334e-05,
"loss": 1.4381,
"step": 1140
},
{
"epoch": 0.03078817733990148,
"grad_norm": 6.574525833129883,
"learning_rate": 3.9200292614296655e-05,
"loss": 1.4143,
"step": 1150
},
{
"epoch": 0.031055900621118012,
"grad_norm": 6.349545478820801,
"learning_rate": 3.911199577687154e-05,
"loss": 1.3937,
"step": 1160
},
{
"epoch": 0.03132362390233455,
"grad_norm": 6.873767375946045,
"learning_rate": 3.902299553232315e-05,
"loss": 1.4515,
"step": 1170
},
{
"epoch": 0.03159134718355108,
"grad_norm": 6.043056964874268,
"learning_rate": 3.893329553910293e-05,
"loss": 1.5538,
"step": 1180
},
{
"epoch": 0.031859070464767617,
"grad_norm": 6.418127059936523,
"learning_rate": 3.884289948442628e-05,
"loss": 1.4745,
"step": 1190
},
{
"epoch": 0.03212679374598415,
"grad_norm": 6.102353096008301,
"learning_rate": 3.875181108412096e-05,
"loss": 1.4772,
"step": 1200
},
{
"epoch": 0.03212679374598415,
"eval_loss": 1.449093222618103,
"eval_runtime": 76.8079,
"eval_samples_per_second": 6.51,
"eval_steps_per_second": 6.51,
"step": 1200
},
{
"epoch": 0.032394517027200684,
"grad_norm": 6.624678134918213,
"learning_rate": 3.8660034082474316e-05,
"loss": 1.4526,
"step": 1210
},
{
"epoch": 0.03266224030841722,
"grad_norm": 6.601352214813232,
"learning_rate": 3.856757225207944e-05,
"loss": 1.5247,
"step": 1220
},
{
"epoch": 0.03292996358963375,
"grad_norm": 6.580589771270752,
"learning_rate": 3.847442939368002e-05,
"loss": 1.4694,
"step": 1230
},
{
"epoch": 0.03319768687085029,
"grad_norm": 5.504952430725098,
"learning_rate": 3.8380609336014156e-05,
"loss": 1.411,
"step": 1240
},
{
"epoch": 0.033465410152066825,
"grad_norm": 6.643352508544922,
"learning_rate": 3.828611593565694e-05,
"loss": 1.4278,
"step": 1250
},
{
"epoch": 0.03373313343328336,
"grad_norm": 6.425754070281982,
"learning_rate": 3.819095307686197e-05,
"loss": 1.4253,
"step": 1260
},
{
"epoch": 0.03400085671449989,
"grad_norm": 6.0909504890441895,
"learning_rate": 3.809512467140163e-05,
"loss": 1.4681,
"step": 1270
},
{
"epoch": 0.034268579995716426,
"grad_norm": 5.82462215423584,
"learning_rate": 3.799863465840634e-05,
"loss": 1.4275,
"step": 1280
},
{
"epoch": 0.03453630327693296,
"grad_norm": 6.24934720993042,
"learning_rate": 3.790148700420261e-05,
"loss": 1.4313,
"step": 1290
},
{
"epoch": 0.034804026558149494,
"grad_norm": 6.211289882659912,
"learning_rate": 3.7803685702150006e-05,
"loss": 1.4216,
"step": 1300
},
{
"epoch": 0.035071749839366034,
"grad_norm": 5.66799259185791,
"learning_rate": 3.7705234772476984e-05,
"loss": 1.4799,
"step": 1310
},
{
"epoch": 0.03533947312058257,
"grad_norm": 6.351191997528076,
"learning_rate": 3.760613826211567e-05,
"loss": 1.4281,
"step": 1320
},
{
"epoch": 0.0356071964017991,
"grad_norm": 5.707607269287109,
"learning_rate": 3.7506400244535455e-05,
"loss": 1.463,
"step": 1330
},
{
"epoch": 0.035874919683015635,
"grad_norm": 5.780487537384033,
"learning_rate": 3.740602481957561e-05,
"loss": 1.4731,
"step": 1340
},
{
"epoch": 0.03614264296423217,
"grad_norm": 5.354376316070557,
"learning_rate": 3.7305016113276704e-05,
"loss": 1.4492,
"step": 1350
},
{
"epoch": 0.03614264296423217,
"eval_loss": 1.444738745689392,
"eval_runtime": 76.8239,
"eval_samples_per_second": 6.508,
"eval_steps_per_second": 6.508,
"step": 1350
},
{
"epoch": 0.0364103662454487,
"grad_norm": 5.93463134765625,
"learning_rate": 3.7203378277711024e-05,
"loss": 1.4602,
"step": 1360
},
{
"epoch": 0.036678089526665236,
"grad_norm": 5.922112464904785,
"learning_rate": 3.710111549081191e-05,
"loss": 1.4412,
"step": 1370
},
{
"epoch": 0.03694581280788178,
"grad_norm": 6.667977809906006,
"learning_rate": 3.699823195620199e-05,
"loss": 1.4475,
"step": 1380
},
{
"epoch": 0.03721353608909831,
"grad_norm": 6.021790504455566,
"learning_rate": 3.689473190302041e-05,
"loss": 1.4206,
"step": 1390
},
{
"epoch": 0.037481259370314844,
"grad_norm": 6.152276039123535,
"learning_rate": 3.679061958574897e-05,
"loss": 1.4288,
"step": 1400
},
{
"epoch": 0.03774898265153138,
"grad_norm": 5.695444583892822,
"learning_rate": 3.668589928403726e-05,
"loss": 1.4424,
"step": 1410
},
{
"epoch": 0.03801670593274791,
"grad_norm": 6.346884727478027,
"learning_rate": 3.6580575302526706e-05,
"loss": 1.5001,
"step": 1420
},
{
"epoch": 0.038284429213964445,
"grad_norm": 5.674633979797363,
"learning_rate": 3.647465197067368e-05,
"loss": 1.4796,
"step": 1430
},
{
"epoch": 0.03855215249518098,
"grad_norm": 6.168262481689453,
"learning_rate": 3.6368133642571464e-05,
"loss": 1.4428,
"step": 1440
},
{
"epoch": 0.03881987577639751,
"grad_norm": 6.4981369972229,
"learning_rate": 3.6261024696771345e-05,
"loss": 1.4281,
"step": 1450
},
{
"epoch": 0.03908759905761405,
"grad_norm": 5.703588962554932,
"learning_rate": 3.615332953610255e-05,
"loss": 1.3934,
"step": 1460
},
{
"epoch": 0.03935532233883059,
"grad_norm": 5.69433069229126,
"learning_rate": 3.604505258749132e-05,
"loss": 1.4482,
"step": 1470
},
{
"epoch": 0.03962304562004712,
"grad_norm": 5.763268947601318,
"learning_rate": 3.5936198301778945e-05,
"loss": 1.4629,
"step": 1480
},
{
"epoch": 0.039890768901263654,
"grad_norm": 6.472227096557617,
"learning_rate": 3.5826771153538716e-05,
"loss": 1.4301,
"step": 1490
},
{
"epoch": 0.04015849218248019,
"grad_norm": 6.155264854431152,
"learning_rate": 3.571677564089214e-05,
"loss": 1.4703,
"step": 1500
},
{
"epoch": 0.04015849218248019,
"eval_loss": 1.4356995820999146,
"eval_runtime": 76.8148,
"eval_samples_per_second": 6.509,
"eval_steps_per_second": 6.509,
"step": 1500
},
{
"epoch": 0.04042621546369672,
"grad_norm": 6.241977214813232,
"learning_rate": 3.560621628532389e-05,
"loss": 1.4461,
"step": 1510
},
{
"epoch": 0.040693938744913255,
"grad_norm": 6.195336818695068,
"learning_rate": 3.5495097631496066e-05,
"loss": 1.3735,
"step": 1520
},
{
"epoch": 0.040961662026129796,
"grad_norm": 5.899549961090088,
"learning_rate": 3.5383424247061286e-05,
"loss": 1.4787,
"step": 1530
},
{
"epoch": 0.04122938530734633,
"grad_norm": 6.187289714813232,
"learning_rate": 3.5271200722475e-05,
"loss": 1.4413,
"step": 1540
},
{
"epoch": 0.04149710858856286,
"grad_norm": 5.872448444366455,
"learning_rate": 3.515843167080675e-05,
"loss": 1.4317,
"step": 1550
},
{
"epoch": 0.0417648318697794,
"grad_norm": 6.877863883972168,
"learning_rate": 3.5045121727550566e-05,
"loss": 1.4593,
"step": 1560
},
{
"epoch": 0.04203255515099593,
"grad_norm": 6.644635200500488,
"learning_rate": 3.493127555043441e-05,
"loss": 1.4622,
"step": 1570
},
{
"epoch": 0.042300278432212464,
"grad_norm": 6.314537525177002,
"learning_rate": 3.481689781922871e-05,
"loss": 1.5365,
"step": 1580
},
{
"epoch": 0.042568001713429,
"grad_norm": 6.645462989807129,
"learning_rate": 3.470199323555403e-05,
"loss": 1.4534,
"step": 1590
},
{
"epoch": 0.04283572499464553,
"grad_norm": 6.462603569030762,
"learning_rate": 3.4586566522687734e-05,
"loss": 1.4786,
"step": 1600
},
{
"epoch": 0.04310344827586207,
"grad_norm": 6.516161918640137,
"learning_rate": 3.44706224253699e-05,
"loss": 1.3987,
"step": 1610
},
{
"epoch": 0.043371171557078605,
"grad_norm": 6.5383076667785645,
"learning_rate": 3.435416570960824e-05,
"loss": 1.4993,
"step": 1620
},
{
"epoch": 0.04363889483829514,
"grad_norm": 6.37644100189209,
"learning_rate": 3.4237201162482225e-05,
"loss": 1.4527,
"step": 1630
},
{
"epoch": 0.04390661811951167,
"grad_norm": 6.744267463684082,
"learning_rate": 3.411973359194625e-05,
"loss": 1.4213,
"step": 1640
},
{
"epoch": 0.044174341400728206,
"grad_norm": 6.487947463989258,
"learning_rate": 3.400176782663207e-05,
"loss": 1.4266,
"step": 1650
},
{
"epoch": 0.044174341400728206,
"eval_loss": 1.4300212860107422,
"eval_runtime": 76.7363,
"eval_samples_per_second": 6.516,
"eval_steps_per_second": 6.516,
"step": 1650
},
{
"epoch": 0.04444206468194474,
"grad_norm": 5.463531494140625,
"learning_rate": 3.3883308715650246e-05,
"loss": 1.4868,
"step": 1660
},
{
"epoch": 0.044709787963161274,
"grad_norm": 6.814722537994385,
"learning_rate": 3.3764361128390853e-05,
"loss": 1.441,
"step": 1670
},
{
"epoch": 0.044977511244377814,
"grad_norm": 5.653580665588379,
"learning_rate": 3.3644929954323324e-05,
"loss": 1.4674,
"step": 1680
},
{
"epoch": 0.04524523452559435,
"grad_norm": 5.94467306137085,
"learning_rate": 3.3525020102795434e-05,
"loss": 1.4337,
"step": 1690
},
{
"epoch": 0.04551295780681088,
"grad_norm": 6.066728115081787,
"learning_rate": 3.3404636502831555e-05,
"loss": 1.4701,
"step": 1700
},
{
"epoch": 0.045780681088027415,
"grad_norm": 6.075364112854004,
"learning_rate": 3.328378410292994e-05,
"loss": 1.4264,
"step": 1710
},
{
"epoch": 0.04604840436924395,
"grad_norm": 6.536380290985107,
"learning_rate": 3.3162467870859404e-05,
"loss": 1.4928,
"step": 1720
},
{
"epoch": 0.04631612765046048,
"grad_norm": 6.932302951812744,
"learning_rate": 3.3040692793455106e-05,
"loss": 1.4472,
"step": 1730
},
{
"epoch": 0.046583850931677016,
"grad_norm": 6.54493522644043,
"learning_rate": 3.2918463876413504e-05,
"loss": 1.3929,
"step": 1740
},
{
"epoch": 0.04685157421289355,
"grad_norm": 5.732593059539795,
"learning_rate": 3.279578614408664e-05,
"loss": 1.4182,
"step": 1750
},
{
"epoch": 0.04711929749411009,
"grad_norm": 6.897890090942383,
"learning_rate": 3.2672664639275584e-05,
"loss": 1.466,
"step": 1760
},
{
"epoch": 0.047387020775326624,
"grad_norm": 6.196009159088135,
"learning_rate": 3.254910442302319e-05,
"loss": 1.4552,
"step": 1770
},
{
"epoch": 0.04765474405654316,
"grad_norm": 6.375300407409668,
"learning_rate": 3.242511057440597e-05,
"loss": 1.4139,
"step": 1780
},
{
"epoch": 0.04792246733775969,
"grad_norm": 5.817831993103027,
"learning_rate": 3.2300688190325404e-05,
"loss": 1.4855,
"step": 1790
},
{
"epoch": 0.048190190618976225,
"grad_norm": 5.730225563049316,
"learning_rate": 3.217584238529838e-05,
"loss": 1.3845,
"step": 1800
},
{
"epoch": 0.048190190618976225,
"eval_loss": 1.4201979637145996,
"eval_runtime": 76.7247,
"eval_samples_per_second": 6.517,
"eval_steps_per_second": 6.517,
"step": 1800
},
{
"epoch": 0.04845791390019276,
"grad_norm": 7.2009382247924805,
"learning_rate": 3.205057829124693e-05,
"loss": 1.3661,
"step": 1810
},
{
"epoch": 0.04872563718140929,
"grad_norm": 6.178856372833252,
"learning_rate": 3.192490105728736e-05,
"loss": 1.4082,
"step": 1820
},
{
"epoch": 0.04899336046262583,
"grad_norm": 5.327315330505371,
"learning_rate": 3.17988158495185e-05,
"loss": 1.4033,
"step": 1830
},
{
"epoch": 0.04926108374384237,
"grad_norm": 5.719634532928467,
"learning_rate": 3.1672327850809405e-05,
"loss": 1.4505,
"step": 1840
},
{
"epoch": 0.0495288070250589,
"grad_norm": 6.736356735229492,
"learning_rate": 3.154544226058628e-05,
"loss": 1.4521,
"step": 1850
},
{
"epoch": 0.049796530306275434,
"grad_norm": 5.911272048950195,
"learning_rate": 3.1418164294618766e-05,
"loss": 1.452,
"step": 1860
},
{
"epoch": 0.05006425358749197,
"grad_norm": 6.058777332305908,
"learning_rate": 3.129049918480552e-05,
"loss": 1.4431,
"step": 1870
},
{
"epoch": 0.0503319768687085,
"grad_norm": 5.928140640258789,
"learning_rate": 3.116245217895918e-05,
"loss": 1.4781,
"step": 1880
},
{
"epoch": 0.050599700149925035,
"grad_norm": 5.614587783813477,
"learning_rate": 3.1034028540590635e-05,
"loss": 1.3831,
"step": 1890
},
{
"epoch": 0.05086742343114157,
"grad_norm": 6.133342742919922,
"learning_rate": 3.090523354869266e-05,
"loss": 1.4711,
"step": 1900
},
{
"epoch": 0.05113514671235811,
"grad_norm": 7.014552593231201,
"learning_rate": 3.0776072497522916e-05,
"loss": 1.4404,
"step": 1910
},
{
"epoch": 0.05140286999357464,
"grad_norm": 7.53794002532959,
"learning_rate": 3.064655069638632e-05,
"loss": 1.4262,
"step": 1920
},
{
"epoch": 0.05167059327479118,
"grad_norm": 5.97748327255249,
"learning_rate": 3.0516673469416818e-05,
"loss": 1.3836,
"step": 1930
},
{
"epoch": 0.05193831655600771,
"grad_norm": 5.628602504730225,
"learning_rate": 3.0386446155358518e-05,
"loss": 1.4083,
"step": 1940
},
{
"epoch": 0.052206039837224244,
"grad_norm": 6.5765380859375,
"learning_rate": 3.0255874107346232e-05,
"loss": 1.4374,
"step": 1950
},
{
"epoch": 0.052206039837224244,
"eval_loss": 1.4104682207107544,
"eval_runtime": 76.7665,
"eval_samples_per_second": 6.513,
"eval_steps_per_second": 6.513,
"step": 1950
},
{
"epoch": 0.05247376311844078,
"grad_norm": 6.185977458953857,
"learning_rate": 3.012496269268544e-05,
"loss": 1.4185,
"step": 1960
},
{
"epoch": 0.05274148639965731,
"grad_norm": 6.482154846191406,
"learning_rate": 2.9993717292631652e-05,
"loss": 1.4446,
"step": 1970
},
{
"epoch": 0.05300920968087385,
"grad_norm": 6.260786056518555,
"learning_rate": 2.9862143302169223e-05,
"loss": 1.4123,
"step": 1980
},
{
"epoch": 0.053276932962090386,
"grad_norm": 6.361741542816162,
"learning_rate": 2.9730246129789542e-05,
"loss": 1.4646,
"step": 1990
},
{
"epoch": 0.05354465624330692,
"grad_norm": 6.388660430908203,
"learning_rate": 2.9598031197268768e-05,
"loss": 1.4232,
"step": 2000
},
{
"epoch": 0.05381237952452345,
"grad_norm": 6.549992084503174,
"learning_rate": 2.946550393944493e-05,
"loss": 1.398,
"step": 2010
},
{
"epoch": 0.05408010280573999,
"grad_norm": 5.965968132019043,
"learning_rate": 2.933266980399452e-05,
"loss": 1.3618,
"step": 2020
},
{
"epoch": 0.05434782608695652,
"grad_norm": 6.706049919128418,
"learning_rate": 2.9199534251208573e-05,
"loss": 1.4274,
"step": 2030
},
{
"epoch": 0.054615549368173054,
"grad_norm": 5.7372355461120605,
"learning_rate": 2.9066102753768204e-05,
"loss": 1.3954,
"step": 2040
},
{
"epoch": 0.05488327264938959,
"grad_norm": 6.6037468910217285,
"learning_rate": 2.893238079651966e-05,
"loss": 1.3763,
"step": 2050
},
{
"epoch": 0.05515099593060613,
"grad_norm": 6.51908540725708,
"learning_rate": 2.8798373876248843e-05,
"loss": 1.3945,
"step": 2060
},
{
"epoch": 0.05541871921182266,
"grad_norm": 6.044327259063721,
"learning_rate": 2.8664087501455387e-05,
"loss": 1.4487,
"step": 2070
},
{
"epoch": 0.055686442493039195,
"grad_norm": 6.289717674255371,
"learning_rate": 2.852952719212619e-05,
"loss": 1.4311,
"step": 2080
},
{
"epoch": 0.05595416577425573,
"grad_norm": 5.992395401000977,
"learning_rate": 2.8394698479508542e-05,
"loss": 1.3859,
"step": 2090
},
{
"epoch": 0.05622188905547226,
"grad_norm": 6.025457382202148,
"learning_rate": 2.8259606905882712e-05,
"loss": 1.4162,
"step": 2100
},
{
"epoch": 0.05622188905547226,
"eval_loss": 1.4084678888320923,
"eval_runtime": 76.754,
"eval_samples_per_second": 6.514,
"eval_steps_per_second": 6.514,
"step": 2100
},
{
"epoch": 0.056489612336688796,
"grad_norm": 5.75090217590332,
"learning_rate": 2.8124258024334192e-05,
"loss": 1.4478,
"step": 2110
},
{
"epoch": 0.05675733561790533,
"grad_norm": 6.59517240524292,
"learning_rate": 2.7988657398525364e-05,
"loss": 1.4742,
"step": 2120
},
{
"epoch": 0.05702505889912187,
"grad_norm": 5.816342830657959,
"learning_rate": 2.785281060246685e-05,
"loss": 1.4508,
"step": 2130
},
{
"epoch": 0.057292782180338404,
"grad_norm": 5.353818416595459,
"learning_rate": 2.7716723220288365e-05,
"loss": 1.4593,
"step": 2140
},
{
"epoch": 0.05756050546155494,
"grad_norm": 5.926205635070801,
"learning_rate": 2.758040084600916e-05,
"loss": 1.4599,
"step": 2150
},
{
"epoch": 0.05782822874277147,
"grad_norm": 6.504787921905518,
"learning_rate": 2.7443849083308117e-05,
"loss": 1.3973,
"step": 2160
},
{
"epoch": 0.058095952023988005,
"grad_norm": 5.680723190307617,
"learning_rate": 2.7307073545293355e-05,
"loss": 1.4051,
"step": 2170
},
{
"epoch": 0.05836367530520454,
"grad_norm": 7.509329795837402,
"learning_rate": 2.7170079854271533e-05,
"loss": 1.3807,
"step": 2180
},
{
"epoch": 0.05863139858642107,
"grad_norm": 6.508755683898926,
"learning_rate": 2.703287364151672e-05,
"loss": 1.3869,
"step": 2190
},
{
"epoch": 0.058899121867637606,
"grad_norm": 6.544657230377197,
"learning_rate": 2.6895460547038913e-05,
"loss": 1.3409,
"step": 2200
},
{
"epoch": 0.05916684514885415,
"grad_norm": 6.107619762420654,
"learning_rate": 2.6757846219352235e-05,
"loss": 1.389,
"step": 2210
},
{
"epoch": 0.05943456843007068,
"grad_norm": 6.255703449249268,
"learning_rate": 2.6620036315242682e-05,
"loss": 1.4385,
"step": 2220
},
{
"epoch": 0.059702291711287214,
"grad_norm": 5.130046844482422,
"learning_rate": 2.6482036499535665e-05,
"loss": 1.3614,
"step": 2230
},
{
"epoch": 0.05997001499250375,
"grad_norm": 6.854607105255127,
"learning_rate": 2.6343852444863075e-05,
"loss": 1.4465,
"step": 2240
},
{
"epoch": 0.06023773827372028,
"grad_norm": 6.745285511016846,
"learning_rate": 2.6205489831430192e-05,
"loss": 1.4,
"step": 2250
},
{
"epoch": 0.06023773827372028,
"eval_loss": 1.3977503776550293,
"eval_runtime": 76.7205,
"eval_samples_per_second": 6.517,
"eval_steps_per_second": 6.517,
"step": 2250
},
{
"epoch": 0.060505461554936815,
"grad_norm": 5.628711223602295,
"learning_rate": 2.6066954346782113e-05,
"loss": 1.43,
"step": 2260
},
{
"epoch": 0.06077318483615335,
"grad_norm": 6.356362342834473,
"learning_rate": 2.5928251685570005e-05,
"loss": 1.4382,
"step": 2270
},
{
"epoch": 0.06104090811736989,
"grad_norm": 5.99081563949585,
"learning_rate": 2.5789387549317016e-05,
"loss": 1.4363,
"step": 2280
},
{
"epoch": 0.06130863139858642,
"grad_norm": 6.202702522277832,
"learning_rate": 2.5650367646183896e-05,
"loss": 1.3932,
"step": 2290
},
{
"epoch": 0.06157635467980296,
"grad_norm": 6.321465015411377,
"learning_rate": 2.5511197690734344e-05,
"loss": 1.4056,
"step": 2300
},
{
"epoch": 0.06184407796101949,
"grad_norm": 5.804145812988281,
"learning_rate": 2.5371883403700148e-05,
"loss": 1.4132,
"step": 2310
},
{
"epoch": 0.062111801242236024,
"grad_norm": 5.593542098999023,
"learning_rate": 2.5232430511745995e-05,
"loss": 1.4603,
"step": 2320
},
{
"epoch": 0.06237952452345256,
"grad_norm": 6.308177947998047,
"learning_rate": 2.5092844747234063e-05,
"loss": 1.361,
"step": 2330
},
{
"epoch": 0.0626472478046691,
"grad_norm": 5.957157611846924,
"learning_rate": 2.495313184798842e-05,
"loss": 1.435,
"step": 2340
},
{
"epoch": 0.06291497108588563,
"grad_norm": 5.485774517059326,
"learning_rate": 2.4813297557059133e-05,
"loss": 1.413,
"step": 2350
},
{
"epoch": 0.06318269436710217,
"grad_norm": 7.090158939361572,
"learning_rate": 2.467334762248621e-05,
"loss": 1.3819,
"step": 2360
},
{
"epoch": 0.0634504176483187,
"grad_norm": 6.819372653961182,
"learning_rate": 2.4533287797063308e-05,
"loss": 1.4347,
"step": 2370
},
{
"epoch": 0.06371814092953523,
"grad_norm": 5.654256820678711,
"learning_rate": 2.439312383810128e-05,
"loss": 1.3902,
"step": 2380
},
{
"epoch": 0.06398586421075177,
"grad_norm": 6.394632339477539,
"learning_rate": 2.4252861507191487e-05,
"loss": 1.4324,
"step": 2390
},
{
"epoch": 0.0642535874919683,
"grad_norm": 6.346138954162598,
"learning_rate": 2.4112506569969e-05,
"loss": 1.3853,
"step": 2400
},
{
"epoch": 0.0642535874919683,
"eval_loss": 1.389374017715454,
"eval_runtime": 76.6782,
"eval_samples_per_second": 6.521,
"eval_steps_per_second": 6.521,
"step": 2400
},
{
"epoch": 0.06452131077318483,
"grad_norm": 5.798035144805908,
"learning_rate": 2.3972064795875537e-05,
"loss": 1.3668,
"step": 2410
},
{
"epoch": 0.06478903405440137,
"grad_norm": 6.213179588317871,
"learning_rate": 2.3831541957922366e-05,
"loss": 1.3913,
"step": 2420
},
{
"epoch": 0.0650567573356179,
"grad_norm": 6.443445682525635,
"learning_rate": 2.3690943832452967e-05,
"loss": 1.4176,
"step": 2430
},
{
"epoch": 0.06532448061683443,
"grad_norm": 6.543423652648926,
"learning_rate": 2.3550276198905584e-05,
"loss": 1.5036,
"step": 2440
},
{
"epoch": 0.06559220389805097,
"grad_norm": 5.8855977058410645,
"learning_rate": 2.3409544839575687e-05,
"loss": 1.3749,
"step": 2450
},
{
"epoch": 0.0658599271792675,
"grad_norm": 6.113175868988037,
"learning_rate": 2.3268755539378238e-05,
"loss": 1.3555,
"step": 2460
},
{
"epoch": 0.06612765046048405,
"grad_norm": 6.519189357757568,
"learning_rate": 2.3127914085609943e-05,
"loss": 1.3457,
"step": 2470
},
{
"epoch": 0.06639537374170058,
"grad_norm": 6.135042667388916,
"learning_rate": 2.298702626771133e-05,
"loss": 1.4143,
"step": 2480
},
{
"epoch": 0.06666309702291712,
"grad_norm": 6.562228679656982,
"learning_rate": 2.2846097877028762e-05,
"loss": 1.4549,
"step": 2490
},
{
"epoch": 0.06693082030413365,
"grad_norm": 6.2036213874816895,
"learning_rate": 2.270513470657642e-05,
"loss": 1.3422,
"step": 2500
},
{
"epoch": 0.06719854358535018,
"grad_norm": 6.321053981781006,
"learning_rate": 2.25641425507981e-05,
"loss": 1.4206,
"step": 2510
},
{
"epoch": 0.06746626686656672,
"grad_norm": 5.922671794891357,
"learning_rate": 2.2423127205329117e-05,
"loss": 1.4368,
"step": 2520
},
{
"epoch": 0.06773399014778325,
"grad_norm": 6.139718532562256,
"learning_rate": 2.2282094466758e-05,
"loss": 1.3574,
"step": 2530
},
{
"epoch": 0.06800171342899979,
"grad_norm": 5.755593776702881,
"learning_rate": 2.2141050132388245e-05,
"loss": 1.4075,
"step": 2540
},
{
"epoch": 0.06826943671021632,
"grad_norm": 5.7373151779174805,
"learning_rate": 2.2e-05,
"loss": 1.3812,
"step": 2550
},
{
"epoch": 0.06826943671021632,
"eval_loss": 1.3869917392730713,
"eval_runtime": 76.6763,
"eval_samples_per_second": 6.521,
"eval_steps_per_second": 6.521,
"step": 2550
},
{
"epoch": 0.06853715999143285,
"grad_norm": 6.435483932495117,
"learning_rate": 2.1858949867611754e-05,
"loss": 1.3586,
"step": 2560
},
{
"epoch": 0.06880488327264939,
"grad_norm": 5.814359188079834,
"learning_rate": 2.1717905533241997e-05,
"loss": 1.3745,
"step": 2570
},
{
"epoch": 0.06907260655386592,
"grad_norm": 6.140771389007568,
"learning_rate": 2.157687279467088e-05,
"loss": 1.3296,
"step": 2580
},
{
"epoch": 0.06934032983508245,
"grad_norm": 5.861440181732178,
"learning_rate": 2.14358574492019e-05,
"loss": 1.3911,
"step": 2590
},
{
"epoch": 0.06960805311629899,
"grad_norm": 6.584283351898193,
"learning_rate": 2.1294865293423586e-05,
"loss": 1.4143,
"step": 2600
},
{
"epoch": 0.06987577639751552,
"grad_norm": 5.859135627746582,
"learning_rate": 2.1153902122971233e-05,
"loss": 1.3923,
"step": 2610
},
{
"epoch": 0.07014349967873207,
"grad_norm": 6.673269748687744,
"learning_rate": 2.101297373228868e-05,
"loss": 1.4072,
"step": 2620
},
{
"epoch": 0.0704112229599486,
"grad_norm": 5.8205180168151855,
"learning_rate": 2.087208591439006e-05,
"loss": 1.3962,
"step": 2630
},
{
"epoch": 0.07067894624116514,
"grad_norm": 5.918448448181152,
"learning_rate": 2.0731244460621764e-05,
"loss": 1.4121,
"step": 2640
},
{
"epoch": 0.07094666952238167,
"grad_norm": 6.024654865264893,
"learning_rate": 2.0590455160424316e-05,
"loss": 1.3958,
"step": 2650
},
{
"epoch": 0.0712143928035982,
"grad_norm": 6.21071195602417,
"learning_rate": 2.044972380109441e-05,
"loss": 1.4155,
"step": 2660
},
{
"epoch": 0.07148211608481474,
"grad_norm": 6.8569207191467285,
"learning_rate": 2.030905616754704e-05,
"loss": 1.3968,
"step": 2670
},
{
"epoch": 0.07174983936603127,
"grad_norm": 6.207950592041016,
"learning_rate": 2.0168458042077636e-05,
"loss": 1.3722,
"step": 2680
},
{
"epoch": 0.0720175626472478,
"grad_norm": 5.884634494781494,
"learning_rate": 2.0027935204124465e-05,
"loss": 1.4165,
"step": 2690
},
{
"epoch": 0.07228528592846434,
"grad_norm": 5.943591117858887,
"learning_rate": 1.9887493430031e-05,
"loss": 1.4054,
"step": 2700
},
{
"epoch": 0.07228528592846434,
"eval_loss": 1.3810029029846191,
"eval_runtime": 76.6801,
"eval_samples_per_second": 6.521,
"eval_steps_per_second": 6.521,
"step": 2700
},
{
"epoch": 0.07255300920968087,
"grad_norm": 6.2774457931518555,
"learning_rate": 1.9747138492808512e-05,
"loss": 1.4184,
"step": 2710
},
{
"epoch": 0.0728207324908974,
"grad_norm": 6.596461772918701,
"learning_rate": 1.960687616189872e-05,
"loss": 1.4314,
"step": 2720
},
{
"epoch": 0.07308845577211394,
"grad_norm": 6.17069149017334,
"learning_rate": 1.9466712202936694e-05,
"loss": 1.4248,
"step": 2730
},
{
"epoch": 0.07335617905333047,
"grad_norm": 6.085130214691162,
"learning_rate": 1.932665237751379e-05,
"loss": 1.3966,
"step": 2740
},
{
"epoch": 0.073623902334547,
"grad_norm": 6.430164813995361,
"learning_rate": 1.9186702442940866e-05,
"loss": 1.3521,
"step": 2750
},
{
"epoch": 0.07389162561576355,
"grad_norm": 5.946996212005615,
"learning_rate": 1.9046868152011587e-05,
"loss": 1.336,
"step": 2760
},
{
"epoch": 0.07415934889698009,
"grad_norm": 6.169567108154297,
"learning_rate": 1.8907155252765942e-05,
"loss": 1.4099,
"step": 2770
},
{
"epoch": 0.07442707217819662,
"grad_norm": 5.974761962890625,
"learning_rate": 1.8767569488254004e-05,
"loss": 1.3588,
"step": 2780
},
{
"epoch": 0.07469479545941315,
"grad_norm": 5.632639408111572,
"learning_rate": 1.8628116596299847e-05,
"loss": 1.3704,
"step": 2790
},
{
"epoch": 0.07496251874062969,
"grad_norm": 5.559203147888184,
"learning_rate": 1.848880230926566e-05,
"loss": 1.3878,
"step": 2800
},
{
"epoch": 0.07523024202184622,
"grad_norm": 6.6522674560546875,
"learning_rate": 1.8349632353816113e-05,
"loss": 1.4324,
"step": 2810
},
{
"epoch": 0.07549796530306276,
"grad_norm": 5.803051471710205,
"learning_rate": 1.8210612450682986e-05,
"loss": 1.4132,
"step": 2820
},
{
"epoch": 0.07576568858427929,
"grad_norm": 5.304571151733398,
"learning_rate": 1.8071748314429994e-05,
"loss": 1.3607,
"step": 2830
},
{
"epoch": 0.07603341186549582,
"grad_norm": 5.904123783111572,
"learning_rate": 1.7933045653217886e-05,
"loss": 1.3963,
"step": 2840
},
{
"epoch": 0.07630113514671236,
"grad_norm": 5.9972686767578125,
"learning_rate": 1.7794510168569814e-05,
"loss": 1.4353,
"step": 2850
},
{
"epoch": 0.07630113514671236,
"eval_loss": 1.3754030466079712,
"eval_runtime": 76.726,
"eval_samples_per_second": 6.517,
"eval_steps_per_second": 6.517,
"step": 2850
},
{
"epoch": 0.07656885842792889,
"grad_norm": 6.276034832000732,
"learning_rate": 1.7656147555136924e-05,
"loss": 1.3894,
"step": 2860
},
{
"epoch": 0.07683658170914542,
"grad_norm": 6.091196060180664,
"learning_rate": 1.7517963500464338e-05,
"loss": 1.3956,
"step": 2870
},
{
"epoch": 0.07710430499036196,
"grad_norm": 6.1393513679504395,
"learning_rate": 1.7379963684757313e-05,
"loss": 1.4192,
"step": 2880
},
{
"epoch": 0.07737202827157849,
"grad_norm": 6.082838535308838,
"learning_rate": 1.7242153780647764e-05,
"loss": 1.3598,
"step": 2890
},
{
"epoch": 0.07763975155279502,
"grad_norm": 7.009051322937012,
"learning_rate": 1.7104539452961086e-05,
"loss": 1.3388,
"step": 2900
},
{
"epoch": 0.07790747483401157,
"grad_norm": 6.073249340057373,
"learning_rate": 1.6967126358483283e-05,
"loss": 1.4014,
"step": 2910
},
{
"epoch": 0.0781751981152281,
"grad_norm": 5.901647567749023,
"learning_rate": 1.6829920145728465e-05,
"loss": 1.3795,
"step": 2920
},
{
"epoch": 0.07844292139644464,
"grad_norm": 5.663522243499756,
"learning_rate": 1.6692926454706644e-05,
"loss": 1.4444,
"step": 2930
},
{
"epoch": 0.07871064467766117,
"grad_norm": 6.163628578186035,
"learning_rate": 1.655615091669189e-05,
"loss": 1.3579,
"step": 2940
},
{
"epoch": 0.07897836795887771,
"grad_norm": 5.641757965087891,
"learning_rate": 1.641959915399084e-05,
"loss": 1.3816,
"step": 2950
},
{
"epoch": 0.07924609124009424,
"grad_norm": 5.533544540405273,
"learning_rate": 1.6283276779711637e-05,
"loss": 1.4021,
"step": 2960
},
{
"epoch": 0.07951381452131077,
"grad_norm": 6.252144813537598,
"learning_rate": 1.614718939753315e-05,
"loss": 1.3424,
"step": 2970
},
{
"epoch": 0.07978153780252731,
"grad_norm": 6.014984607696533,
"learning_rate": 1.6011342601474635e-05,
"loss": 1.3733,
"step": 2980
},
{
"epoch": 0.08004926108374384,
"grad_norm": 6.043126106262207,
"learning_rate": 1.5875741975665813e-05,
"loss": 1.4402,
"step": 2990
},
{
"epoch": 0.08031698436496038,
"grad_norm": 6.2684478759765625,
"learning_rate": 1.5740393094117287e-05,
"loss": 1.3955,
"step": 3000
},
{
"epoch": 0.08031698436496038,
"eval_loss": 1.3712314367294312,
"eval_runtime": 77.0642,
"eval_samples_per_second": 6.488,
"eval_steps_per_second": 6.488,
"step": 3000
},
{
"epoch": 0.08058470764617691,
"grad_norm": 6.531871795654297,
"learning_rate": 1.560530152049146e-05,
"loss": 1.3728,
"step": 3010
},
{
"epoch": 0.08085243092739344,
"grad_norm": 6.215692043304443,
"learning_rate": 1.5470472807873805e-05,
"loss": 1.322,
"step": 3020
},
{
"epoch": 0.08112015420860998,
"grad_norm": 5.670928001403809,
"learning_rate": 1.5335912498544615e-05,
"loss": 1.3643,
"step": 3030
},
{
"epoch": 0.08138787748982651,
"grad_norm": 5.801737308502197,
"learning_rate": 1.5201626123751158e-05,
"loss": 1.3653,
"step": 3040
},
{
"epoch": 0.08165560077104304,
"grad_norm": 5.651313781738281,
"learning_rate": 1.5067619203480345e-05,
"loss": 1.3818,
"step": 3050
},
{
"epoch": 0.08192332405225959,
"grad_norm": 6.425565242767334,
"learning_rate": 1.4933897246231798e-05,
"loss": 1.3276,
"step": 3060
},
{
"epoch": 0.08219104733347612,
"grad_norm": 6.21942663192749,
"learning_rate": 1.4800465748791428e-05,
"loss": 1.429,
"step": 3070
},
{
"epoch": 0.08245877061469266,
"grad_norm": 6.354944229125977,
"learning_rate": 1.4667330196005485e-05,
"loss": 1.4254,
"step": 3080
},
{
"epoch": 0.08272649389590919,
"grad_norm": 6.185739517211914,
"learning_rate": 1.4534496060555075e-05,
"loss": 1.3998,
"step": 3090
},
{
"epoch": 0.08299421717712573,
"grad_norm": 5.781863212585449,
"learning_rate": 1.4401968802731235e-05,
"loss": 1.3384,
"step": 3100
},
{
"epoch": 0.08326194045834226,
"grad_norm": 6.628792762756348,
"learning_rate": 1.4269753870210459e-05,
"loss": 1.4146,
"step": 3110
},
{
"epoch": 0.0835296637395588,
"grad_norm": 6.093694686889648,
"learning_rate": 1.4137856697830786e-05,
"loss": 1.3662,
"step": 3120
},
{
"epoch": 0.08379738702077533,
"grad_norm": 6.078185558319092,
"learning_rate": 1.4006282707368348e-05,
"loss": 1.3716,
"step": 3130
},
{
"epoch": 0.08406511030199186,
"grad_norm": 6.203483581542969,
"learning_rate": 1.3875037307314563e-05,
"loss": 1.3371,
"step": 3140
},
{
"epoch": 0.0843328335832084,
"grad_norm": 5.880634307861328,
"learning_rate": 1.374412589265377e-05,
"loss": 1.3464,
"step": 3150
},
{
"epoch": 0.0843328335832084,
"eval_loss": 1.3659946918487549,
"eval_runtime": 77.0452,
"eval_samples_per_second": 6.49,
"eval_steps_per_second": 6.49,
"step": 3150
},
{
"epoch": 0.08460055686442493,
"grad_norm": 6.3485426902771,
"learning_rate": 1.3613553844641483e-05,
"loss": 1.3366,
"step": 3160
},
{
"epoch": 0.08486828014564146,
"grad_norm": 6.721098899841309,
"learning_rate": 1.3483326530583184e-05,
"loss": 1.3628,
"step": 3170
},
{
"epoch": 0.085136003426858,
"grad_norm": 5.912144660949707,
"learning_rate": 1.3353449303613682e-05,
"loss": 1.3403,
"step": 3180
},
{
"epoch": 0.08540372670807453,
"grad_norm": 5.860577583312988,
"learning_rate": 1.3223927502477084e-05,
"loss": 1.3453,
"step": 3190
},
{
"epoch": 0.08567144998929106,
"grad_norm": 6.3982977867126465,
"learning_rate": 1.3094766451307336e-05,
"loss": 1.3556,
"step": 3200
},
{
"epoch": 0.08593917327050761,
"grad_norm": 6.073590278625488,
"learning_rate": 1.2965971459409366e-05,
"loss": 1.3984,
"step": 3210
},
{
"epoch": 0.08620689655172414,
"grad_norm": 6.372732162475586,
"learning_rate": 1.2837547821040825e-05,
"loss": 1.4089,
"step": 3220
},
{
"epoch": 0.08647461983294068,
"grad_norm": 6.449525356292725,
"learning_rate": 1.2709500815194487e-05,
"loss": 1.3884,
"step": 3230
},
{
"epoch": 0.08674234311415721,
"grad_norm": 5.904713153839111,
"learning_rate": 1.2581835705381243e-05,
"loss": 1.3976,
"step": 3240
},
{
"epoch": 0.08701006639537374,
"grad_norm": 6.398531913757324,
"learning_rate": 1.2454557739413722e-05,
"loss": 1.3942,
"step": 3250
},
{
"epoch": 0.08727778967659028,
"grad_norm": 6.1607465744018555,
"learning_rate": 1.2327672149190595e-05,
"loss": 1.3698,
"step": 3260
},
{
"epoch": 0.08754551295780681,
"grad_norm": 5.903096675872803,
"learning_rate": 1.2201184150481497e-05,
"loss": 1.4183,
"step": 3270
},
{
"epoch": 0.08781323623902335,
"grad_norm": 6.210367679595947,
"learning_rate": 1.2075098942712635e-05,
"loss": 1.3717,
"step": 3280
},
{
"epoch": 0.08808095952023988,
"grad_norm": 6.082081317901611,
"learning_rate": 1.1949421708753062e-05,
"loss": 1.3694,
"step": 3290
},
{
"epoch": 0.08834868280145641,
"grad_norm": 5.826544284820557,
"learning_rate": 1.1824157614701629e-05,
"loss": 1.4473,
"step": 3300
},
{
"epoch": 0.08834868280145641,
"eval_loss": 1.3619885444641113,
"eval_runtime": 77.1061,
"eval_samples_per_second": 6.485,
"eval_steps_per_second": 6.485,
"step": 3300
},
{
"epoch": 0.08861640608267295,
"grad_norm": 6.470825672149658,
"learning_rate": 1.1699311809674596e-05,
"loss": 1.357,
"step": 3310
},
{
"epoch": 0.08888412936388948,
"grad_norm": 5.989506244659424,
"learning_rate": 1.157488942559403e-05,
"loss": 1.322,
"step": 3320
},
{
"epoch": 0.08915185264510601,
"grad_norm": 6.708034992218018,
"learning_rate": 1.1450895576976816e-05,
"loss": 1.3652,
"step": 3330
},
{
"epoch": 0.08941957592632255,
"grad_norm": 6.264359474182129,
"learning_rate": 1.1327335360724412e-05,
"loss": 1.3661,
"step": 3340
},
{
"epoch": 0.08968729920753908,
"grad_norm": 6.633790969848633,
"learning_rate": 1.1204213855913374e-05,
"loss": 1.3522,
"step": 3350
},
{
"epoch": 0.08995502248875563,
"grad_norm": 5.526124477386475,
"learning_rate": 1.1081536123586505e-05,
"loss": 1.3492,
"step": 3360
},
{
"epoch": 0.09022274576997216,
"grad_norm": 6.267175197601318,
"learning_rate": 1.09593072065449e-05,
"loss": 1.3805,
"step": 3370
},
{
"epoch": 0.0904904690511887,
"grad_norm": 6.826523780822754,
"learning_rate": 1.0837532129140595e-05,
"loss": 1.3379,
"step": 3380
},
{
"epoch": 0.09075819233240523,
"grad_norm": 6.352426052093506,
"learning_rate": 1.0716215897070067e-05,
"loss": 1.378,
"step": 3390
},
{
"epoch": 0.09102591561362176,
"grad_norm": 6.353774547576904,
"learning_rate": 1.0595363497168449e-05,
"loss": 1.4057,
"step": 3400
},
{
"epoch": 0.0912936388948383,
"grad_norm": 6.023704528808594,
"learning_rate": 1.0474979897204557e-05,
"loss": 1.419,
"step": 3410
},
{
"epoch": 0.09156136217605483,
"grad_norm": 6.525381565093994,
"learning_rate": 1.0355070045676677e-05,
"loss": 1.3737,
"step": 3420
},
{
"epoch": 0.09182908545727136,
"grad_norm": 6.321014404296875,
"learning_rate": 1.0235638871609145e-05,
"loss": 1.3252,
"step": 3430
},
{
"epoch": 0.0920968087384879,
"grad_norm": 5.880143165588379,
"learning_rate": 1.011669128434976e-05,
"loss": 1.3581,
"step": 3440
},
{
"epoch": 0.09236453201970443,
"grad_norm": 6.851429462432861,
"learning_rate": 9.99823217336793e-06,
"loss": 1.4074,
"step": 3450
},
{
"epoch": 0.09236453201970443,
"eval_loss": 1.3579777479171753,
"eval_runtime": 76.8014,
"eval_samples_per_second": 6.51,
"eval_steps_per_second": 6.51,
"step": 3450
},
{
"epoch": 0.09263225530092097,
"grad_norm": 6.41058874130249,
"learning_rate": 9.880266408053746e-06,
"loss": 1.433,
"step": 3460
},
{
"epoch": 0.0928999785821375,
"grad_norm": 5.9317474365234375,
"learning_rate": 9.762798837517776e-06,
"loss": 1.3759,
"step": 3470
},
{
"epoch": 0.09316770186335403,
"grad_norm": 5.728269100189209,
"learning_rate": 9.645834290391754e-06,
"loss": 1.4632,
"step": 3480
},
{
"epoch": 0.09343542514457057,
"grad_norm": 5.710354328155518,
"learning_rate": 9.529377574630109e-06,
"loss": 1.422,
"step": 3490
},
{
"epoch": 0.0937031484257871,
"grad_norm": 6.150035381317139,
"learning_rate": 9.413433477312272e-06,
"loss": 1.4113,
"step": 3500
},
{
"epoch": 0.09397087170700365,
"grad_norm": 6.171891689300537,
"learning_rate": 9.298006764445976e-06,
"loss": 1.4115,
"step": 3510
},
{
"epoch": 0.09423859498822018,
"grad_norm": 6.584611415863037,
"learning_rate": 9.183102180771285e-06,
"loss": 1.3631,
"step": 3520
},
{
"epoch": 0.09450631826943671,
"grad_norm": 6.219729423522949,
"learning_rate": 9.068724449565594e-06,
"loss": 1.3497,
"step": 3530
},
{
"epoch": 0.09477404155065325,
"grad_norm": 5.961699485778809,
"learning_rate": 8.954878272449433e-06,
"loss": 1.3476,
"step": 3540
},
{
"epoch": 0.09504176483186978,
"grad_norm": 6.4813385009765625,
"learning_rate": 8.841568329193249e-06,
"loss": 1.3281,
"step": 3550
},
{
"epoch": 0.09530948811308632,
"grad_norm": 5.715578079223633,
"learning_rate": 8.728799277524998e-06,
"loss": 1.3114,
"step": 3560
},
{
"epoch": 0.09557721139430285,
"grad_norm": 5.67549467086792,
"learning_rate": 8.61657575293871e-06,
"loss": 1.3119,
"step": 3570
},
{
"epoch": 0.09584493467551938,
"grad_norm": 6.634474277496338,
"learning_rate": 8.50490236850394e-06,
"loss": 1.3587,
"step": 3580
},
{
"epoch": 0.09611265795673592,
"grad_norm": 5.7471537590026855,
"learning_rate": 8.393783714676107e-06,
"loss": 1.3607,
"step": 3590
},
{
"epoch": 0.09638038123795245,
"grad_norm": 5.866701602935791,
"learning_rate": 8.283224359107863e-06,
"loss": 1.3247,
"step": 3600
},
{
"epoch": 0.09638038123795245,
"eval_loss": 1.3553622961044312,
"eval_runtime": 76.9249,
"eval_samples_per_second": 6.5,
"eval_steps_per_second": 6.5,
"step": 3600
},
{
"epoch": 0.09664810451916898,
"grad_norm": 5.779167652130127,
"learning_rate": 8.17322884646128e-06,
"loss": 1.375,
"step": 3610
},
{
"epoch": 0.09691582780038552,
"grad_norm": 6.503204345703125,
"learning_rate": 8.06380169822107e-06,
"loss": 1.3767,
"step": 3620
},
{
"epoch": 0.09718355108160205,
"grad_norm": 5.67221212387085,
"learning_rate": 7.95494741250868e-06,
"loss": 1.2996,
"step": 3630
},
{
"epoch": 0.09745127436281859,
"grad_norm": 6.475659370422363,
"learning_rate": 7.846670463897457e-06,
"loss": 1.3827,
"step": 3640
},
{
"epoch": 0.09771899764403512,
"grad_norm": 6.146843910217285,
"learning_rate": 7.738975303228659e-06,
"loss": 1.3489,
"step": 3650
},
{
"epoch": 0.09798672092525167,
"grad_norm": 6.763230323791504,
"learning_rate": 7.631866357428526e-06,
"loss": 1.3631,
"step": 3660
},
{
"epoch": 0.0982544442064682,
"grad_norm": 6.853928565979004,
"learning_rate": 7.525348029326323e-06,
"loss": 1.3683,
"step": 3670
},
{
"epoch": 0.09852216748768473,
"grad_norm": 6.183257102966309,
"learning_rate": 7.4194246974732955e-06,
"loss": 1.3744,
"step": 3680
},
{
"epoch": 0.09878989076890127,
"grad_norm": 6.155274391174316,
"learning_rate": 7.314100715962744e-06,
"loss": 1.389,
"step": 3690
},
{
"epoch": 0.0990576140501178,
"grad_norm": 6.754117012023926,
"learning_rate": 7.209380414251028e-06,
"loss": 1.3267,
"step": 3700
},
{
"epoch": 0.09932533733133433,
"grad_norm": 6.333691596984863,
"learning_rate": 7.105268096979596e-06,
"loss": 1.3774,
"step": 3710
},
{
"epoch": 0.09959306061255087,
"grad_norm": 6.452340602874756,
"learning_rate": 7.001768043798013e-06,
"loss": 1.3038,
"step": 3720
},
{
"epoch": 0.0998607838937674,
"grad_norm": 5.832094192504883,
"learning_rate": 6.898884509188095e-06,
"loss": 1.3978,
"step": 3730
},
{
"epoch": 0.10012850717498394,
"grad_norm": 5.7019476890563965,
"learning_rate": 6.796621722288977e-06,
"loss": 1.358,
"step": 3740
},
{
"epoch": 0.10039623045620047,
"grad_norm": 5.743053913116455,
"learning_rate": 6.6949838867233e-06,
"loss": 1.3567,
"step": 3750
},
{
"epoch": 0.10039623045620047,
"eval_loss": 1.3537319898605347,
"eval_runtime": 76.8454,
"eval_samples_per_second": 6.507,
"eval_steps_per_second": 6.507,
"step": 3750
},
{
"epoch": 0.100663953737417,
"grad_norm": 6.490472793579102,
"learning_rate": 6.5939751804243974e-06,
"loss": 1.361,
"step": 3760
},
{
"epoch": 0.10093167701863354,
"grad_norm": 6.32999324798584,
"learning_rate": 6.493599755464546e-06,
"loss": 1.2968,
"step": 3770
},
{
"epoch": 0.10119940029985007,
"grad_norm": 6.559702396392822,
"learning_rate": 6.3938617378843264e-06,
"loss": 1.4176,
"step": 3780
},
{
"epoch": 0.1014671235810666,
"grad_norm": 5.832455158233643,
"learning_rate": 6.294765227523008e-06,
"loss": 1.3828,
"step": 3790
},
{
"epoch": 0.10173484686228314,
"grad_norm": 6.728024005889893,
"learning_rate": 6.196314297849995e-06,
"loss": 1.3902,
"step": 3800
},
{
"epoch": 0.10200257014349969,
"grad_norm": 6.092176914215088,
"learning_rate": 6.098512995797388e-06,
"loss": 1.3587,
"step": 3810
},
{
"epoch": 0.10227029342471622,
"grad_norm": 6.502336025238037,
"learning_rate": 6.0013653415936585e-06,
"loss": 1.3619,
"step": 3820
},
{
"epoch": 0.10253801670593275,
"grad_norm": 6.602701187133789,
"learning_rate": 5.90487532859837e-06,
"loss": 1.3325,
"step": 3830
},
{
"epoch": 0.10280573998714929,
"grad_norm": 6.637482166290283,
"learning_rate": 5.809046923138031e-06,
"loss": 1.3899,
"step": 3840
},
{
"epoch": 0.10307346326836582,
"grad_norm": 5.880363941192627,
"learning_rate": 5.713884064343061e-06,
"loss": 1.3481,
"step": 3850
},
{
"epoch": 0.10334118654958235,
"grad_norm": 7.036133289337158,
"learning_rate": 5.6193906639858486e-06,
"loss": 1.3156,
"step": 3860
},
{
"epoch": 0.10360890983079889,
"grad_norm": 5.999964714050293,
"learning_rate": 5.52557060631998e-06,
"loss": 1.3756,
"step": 3870
},
{
"epoch": 0.10387663311201542,
"grad_norm": 5.966408729553223,
"learning_rate": 5.432427747920561e-06,
"loss": 1.3588,
"step": 3880
},
{
"epoch": 0.10414435639323195,
"grad_norm": 5.987645626068115,
"learning_rate": 5.339965917525687e-06,
"loss": 1.427,
"step": 3890
},
{
"epoch": 0.10441207967444849,
"grad_norm": 5.433709621429443,
"learning_rate": 5.248188915879043e-06,
"loss": 1.3687,
"step": 3900
},
{
"epoch": 0.10441207967444849,
"eval_loss": 1.350784420967102,
"eval_runtime": 76.8402,
"eval_samples_per_second": 6.507,
"eval_steps_per_second": 6.507,
"step": 3900
},
{
"epoch": 0.10467980295566502,
"grad_norm": 6.524111270904541,
"learning_rate": 5.157100515573715e-06,
"loss": 1.3006,
"step": 3910
},
{
"epoch": 0.10494752623688156,
"grad_norm": 5.474837303161621,
"learning_rate": 5.066704460897067e-06,
"loss": 1.3463,
"step": 3920
},
{
"epoch": 0.10521524951809809,
"grad_norm": 5.868412494659424,
"learning_rate": 4.977004467676848e-06,
"loss": 1.2881,
"step": 3930
},
{
"epoch": 0.10548297279931462,
"grad_norm": 5.966287136077881,
"learning_rate": 4.888004223128458e-06,
"loss": 1.3636,
"step": 3940
},
{
"epoch": 0.10575069608053116,
"grad_norm": 5.976463794708252,
"learning_rate": 4.799707385703344e-06,
"loss": 1.3411,
"step": 3950
},
{
"epoch": 0.1060184193617477,
"grad_norm": 5.5595197677612305,
"learning_rate": 4.712117584938669e-06,
"loss": 1.3114,
"step": 3960
},
{
"epoch": 0.10628614264296424,
"grad_norm": 5.7463483810424805,
"learning_rate": 4.625238421308069e-06,
"loss": 1.3472,
"step": 3970
},
{
"epoch": 0.10655386592418077,
"grad_norm": 6.120302200317383,
"learning_rate": 4.5390734660736906e-06,
"loss": 1.4384,
"step": 3980
},
{
"epoch": 0.1068215892053973,
"grad_norm": 6.155236721038818,
"learning_rate": 4.453626261139344e-06,
"loss": 1.3494,
"step": 3990
},
{
"epoch": 0.10708931248661384,
"grad_norm": 6.032073974609375,
"learning_rate": 4.368900318904957e-06,
"loss": 1.3464,
"step": 4000
},
{
"epoch": 0.10735703576783037,
"grad_norm": 6.827203750610352,
"learning_rate": 4.284899122122165e-06,
"loss": 1.3534,
"step": 4010
},
{
"epoch": 0.1076247590490469,
"grad_norm": 5.927024841308594,
"learning_rate": 4.201626123751159e-06,
"loss": 1.333,
"step": 4020
},
{
"epoch": 0.10789248233026344,
"grad_norm": 5.960188865661621,
"learning_rate": 4.1190847468187425e-06,
"loss": 1.3458,
"step": 4030
},
{
"epoch": 0.10816020561147997,
"grad_norm": 6.299499034881592,
"learning_rate": 4.037278384277628e-06,
"loss": 1.3516,
"step": 4040
},
{
"epoch": 0.1084279288926965,
"grad_norm": 6.968238353729248,
"learning_rate": 3.956210398866969e-06,
"loss": 1.369,
"step": 4050
},
{
"epoch": 0.1084279288926965,
"eval_loss": 1.348792552947998,
"eval_runtime": 76.8298,
"eval_samples_per_second": 6.508,
"eval_steps_per_second": 6.508,
"step": 4050
},
{
"epoch": 0.10869565217391304,
"grad_norm": 6.412740707397461,
"learning_rate": 3.875884122974123e-06,
"loss": 1.3756,
"step": 4060
},
{
"epoch": 0.10896337545512957,
"grad_norm": 6.571822643280029,
"learning_rate": 3.7963028584976805e-06,
"loss": 1.3773,
"step": 4070
},
{
"epoch": 0.10923109873634611,
"grad_norm": 6.47897481918335,
"learning_rate": 3.717469876711713e-06,
"loss": 1.3746,
"step": 4080
},
{
"epoch": 0.10949882201756264,
"grad_norm": 6.563449382781982,
"learning_rate": 3.6393884181313417e-06,
"loss": 1.382,
"step": 4090
},
{
"epoch": 0.10976654529877918,
"grad_norm": 6.455676078796387,
"learning_rate": 3.562061692379507e-06,
"loss": 1.3519,
"step": 4100
},
{
"epoch": 0.11003426857999572,
"grad_norm": 5.957856178283691,
"learning_rate": 3.4854928780550306e-06,
"loss": 1.3711,
"step": 4110
},
{
"epoch": 0.11030199186121226,
"grad_norm": 6.082734107971191,
"learning_rate": 3.409685122601979e-06,
"loss": 1.3038,
"step": 4120
},
{
"epoch": 0.11056971514242879,
"grad_norm": 5.809603691101074,
"learning_rate": 3.3346415421802494e-06,
"loss": 1.3587,
"step": 4130
},
{
"epoch": 0.11083743842364532,
"grad_norm": 6.081882476806641,
"learning_rate": 3.26036522153751e-06,
"loss": 1.3672,
"step": 4140
},
{
"epoch": 0.11110516170486186,
"grad_norm": 5.788993835449219,
"learning_rate": 3.186859213882386e-06,
"loss": 1.3615,
"step": 4150
},
{
"epoch": 0.11137288498607839,
"grad_norm": 5.722326755523682,
"learning_rate": 3.114126540758946e-06,
"loss": 1.2914,
"step": 4160
},
{
"epoch": 0.11164060826729492,
"grad_norm": 6.233955383300781,
"learning_rate": 3.042170191922509e-06,
"loss": 1.3286,
"step": 4170
},
{
"epoch": 0.11190833154851146,
"grad_norm": 6.276589393615723,
"learning_rate": 2.9709931252167426e-06,
"loss": 1.3943,
"step": 4180
},
{
"epoch": 0.11217605482972799,
"grad_norm": 6.818645000457764,
"learning_rate": 2.9005982664520734e-06,
"loss": 1.3535,
"step": 4190
},
{
"epoch": 0.11244377811094453,
"grad_norm": 6.53585147857666,
"learning_rate": 2.830988509285433e-06,
"loss": 1.3412,
"step": 4200
},
{
"epoch": 0.11244377811094453,
"eval_loss": 1.3470451831817627,
"eval_runtime": 76.7654,
"eval_samples_per_second": 6.513,
"eval_steps_per_second": 6.513,
"step": 4200
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.521351998649088e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
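
The trainer state above can be inspected programmatically. Below is a minimal Python sketch, not part of the checkpoint itself, that assumes a local copy of this file saved as `trainer_state.json` (hypothetical path) and uses only keys that appear above (`log_history`, `step`, `loss`, `eval_loss`, `best_metric`, `best_model_checkpoint`) to split the log into training and evaluation loss curves.

```python
# Sketch: read the Trainer state JSON and separate train vs. eval log entries.
import json

# Hypothetical local filename; adjust to wherever this checkpoint file is stored.
with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_points = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"best eval loss {state['best_metric']:.4f} at {state['best_model_checkpoint']}")
print(f"last train loss {train_points[-1][1]:.4f} at step {train_points[-1][0]}")
print(f"last eval loss  {eval_points[-1][1]:.4f} at step {eval_points[-1][0]}")
```

The resulting `(step, loss)` pairs can be plotted or tabulated to track how the evaluation loss falls from 1.5062 at step 150 to 1.3470 at step 4200, matching `best_metric` above.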