{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.28035538005923,
"eval_steps": 355,
"global_step": 710,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003948667324777887,
"grad_norm": 0.5502959489822388,
"learning_rate": 2e-05,
"loss": 1.0032,
"step": 1
},
{
"epoch": 0.0003948667324777887,
"eval_loss": 1.3028720617294312,
"eval_runtime": 63.423,
"eval_samples_per_second": 16.824,
"eval_steps_per_second": 8.42,
"step": 1
},
{
"epoch": 0.0007897334649555774,
"grad_norm": 0.5348024368286133,
"learning_rate": 4e-05,
"loss": 1.2158,
"step": 2
},
{
"epoch": 0.0011846001974333662,
"grad_norm": 0.5212297439575195,
"learning_rate": 6e-05,
"loss": 1.2107,
"step": 3
},
{
"epoch": 0.0015794669299111549,
"grad_norm": 0.5010500550270081,
"learning_rate": 8e-05,
"loss": 1.7374,
"step": 4
},
{
"epoch": 0.0019743336623889436,
"grad_norm": 0.566511869430542,
"learning_rate": 0.0001,
"loss": 1.263,
"step": 5
},
{
"epoch": 0.0023692003948667323,
"grad_norm": 0.558596134185791,
"learning_rate": 0.00012,
"loss": 1.1253,
"step": 6
},
{
"epoch": 0.002764067127344521,
"grad_norm": 0.525932788848877,
"learning_rate": 0.00014,
"loss": 1.155,
"step": 7
},
{
"epoch": 0.0031589338598223098,
"grad_norm": 0.5322596430778503,
"learning_rate": 0.00016,
"loss": 1.173,
"step": 8
},
{
"epoch": 0.003553800592300099,
"grad_norm": 0.5490784049034119,
"learning_rate": 0.00018,
"loss": 1.1944,
"step": 9
},
{
"epoch": 0.003948667324777887,
"grad_norm": 0.5888460278511047,
"learning_rate": 0.0002,
"loss": 1.1346,
"step": 10
},
{
"epoch": 0.004343534057255676,
"grad_norm": 0.5463979840278625,
"learning_rate": 0.0001999997517831015,
"loss": 1.2648,
"step": 11
},
{
"epoch": 0.004738400789733465,
"grad_norm": 0.6944459676742554,
"learning_rate": 0.00019999900713363826,
"loss": 1.1832,
"step": 12
},
{
"epoch": 0.005133267522211254,
"grad_norm": 0.675365149974823,
"learning_rate": 0.0001999977660553069,
"loss": 1.1111,
"step": 13
},
{
"epoch": 0.005528134254689042,
"grad_norm": 0.6740691065788269,
"learning_rate": 0.00019999602855426865,
"loss": 1.0508,
"step": 14
},
{
"epoch": 0.005923000987166831,
"grad_norm": 0.6092358827590942,
"learning_rate": 0.00019999379463914898,
"loss": 1.048,
"step": 15
},
{
"epoch": 0.0063178677196446195,
"grad_norm": 0.5579732656478882,
"learning_rate": 0.0001999910643210378,
"loss": 0.9692,
"step": 16
},
{
"epoch": 0.006712734452122409,
"grad_norm": 0.5955824851989746,
"learning_rate": 0.0001999878376134894,
"loss": 1.3142,
"step": 17
},
{
"epoch": 0.007107601184600198,
"grad_norm": 0.49893510341644287,
"learning_rate": 0.00019998411453252217,
"loss": 1.211,
"step": 18
},
{
"epoch": 0.007502467917077986,
"grad_norm": 0.6234785914421082,
"learning_rate": 0.0001999798950966188,
"loss": 1.0711,
"step": 19
},
{
"epoch": 0.007897334649555774,
"grad_norm": 0.5721188187599182,
"learning_rate": 0.0001999751793267259,
"loss": 0.9827,
"step": 20
},
{
"epoch": 0.008292201382033564,
"grad_norm": 0.566277801990509,
"learning_rate": 0.00019996996724625426,
"loss": 0.9768,
"step": 21
},
{
"epoch": 0.008687068114511353,
"grad_norm": 0.5721827745437622,
"learning_rate": 0.0001999642588810784,
"loss": 1.0156,
"step": 22
},
{
"epoch": 0.009081934846989142,
"grad_norm": 0.4582698941230774,
"learning_rate": 0.00019995805425953648,
"loss": 1.0947,
"step": 23
},
{
"epoch": 0.00947680157946693,
"grad_norm": 0.5519172549247742,
"learning_rate": 0.00019995135341243042,
"loss": 0.9691,
"step": 24
},
{
"epoch": 0.009871668311944718,
"grad_norm": 0.5251504182815552,
"learning_rate": 0.00019994415637302547,
"loss": 1.1051,
"step": 25
},
{
"epoch": 0.010266535044422508,
"grad_norm": 0.49342384934425354,
"learning_rate": 0.00019993646317705016,
"loss": 1.1886,
"step": 26
},
{
"epoch": 0.010661401776900297,
"grad_norm": 0.458482027053833,
"learning_rate": 0.0001999282738626961,
"loss": 0.8504,
"step": 27
},
{
"epoch": 0.011056268509378084,
"grad_norm": 0.5016602873802185,
"learning_rate": 0.00019991958847061784,
"loss": 0.8907,
"step": 28
},
{
"epoch": 0.011451135241855873,
"grad_norm": 0.5376136302947998,
"learning_rate": 0.0001999104070439326,
"loss": 1.1895,
"step": 29
},
{
"epoch": 0.011846001974333662,
"grad_norm": 0.5124022364616394,
"learning_rate": 0.00019990072962822007,
"loss": 1.2014,
"step": 30
},
{
"epoch": 0.012240868706811452,
"grad_norm": 0.4343184232711792,
"learning_rate": 0.0001998905562715222,
"loss": 1.1406,
"step": 31
},
{
"epoch": 0.012635735439289239,
"grad_norm": 0.5123298764228821,
"learning_rate": 0.00019987988702434303,
"loss": 1.1627,
"step": 32
},
{
"epoch": 0.013030602171767028,
"grad_norm": 0.48822933435440063,
"learning_rate": 0.00019986872193964827,
"loss": 1.0454,
"step": 33
},
{
"epoch": 0.013425468904244817,
"grad_norm": 0.501602053642273,
"learning_rate": 0.00019985706107286514,
"loss": 1.144,
"step": 34
},
{
"epoch": 0.013820335636722606,
"grad_norm": 0.5616022348403931,
"learning_rate": 0.00019984490448188218,
"loss": 1.1829,
"step": 35
},
{
"epoch": 0.014215202369200396,
"grad_norm": 0.5326923727989197,
"learning_rate": 0.00019983225222704878,
"loss": 0.9466,
"step": 36
},
{
"epoch": 0.014610069101678183,
"grad_norm": 0.3960028886795044,
"learning_rate": 0.000199819104371175,
"loss": 0.8513,
"step": 37
},
{
"epoch": 0.015004935834155972,
"grad_norm": 0.5138500928878784,
"learning_rate": 0.00019980546097953132,
"loss": 1.02,
"step": 38
},
{
"epoch": 0.015399802566633761,
"grad_norm": 0.47606444358825684,
"learning_rate": 0.00019979132211984805,
"loss": 1.0354,
"step": 39
},
{
"epoch": 0.01579466929911155,
"grad_norm": 0.4528456926345825,
"learning_rate": 0.00019977668786231534,
"loss": 0.8414,
"step": 40
},
{
"epoch": 0.01618953603158934,
"grad_norm": 0.5217841863632202,
"learning_rate": 0.00019976155827958252,
"loss": 1.2372,
"step": 41
},
{
"epoch": 0.016584402764067127,
"grad_norm": 0.511803925037384,
"learning_rate": 0.000199745933446758,
"loss": 0.8319,
"step": 42
},
{
"epoch": 0.016979269496544915,
"grad_norm": 0.5257598757743835,
"learning_rate": 0.00019972981344140874,
"loss": 0.9624,
"step": 43
},
{
"epoch": 0.017374136229022705,
"grad_norm": 0.5112103819847107,
"learning_rate": 0.00019971319834355983,
"loss": 1.123,
"step": 44
},
{
"epoch": 0.017769002961500493,
"grad_norm": 0.5637938976287842,
"learning_rate": 0.00019969608823569433,
"loss": 1.2229,
"step": 45
},
{
"epoch": 0.018163869693978284,
"grad_norm": 0.46877309679985046,
"learning_rate": 0.0001996784832027525,
"loss": 0.9595,
"step": 46
},
{
"epoch": 0.01855873642645607,
"grad_norm": 0.45058727264404297,
"learning_rate": 0.00019966038333213177,
"loss": 1.0821,
"step": 47
},
{
"epoch": 0.01895360315893386,
"grad_norm": 0.4382037818431854,
"learning_rate": 0.00019964178871368594,
"loss": 0.9997,
"step": 48
},
{
"epoch": 0.01934846989141165,
"grad_norm": 0.44654494524002075,
"learning_rate": 0.000199622699439725,
"loss": 1.0338,
"step": 49
},
{
"epoch": 0.019743336623889437,
"grad_norm": 0.5067604184150696,
"learning_rate": 0.00019960311560501454,
"loss": 1.1121,
"step": 50
},
{
"epoch": 0.020138203356367228,
"grad_norm": 0.4290701746940613,
"learning_rate": 0.0001995830373067754,
"loss": 1.0287,
"step": 51
},
{
"epoch": 0.020533070088845015,
"grad_norm": 0.5244402885437012,
"learning_rate": 0.00019956246464468294,
"loss": 1.1912,
"step": 52
},
{
"epoch": 0.020927936821322803,
"grad_norm": 0.4742845892906189,
"learning_rate": 0.0001995413977208669,
"loss": 1.0139,
"step": 53
},
{
"epoch": 0.021322803553800593,
"grad_norm": 0.47367361187934875,
"learning_rate": 0.00019951983663991056,
"loss": 1.0935,
"step": 54
},
{
"epoch": 0.02171767028627838,
"grad_norm": 0.4405689835548401,
"learning_rate": 0.00019949778150885042,
"loss": 1.2933,
"step": 55
},
{
"epoch": 0.02211253701875617,
"grad_norm": 0.39893534779548645,
"learning_rate": 0.0001994752324371756,
"loss": 0.83,
"step": 56
},
{
"epoch": 0.02250740375123396,
"grad_norm": 0.4917256832122803,
"learning_rate": 0.00019945218953682734,
"loss": 1.0927,
"step": 57
},
{
"epoch": 0.022902270483711747,
"grad_norm": 0.453832745552063,
"learning_rate": 0.00019942865292219838,
"loss": 0.964,
"step": 58
},
{
"epoch": 0.023297137216189538,
"grad_norm": 0.4970617890357971,
"learning_rate": 0.00019940462271013238,
"loss": 1.0414,
"step": 59
},
{
"epoch": 0.023692003948667325,
"grad_norm": 0.5314046144485474,
"learning_rate": 0.0001993800990199235,
"loss": 1.2126,
"step": 60
},
{
"epoch": 0.024086870681145112,
"grad_norm": 0.5307350754737854,
"learning_rate": 0.00019935508197331555,
"loss": 1.0771,
"step": 61
},
{
"epoch": 0.024481737413622903,
"grad_norm": 0.495712548494339,
"learning_rate": 0.0001993295716945017,
"loss": 1.0686,
"step": 62
},
{
"epoch": 0.02487660414610069,
"grad_norm": 0.5226410627365112,
"learning_rate": 0.00019930356831012353,
"loss": 1.0349,
"step": 63
},
{
"epoch": 0.025271470878578478,
"grad_norm": 0.4592258334159851,
"learning_rate": 0.00019927707194927066,
"loss": 0.9929,
"step": 64
},
{
"epoch": 0.02566633761105627,
"grad_norm": 0.486198753118515,
"learning_rate": 0.00019925008274347995,
"loss": 1.0707,
"step": 65
},
{
"epoch": 0.026061204343534056,
"grad_norm": 0.5763838291168213,
"learning_rate": 0.00019922260082673497,
"loss": 0.8451,
"step": 66
},
{
"epoch": 0.026456071076011847,
"grad_norm": 0.5000368356704712,
"learning_rate": 0.00019919462633546519,
"loss": 0.8953,
"step": 67
},
{
"epoch": 0.026850937808489635,
"grad_norm": 0.5193626284599304,
"learning_rate": 0.0001991661594085455,
"loss": 0.8959,
"step": 68
},
{
"epoch": 0.027245804540967422,
"grad_norm": 0.5146979689598083,
"learning_rate": 0.00019913720018729532,
"loss": 1.1425,
"step": 69
},
{
"epoch": 0.027640671273445213,
"grad_norm": 0.4941859543323517,
"learning_rate": 0.000199107748815478,
"loss": 1.1127,
"step": 70
},
{
"epoch": 0.028035538005923,
"grad_norm": 0.5475146770477295,
"learning_rate": 0.00019907780543930014,
"loss": 1.0152,
"step": 71
},
{
"epoch": 0.02843040473840079,
"grad_norm": 0.6136332154273987,
"learning_rate": 0.00019904737020741075,
"loss": 1.1167,
"step": 72
},
{
"epoch": 0.02882527147087858,
"grad_norm": 0.531680703163147,
"learning_rate": 0.00019901644327090064,
"loss": 1.0022,
"step": 73
},
{
"epoch": 0.029220138203356366,
"grad_norm": 0.5442494750022888,
"learning_rate": 0.00019898502478330152,
"loss": 0.8863,
"step": 74
},
{
"epoch": 0.029615004935834157,
"grad_norm": 0.4275268018245697,
"learning_rate": 0.00019895311490058542,
"loss": 0.845,
"step": 75
},
{
"epoch": 0.030009871668311944,
"grad_norm": 0.5767403841018677,
"learning_rate": 0.00019892071378116376,
"loss": 1.13,
"step": 76
},
{
"epoch": 0.030404738400789732,
"grad_norm": 0.5110602974891663,
"learning_rate": 0.00019888782158588667,
"loss": 1.0784,
"step": 77
},
{
"epoch": 0.030799605133267523,
"grad_norm": 0.5388869643211365,
"learning_rate": 0.00019885443847804211,
"loss": 1.0197,
"step": 78
},
{
"epoch": 0.03119447186574531,
"grad_norm": 0.5769171118736267,
"learning_rate": 0.00019882056462335512,
"loss": 0.8093,
"step": 79
},
{
"epoch": 0.0315893385982231,
"grad_norm": 0.3854546546936035,
"learning_rate": 0.00019878620018998696,
"loss": 0.7723,
"step": 80
},
{
"epoch": 0.031984205330700885,
"grad_norm": 0.4594631791114807,
"learning_rate": 0.00019875134534853427,
"loss": 0.978,
"step": 81
},
{
"epoch": 0.03237907206317868,
"grad_norm": 0.5577380061149597,
"learning_rate": 0.0001987160002720283,
"loss": 1.0064,
"step": 82
},
{
"epoch": 0.03277393879565647,
"grad_norm": 0.4823514223098755,
"learning_rate": 0.00019868016513593391,
"loss": 0.9228,
"step": 83
},
{
"epoch": 0.033168805528134254,
"grad_norm": 0.5669511556625366,
"learning_rate": 0.0001986438401181489,
"loss": 1.2223,
"step": 84
},
{
"epoch": 0.03356367226061204,
"grad_norm": 0.48681461811065674,
"learning_rate": 0.00019860702539900287,
"loss": 1.0993,
"step": 85
},
{
"epoch": 0.03395853899308983,
"grad_norm": 0.47141095995903015,
"learning_rate": 0.00019856972116125653,
"loss": 1.1599,
"step": 86
},
{
"epoch": 0.03435340572556762,
"grad_norm": 0.5382753610610962,
"learning_rate": 0.00019853192759010076,
"loss": 1.1186,
"step": 87
},
{
"epoch": 0.03474827245804541,
"grad_norm": 0.592026948928833,
"learning_rate": 0.00019849364487315558,
"loss": 1.0947,
"step": 88
},
{
"epoch": 0.0351431391905232,
"grad_norm": 0.42034783959388733,
"learning_rate": 0.00019845487320046935,
"loss": 0.9649,
"step": 89
},
{
"epoch": 0.035538005923000986,
"grad_norm": 0.4590117633342743,
"learning_rate": 0.0001984156127645178,
"loss": 0.997,
"step": 90
},
{
"epoch": 0.03593287265547877,
"grad_norm": 0.5288587212562561,
"learning_rate": 0.00019837586376020294,
"loss": 1.2129,
"step": 91
},
{
"epoch": 0.03632773938795657,
"grad_norm": 0.4397427439689636,
"learning_rate": 0.0001983356263848523,
"loss": 0.9819,
"step": 92
},
{
"epoch": 0.036722606120434355,
"grad_norm": 0.4406636357307434,
"learning_rate": 0.00019829490083821778,
"loss": 1.074,
"step": 93
},
{
"epoch": 0.03711747285291214,
"grad_norm": 0.4988841116428375,
"learning_rate": 0.0001982536873224748,
"loss": 0.9614,
"step": 94
},
{
"epoch": 0.03751233958538993,
"grad_norm": 0.4320489466190338,
"learning_rate": 0.00019821198604222113,
"loss": 0.9872,
"step": 95
},
{
"epoch": 0.03790720631786772,
"grad_norm": 0.4227694272994995,
"learning_rate": 0.0001981697972044761,
"loss": 1.0866,
"step": 96
},
{
"epoch": 0.03830207305034551,
"grad_norm": 0.449147492647171,
"learning_rate": 0.00019812712101867922,
"loss": 1.0443,
"step": 97
},
{
"epoch": 0.0386969397828233,
"grad_norm": 0.7097704410552979,
"learning_rate": 0.00019808395769668963,
"loss": 0.9615,
"step": 98
},
{
"epoch": 0.039091806515301086,
"grad_norm": 0.4379878640174866,
"learning_rate": 0.0001980403074527846,
"loss": 1.0132,
"step": 99
},
{
"epoch": 0.039486673247778874,
"grad_norm": 0.47281649708747864,
"learning_rate": 0.0001979961705036587,
"loss": 0.9845,
"step": 100
},
{
"epoch": 0.03988153998025666,
"grad_norm": 0.424258291721344,
"learning_rate": 0.00019795154706842266,
"loss": 1.0192,
"step": 101
},
{
"epoch": 0.040276406712734455,
"grad_norm": 0.5341431498527527,
"learning_rate": 0.00019790643736860227,
"loss": 0.9863,
"step": 102
},
{
"epoch": 0.04067127344521224,
"grad_norm": 0.5369569659233093,
"learning_rate": 0.00019786084162813733,
"loss": 1.0572,
"step": 103
},
{
"epoch": 0.04106614017769003,
"grad_norm": 0.5228739380836487,
"learning_rate": 0.00019781476007338058,
"loss": 0.8181,
"step": 104
},
{
"epoch": 0.04146100691016782,
"grad_norm": 0.48687776923179626,
"learning_rate": 0.00019776819293309633,
"loss": 1.0303,
"step": 105
},
{
"epoch": 0.041855873642645605,
"grad_norm": 0.6160632967948914,
"learning_rate": 0.00019772114043845965,
"loss": 0.9719,
"step": 106
},
{
"epoch": 0.04225074037512339,
"grad_norm": 0.42757055163383484,
"learning_rate": 0.00019767360282305508,
"loss": 0.8688,
"step": 107
},
{
"epoch": 0.04264560710760119,
"grad_norm": 0.5323183536529541,
"learning_rate": 0.0001976255803228753,
"loss": 1.0611,
"step": 108
},
{
"epoch": 0.043040473840078974,
"grad_norm": 0.45643237233161926,
"learning_rate": 0.00019757707317632028,
"loss": 0.9245,
"step": 109
},
{
"epoch": 0.04343534057255676,
"grad_norm": 0.51936936378479,
"learning_rate": 0.0001975280816241959,
"loss": 1.1352,
"step": 110
},
{
"epoch": 0.04383020730503455,
"grad_norm": 0.49822020530700684,
"learning_rate": 0.0001974786059097128,
"loss": 0.9481,
"step": 111
},
{
"epoch": 0.04422507403751234,
"grad_norm": 0.5198648571968079,
"learning_rate": 0.0001974286462784851,
"loss": 1.1848,
"step": 112
},
{
"epoch": 0.04461994076999013,
"grad_norm": 0.9258118271827698,
"learning_rate": 0.0001973782029785293,
"loss": 1.1156,
"step": 113
},
{
"epoch": 0.04501480750246792,
"grad_norm": 0.5064953565597534,
"learning_rate": 0.00019732727626026305,
"loss": 0.9965,
"step": 114
},
{
"epoch": 0.045409674234945706,
"grad_norm": 0.4941990375518799,
"learning_rate": 0.00019727586637650373,
"loss": 1.1318,
"step": 115
},
{
"epoch": 0.04580454096742349,
"grad_norm": 0.61434006690979,
"learning_rate": 0.0001972239735824674,
"loss": 1.0637,
"step": 116
},
{
"epoch": 0.04619940769990128,
"grad_norm": 0.53554368019104,
"learning_rate": 0.0001971715981357674,
"loss": 0.8824,
"step": 117
},
{
"epoch": 0.046594274432379075,
"grad_norm": 0.505577802658081,
"learning_rate": 0.0001971187402964132,
"loss": 0.9145,
"step": 118
},
{
"epoch": 0.04698914116485686,
"grad_norm": 0.557715654373169,
"learning_rate": 0.00019706540032680893,
"loss": 0.9495,
"step": 119
},
{
"epoch": 0.04738400789733465,
"grad_norm": 0.5071070194244385,
"learning_rate": 0.00019701157849175228,
"loss": 0.9492,
"step": 120
},
{
"epoch": 0.04777887462981244,
"grad_norm": 0.4534873068332672,
"learning_rate": 0.00019695727505843297,
"loss": 1.1968,
"step": 121
},
{
"epoch": 0.048173741362290225,
"grad_norm": 0.46414080262184143,
"learning_rate": 0.00019690249029643162,
"loss": 0.8883,
"step": 122
},
{
"epoch": 0.04856860809476802,
"grad_norm": 0.43917688727378845,
"learning_rate": 0.00019684722447771834,
"loss": 1.0213,
"step": 123
},
{
"epoch": 0.048963474827245806,
"grad_norm": 0.46896979212760925,
"learning_rate": 0.00019679147787665126,
"loss": 0.8508,
"step": 124
},
{
"epoch": 0.049358341559723594,
"grad_norm": 0.457086443901062,
"learning_rate": 0.0001967352507699754,
"loss": 1.1217,
"step": 125
},
{
"epoch": 0.04975320829220138,
"grad_norm": 0.44028928875923157,
"learning_rate": 0.0001966785434368211,
"loss": 1.0044,
"step": 126
},
{
"epoch": 0.05014807502467917,
"grad_norm": 0.47712892293930054,
"learning_rate": 0.00019662135615870275,
"loss": 0.993,
"step": 127
},
{
"epoch": 0.050542941757156956,
"grad_norm": 0.5953882932662964,
"learning_rate": 0.00019656368921951734,
"loss": 1.2092,
"step": 128
},
{
"epoch": 0.05093780848963475,
"grad_norm": 0.4725169837474823,
"learning_rate": 0.00019650554290554298,
"loss": 0.8518,
"step": 129
},
{
"epoch": 0.05133267522211254,
"grad_norm": 0.4115935266017914,
"learning_rate": 0.00019644691750543767,
"loss": 0.86,
"step": 130
},
{
"epoch": 0.051727541954590325,
"grad_norm": 0.5736876726150513,
"learning_rate": 0.0001963878133102377,
"loss": 0.8093,
"step": 131
},
{
"epoch": 0.05212240868706811,
"grad_norm": 0.4820341169834137,
"learning_rate": 0.00019632823061335627,
"loss": 1.1891,
"step": 132
},
{
"epoch": 0.0525172754195459,
"grad_norm": 0.43938153982162476,
"learning_rate": 0.00019626816971058205,
"loss": 0.7104,
"step": 133
},
{
"epoch": 0.052912142152023695,
"grad_norm": 0.4602479040622711,
"learning_rate": 0.00019620763090007762,
"loss": 0.906,
"step": 134
},
{
"epoch": 0.05330700888450148,
"grad_norm": 0.567136824131012,
"learning_rate": 0.0001961466144823781,
"loss": 0.9195,
"step": 135
},
{
"epoch": 0.05370187561697927,
"grad_norm": 0.4483197331428528,
"learning_rate": 0.00019608512076038962,
"loss": 0.913,
"step": 136
},
{
"epoch": 0.05409674234945706,
"grad_norm": 0.42070272564888,
"learning_rate": 0.00019602315003938782,
"loss": 1.1745,
"step": 137
},
{
"epoch": 0.054491609081934844,
"grad_norm": 0.5072475671768188,
"learning_rate": 0.00019596070262701626,
"loss": 0.9904,
"step": 138
},
{
"epoch": 0.05488647581441264,
"grad_norm": 0.47331702709198,
"learning_rate": 0.00019589777883328505,
"loss": 1.1526,
"step": 139
},
{
"epoch": 0.055281342546890426,
"grad_norm": 0.5382581949234009,
"learning_rate": 0.00019583437897056915,
"loss": 0.866,
"step": 140
},
{
"epoch": 0.05567620927936821,
"grad_norm": 0.41280797123908997,
"learning_rate": 0.0001957705033536069,
"loss": 0.8771,
"step": 141
},
{
"epoch": 0.056071076011846,
"grad_norm": 0.4384588301181793,
"learning_rate": 0.00019570615229949842,
"loss": 1.1457,
"step": 142
},
{
"epoch": 0.05646594274432379,
"grad_norm": 0.3916715681552887,
"learning_rate": 0.00019564132612770414,
"loss": 0.832,
"step": 143
},
{
"epoch": 0.05686080947680158,
"grad_norm": 0.5212041139602661,
"learning_rate": 0.00019557602516004306,
"loss": 0.9689,
"step": 144
},
{
"epoch": 0.05725567620927937,
"grad_norm": 0.5447223782539368,
"learning_rate": 0.00019551024972069126,
"loss": 1.2021,
"step": 145
},
{
"epoch": 0.05765054294175716,
"grad_norm": 0.5747576355934143,
"learning_rate": 0.00019544400013618023,
"loss": 1.0035,
"step": 146
},
{
"epoch": 0.058045409674234945,
"grad_norm": 0.48325222730636597,
"learning_rate": 0.00019537727673539536,
"loss": 1.0858,
"step": 147
},
{
"epoch": 0.05844027640671273,
"grad_norm": 0.5126092433929443,
"learning_rate": 0.00019531007984957408,
"loss": 0.8908,
"step": 148
},
{
"epoch": 0.05883514313919053,
"grad_norm": 0.4576544761657715,
"learning_rate": 0.0001952424098123045,
"loss": 1.0389,
"step": 149
},
{
"epoch": 0.059230009871668314,
"grad_norm": 0.43132367730140686,
"learning_rate": 0.00019517426695952358,
"loss": 0.8228,
"step": 150
},
{
"epoch": 0.0596248766041461,
"grad_norm": 0.44958174228668213,
"learning_rate": 0.00019510565162951537,
"loss": 1.0578,
"step": 151
},
{
"epoch": 0.06001974333662389,
"grad_norm": 0.4115462899208069,
"learning_rate": 0.00019503656416290963,
"loss": 1.1849,
"step": 152
},
{
"epoch": 0.060414610069101676,
"grad_norm": 0.5049015879631042,
"learning_rate": 0.0001949670049026799,
"loss": 1.0552,
"step": 153
},
{
"epoch": 0.060809476801579464,
"grad_norm": 0.48052188754081726,
"learning_rate": 0.00019489697419414182,
"loss": 0.9705,
"step": 154
},
{
"epoch": 0.06120434353405726,
"grad_norm": 0.5106899738311768,
"learning_rate": 0.00019482647238495152,
"loss": 0.9298,
"step": 155
},
{
"epoch": 0.061599210266535045,
"grad_norm": 0.49583542346954346,
"learning_rate": 0.00019475549982510382,
"loss": 1.1084,
"step": 156
},
{
"epoch": 0.06199407699901283,
"grad_norm": 0.5219290256500244,
"learning_rate": 0.00019468405686693044,
"loss": 1.0565,
"step": 157
},
{
"epoch": 0.06238894373149062,
"grad_norm": 0.5849049687385559,
"learning_rate": 0.00019461214386509842,
"loss": 0.9961,
"step": 158
},
{
"epoch": 0.06278381046396841,
"grad_norm": 0.5726532936096191,
"learning_rate": 0.00019453976117660818,
"loss": 1.0036,
"step": 159
},
{
"epoch": 0.0631786771964462,
"grad_norm": 0.5158294439315796,
"learning_rate": 0.0001944669091607919,
"loss": 1.0737,
"step": 160
},
{
"epoch": 0.06357354392892399,
"grad_norm": 0.48837384581565857,
"learning_rate": 0.00019439358817931152,
"loss": 1.0415,
"step": 161
},
{
"epoch": 0.06396841066140177,
"grad_norm": 0.5121546387672424,
"learning_rate": 0.00019431979859615726,
"loss": 1.1167,
"step": 162
},
{
"epoch": 0.06436327739387956,
"grad_norm": 0.6468896269798279,
"learning_rate": 0.00019424554077764546,
"loss": 0.944,
"step": 163
},
{
"epoch": 0.06475814412635736,
"grad_norm": 0.4869466722011566,
"learning_rate": 0.00019417081509241714,
"loss": 1.0122,
"step": 164
},
{
"epoch": 0.06515301085883514,
"grad_norm": 0.43559664487838745,
"learning_rate": 0.00019409562191143577,
"loss": 1.101,
"step": 165
},
{
"epoch": 0.06554787759131293,
"grad_norm": 0.46581968665122986,
"learning_rate": 0.00019401996160798573,
"loss": 0.9572,
"step": 166
},
{
"epoch": 0.06594274432379071,
"grad_norm": 0.5070847868919373,
"learning_rate": 0.00019394383455767034,
"loss": 1.0415,
"step": 167
},
{
"epoch": 0.06633761105626851,
"grad_norm": 0.427746057510376,
"learning_rate": 0.00019386724113841,
"loss": 0.9545,
"step": 168
},
{
"epoch": 0.0667324777887463,
"grad_norm": 0.44796499609947205,
"learning_rate": 0.00019379018173044037,
"loss": 0.9902,
"step": 169
},
{
"epoch": 0.06712734452122408,
"grad_norm": 0.46470651030540466,
"learning_rate": 0.00019371265671631037,
"loss": 0.8932,
"step": 170
},
{
"epoch": 0.06752221125370188,
"grad_norm": 0.524315595626831,
"learning_rate": 0.00019363466648088034,
"loss": 1.1012,
"step": 171
},
{
"epoch": 0.06791707798617966,
"grad_norm": 0.39339789748191833,
"learning_rate": 0.0001935562114113202,
"loss": 0.9966,
"step": 172
},
{
"epoch": 0.06831194471865745,
"grad_norm": 0.4902956783771515,
"learning_rate": 0.00019347729189710743,
"loss": 1.0936,
"step": 173
},
{
"epoch": 0.06870681145113525,
"grad_norm": 0.44631102681159973,
"learning_rate": 0.00019339790833002515,
"loss": 1.0011,
"step": 174
},
{
"epoch": 0.06910167818361303,
"grad_norm": 0.48202449083328247,
"learning_rate": 0.00019331806110416027,
"loss": 0.9386,
"step": 175
},
{
"epoch": 0.06949654491609082,
"grad_norm": 0.5927445292472839,
"learning_rate": 0.00019323775061590135,
"loss": 0.919,
"step": 176
},
{
"epoch": 0.0698914116485686,
"grad_norm": 0.5132244229316711,
"learning_rate": 0.0001931569772639368,
"loss": 0.8995,
"step": 177
},
{
"epoch": 0.0702862783810464,
"grad_norm": 0.3917968273162842,
"learning_rate": 0.00019307574144925287,
"loss": 0.9831,
"step": 178
},
{
"epoch": 0.07068114511352419,
"grad_norm": 0.4451232850551605,
"learning_rate": 0.00019299404357513158,
"loss": 1.0076,
"step": 179
},
{
"epoch": 0.07107601184600197,
"grad_norm": 0.462495893239975,
"learning_rate": 0.00019291188404714878,
"loss": 1.1291,
"step": 180
},
{
"epoch": 0.07147087857847977,
"grad_norm": 0.4120655953884125,
"learning_rate": 0.0001928292632731721,
"loss": 0.8429,
"step": 181
},
{
"epoch": 0.07186574531095755,
"grad_norm": 0.5940248370170593,
"learning_rate": 0.00019274618166335912,
"loss": 1.078,
"step": 182
},
{
"epoch": 0.07226061204343534,
"grad_norm": 0.46714073419570923,
"learning_rate": 0.00019266263963015488,
"loss": 0.9423,
"step": 183
},
{
"epoch": 0.07265547877591313,
"grad_norm": 0.41042840480804443,
"learning_rate": 0.00019257863758829035,
"loss": 0.8366,
"step": 184
},
{
"epoch": 0.07305034550839092,
"grad_norm": 0.5920013785362244,
"learning_rate": 0.00019249417595478002,
"loss": 1.084,
"step": 185
},
{
"epoch": 0.07344521224086871,
"grad_norm": 0.5364437699317932,
"learning_rate": 0.00019240925514892,
"loss": 1.0667,
"step": 186
},
{
"epoch": 0.07384007897334649,
"grad_norm": 0.5053632855415344,
"learning_rate": 0.00019232387559228587,
"loss": 0.9369,
"step": 187
},
{
"epoch": 0.07423494570582428,
"grad_norm": 0.4612145721912384,
"learning_rate": 0.0001922380377087306,
"loss": 1.0796,
"step": 188
},
{
"epoch": 0.07462981243830208,
"grad_norm": 0.5499488711357117,
"learning_rate": 0.00019215174192438247,
"loss": 1.2071,
"step": 189
},
{
"epoch": 0.07502467917077986,
"grad_norm": 0.5043527483940125,
"learning_rate": 0.00019206498866764288,
"loss": 1.111,
"step": 190
},
{
"epoch": 0.07541954590325765,
"grad_norm": 0.557327926158905,
"learning_rate": 0.00019197777836918437,
"loss": 1.0387,
"step": 191
},
{
"epoch": 0.07581441263573543,
"grad_norm": 0.5045757293701172,
"learning_rate": 0.0001918901114619483,
"loss": 1.02,
"step": 192
},
{
"epoch": 0.07620927936821323,
"grad_norm": 0.5515264868736267,
"learning_rate": 0.00019180198838114282,
"loss": 0.8964,
"step": 193
},
{
"epoch": 0.07660414610069102,
"grad_norm": 0.4991328716278076,
"learning_rate": 0.00019171340956424074,
"loss": 0.9792,
"step": 194
},
{
"epoch": 0.0769990128331688,
"grad_norm": 0.3965533971786499,
"learning_rate": 0.00019162437545097719,
"loss": 0.9692,
"step": 195
},
{
"epoch": 0.0773938795656466,
"grad_norm": 0.49761366844177246,
"learning_rate": 0.0001915348864833476,
"loss": 0.9203,
"step": 196
},
{
"epoch": 0.07778874629812438,
"grad_norm": 0.4468221962451935,
"learning_rate": 0.00019144494310560544,
"loss": 0.8878,
"step": 197
},
{
"epoch": 0.07818361303060217,
"grad_norm": 0.4550800025463104,
"learning_rate": 0.0001913545457642601,
"loss": 0.8622,
"step": 198
},
{
"epoch": 0.07857847976307997,
"grad_norm": 0.4845464825630188,
"learning_rate": 0.00019126369490807447,
"loss": 0.9628,
"step": 199
},
{
"epoch": 0.07897334649555775,
"grad_norm": 0.4576549232006073,
"learning_rate": 0.00019117239098806295,
"loss": 1.0311,
"step": 200
},
{
"epoch": 0.07936821322803554,
"grad_norm": 0.5760570764541626,
"learning_rate": 0.00019108063445748904,
"loss": 1.1729,
"step": 201
},
{
"epoch": 0.07976307996051332,
"grad_norm": 0.6804947257041931,
"learning_rate": 0.00019098842577186314,
"loss": 0.8829,
"step": 202
},
{
"epoch": 0.08015794669299112,
"grad_norm": 0.4532022476196289,
"learning_rate": 0.00019089576538894036,
"loss": 1.0826,
"step": 203
},
{
"epoch": 0.08055281342546891,
"grad_norm": 0.5325609445571899,
"learning_rate": 0.00019080265376871815,
"loss": 1.0122,
"step": 204
},
{
"epoch": 0.08094768015794669,
"grad_norm": 0.46676936745643616,
"learning_rate": 0.00019070909137343408,
"loss": 1.0449,
"step": 205
},
{
"epoch": 0.08134254689042449,
"grad_norm": 0.4907085597515106,
"learning_rate": 0.00019061507866756347,
"loss": 0.9381,
"step": 206
},
{
"epoch": 0.08173741362290227,
"grad_norm": 0.5306389927864075,
"learning_rate": 0.0001905206161178172,
"loss": 1.0909,
"step": 207
},
{
"epoch": 0.08213228035538006,
"grad_norm": 0.48803043365478516,
"learning_rate": 0.00019042570419313925,
"loss": 1.0608,
"step": 208
},
{
"epoch": 0.08252714708785784,
"grad_norm": 0.4253416061401367,
"learning_rate": 0.0001903303433647045,
"loss": 1.0037,
"step": 209
},
{
"epoch": 0.08292201382033564,
"grad_norm": 0.5864118337631226,
"learning_rate": 0.00019023453410591635,
"loss": 0.8971,
"step": 210
},
{
"epoch": 0.08331688055281343,
"grad_norm": 0.5264946222305298,
"learning_rate": 0.00019013827689240436,
"loss": 1.0425,
"step": 211
},
{
"epoch": 0.08371174728529121,
"grad_norm": 0.5967885255813599,
"learning_rate": 0.00019004157220202185,
"loss": 0.8929,
"step": 212
},
{
"epoch": 0.084106614017769,
"grad_norm": 0.5387664437294006,
"learning_rate": 0.00018994442051484356,
"loss": 0.9526,
"step": 213
},
{
"epoch": 0.08450148075024679,
"grad_norm": 0.4813781976699829,
"learning_rate": 0.00018984682231316333,
"loss": 0.9496,
"step": 214
},
{
"epoch": 0.08489634748272458,
"grad_norm": 0.513137698173523,
"learning_rate": 0.0001897487780814916,
"loss": 0.9249,
"step": 215
},
{
"epoch": 0.08529121421520237,
"grad_norm": 0.585263192653656,
"learning_rate": 0.00018965028830655309,
"loss": 1.123,
"step": 216
},
{
"epoch": 0.08568608094768015,
"grad_norm": 0.45392006635665894,
"learning_rate": 0.00018955135347728432,
"loss": 0.9507,
"step": 217
},
{
"epoch": 0.08608094768015795,
"grad_norm": 0.7591621279716492,
"learning_rate": 0.00018945197408483123,
"loss": 0.8529,
"step": 218
},
{
"epoch": 0.08647581441263573,
"grad_norm": 0.49809765815734863,
"learning_rate": 0.0001893521506225467,
"loss": 1.0025,
"step": 219
},
{
"epoch": 0.08687068114511352,
"grad_norm": 0.4396488666534424,
"learning_rate": 0.00018925188358598813,
"loss": 0.9019,
"step": 220
},
{
"epoch": 0.08726554787759132,
"grad_norm": 0.5315730571746826,
"learning_rate": 0.000189151173472915,
"loss": 1.0926,
"step": 221
},
{
"epoch": 0.0876604146100691,
"grad_norm": 0.5290179252624512,
"learning_rate": 0.00018905002078328632,
"loss": 1.1491,
"step": 222
},
{
"epoch": 0.08805528134254689,
"grad_norm": 0.41243505477905273,
"learning_rate": 0.0001889484260192582,
"loss": 0.7891,
"step": 223
},
{
"epoch": 0.08845014807502467,
"grad_norm": 0.4659099280834198,
"learning_rate": 0.0001888463896851815,
"loss": 1.0976,
"step": 224
},
{
"epoch": 0.08884501480750247,
"grad_norm": 0.5485631823539734,
"learning_rate": 0.00018874391228759893,
"loss": 0.8974,
"step": 225
},
{
"epoch": 0.08923988153998026,
"grad_norm": 0.5084119439125061,
"learning_rate": 0.000188640994335243,
"loss": 1.0882,
"step": 226
},
{
"epoch": 0.08963474827245804,
"grad_norm": 0.4861229956150055,
"learning_rate": 0.0001885376363390332,
"loss": 1.0262,
"step": 227
},
{
"epoch": 0.09002961500493584,
"grad_norm": 0.5560560822486877,
"learning_rate": 0.00018843383881207357,
"loss": 0.9285,
"step": 228
},
{
"epoch": 0.09042448173741362,
"grad_norm": 0.5360885858535767,
"learning_rate": 0.00018832960226965008,
"loss": 1.06,
"step": 229
},
{
"epoch": 0.09081934846989141,
"grad_norm": 0.43957361578941345,
"learning_rate": 0.0001882249272292282,
"loss": 0.9748,
"step": 230
},
{
"epoch": 0.0912142152023692,
"grad_norm": 0.4349263906478882,
"learning_rate": 0.00018811981421045014,
"loss": 0.9549,
"step": 231
},
{
"epoch": 0.09160908193484699,
"grad_norm": 0.44644349813461304,
"learning_rate": 0.0001880142637351325,
"loss": 0.8965,
"step": 232
},
{
"epoch": 0.09200394866732478,
"grad_norm": 0.482573539018631,
"learning_rate": 0.0001879082763272635,
"loss": 0.9798,
"step": 233
},
{
"epoch": 0.09239881539980256,
"grad_norm": 0.4720049798488617,
"learning_rate": 0.00018780185251300046,
"loss": 0.8558,
"step": 234
},
{
"epoch": 0.09279368213228036,
"grad_norm": 0.4637092053890228,
"learning_rate": 0.00018769499282066717,
"loss": 1.02,
"step": 235
},
{
"epoch": 0.09318854886475815,
"grad_norm": 0.42427390813827515,
"learning_rate": 0.00018758769778075122,
"loss": 0.992,
"step": 236
},
{
"epoch": 0.09358341559723593,
"grad_norm": 0.5138596892356873,
"learning_rate": 0.00018747996792590148,
"loss": 0.7596,
"step": 237
},
{
"epoch": 0.09397828232971372,
"grad_norm": 0.4327022433280945,
"learning_rate": 0.00018737180379092537,
"loss": 1.1874,
"step": 238
},
{
"epoch": 0.0943731490621915,
"grad_norm": 0.43842098116874695,
"learning_rate": 0.00018726320591278616,
"loss": 1.1122,
"step": 239
},
{
"epoch": 0.0947680157946693,
"grad_norm": 0.4516022205352783,
"learning_rate": 0.0001871541748306005,
"loss": 0.8585,
"step": 240
},
{
"epoch": 0.0951628825271471,
"grad_norm": 0.48878902196884155,
"learning_rate": 0.00018704471108563548,
"loss": 1.0806,
"step": 241
},
{
"epoch": 0.09555774925962487,
"grad_norm": 0.5217946767807007,
"learning_rate": 0.0001869348152213061,
"loss": 0.9486,
"step": 242
},
{
"epoch": 0.09595261599210267,
"grad_norm": 0.5032205581665039,
"learning_rate": 0.00018682448778317262,
"loss": 0.8841,
"step": 243
},
{
"epoch": 0.09634748272458045,
"grad_norm": 0.5018641948699951,
"learning_rate": 0.00018671372931893773,
"loss": 0.7681,
"step": 244
},
{
"epoch": 0.09674234945705824,
"grad_norm": 0.4859049618244171,
"learning_rate": 0.00018660254037844388,
"loss": 1.0615,
"step": 245
},
{
"epoch": 0.09713721618953604,
"grad_norm": 0.5423186421394348,
"learning_rate": 0.0001864909215136705,
"loss": 0.9082,
"step": 246
},
{
"epoch": 0.09753208292201382,
"grad_norm": 0.6605743169784546,
"learning_rate": 0.0001863788732787314,
"loss": 1.0483,
"step": 247
},
{
"epoch": 0.09792694965449161,
"grad_norm": 0.5038883090019226,
"learning_rate": 0.0001862663962298719,
"loss": 0.9848,
"step": 248
},
{
"epoch": 0.0983218163869694,
"grad_norm": 0.4320213496685028,
"learning_rate": 0.00018615349092546604,
"loss": 0.8254,
"step": 249
},
{
"epoch": 0.09871668311944719,
"grad_norm": 0.48691487312316895,
"learning_rate": 0.00018604015792601396,
"loss": 1.0443,
"step": 250
},
{
"epoch": 0.09911154985192498,
"grad_norm": 0.5005807876586914,
"learning_rate": 0.0001859263977941389,
"loss": 0.8591,
"step": 251
},
{
"epoch": 0.09950641658440276,
"grad_norm": 0.5520577430725098,
"learning_rate": 0.0001858122110945847,
"loss": 1.0758,
"step": 252
},
{
"epoch": 0.09990128331688056,
"grad_norm": 0.44724610447883606,
"learning_rate": 0.00018569759839421265,
"loss": 1.0458,
"step": 253
},
{
"epoch": 0.10029615004935834,
"grad_norm": 0.5659752488136292,
"learning_rate": 0.00018558256026199896,
"loss": 1.0763,
"step": 254
},
{
"epoch": 0.10069101678183613,
"grad_norm": 0.4626811146736145,
"learning_rate": 0.00018546709726903178,
"loss": 0.9823,
"step": 255
},
{
"epoch": 0.10108588351431391,
"grad_norm": 0.46605491638183594,
"learning_rate": 0.00018535120998850848,
"loss": 1.2862,
"step": 256
},
{
"epoch": 0.1014807502467917,
"grad_norm": 0.6146165132522583,
"learning_rate": 0.00018523489899573262,
"loss": 1.0654,
"step": 257
},
{
"epoch": 0.1018756169792695,
"grad_norm": 0.4941771328449249,
"learning_rate": 0.00018511816486811134,
"loss": 0.6966,
"step": 258
},
{
"epoch": 0.10227048371174728,
"grad_norm": 0.4850773811340332,
"learning_rate": 0.00018500100818515222,
"loss": 1.0579,
"step": 259
},
{
"epoch": 0.10266535044422508,
"grad_norm": 0.4935731589794159,
"learning_rate": 0.00018488342952846073,
"loss": 0.916,
"step": 260
},
{
"epoch": 0.10306021717670286,
"grad_norm": 0.46898818016052246,
"learning_rate": 0.000184765429481737,
"loss": 1.0551,
"step": 261
},
{
"epoch": 0.10345508390918065,
"grad_norm": 0.4352802038192749,
"learning_rate": 0.00018464700863077312,
"loss": 0.9019,
"step": 262
},
{
"epoch": 0.10384995064165845,
"grad_norm": 0.5294659733772278,
"learning_rate": 0.0001845281675634503,
"loss": 1.0661,
"step": 263
},
{
"epoch": 0.10424481737413623,
"grad_norm": 0.5499119758605957,
"learning_rate": 0.00018440890686973572,
"loss": 1.0584,
"step": 264
},
{
"epoch": 0.10463968410661402,
"grad_norm": 0.4448906183242798,
"learning_rate": 0.0001842892271416797,
"loss": 0.9614,
"step": 265
},
{
"epoch": 0.1050345508390918,
"grad_norm": 0.5477500557899475,
"learning_rate": 0.00018416912897341295,
"loss": 0.8138,
"step": 266
},
{
"epoch": 0.1054294175715696,
"grad_norm": 0.46556559205055237,
"learning_rate": 0.00018404861296114337,
"loss": 0.9528,
"step": 267
},
{
"epoch": 0.10582428430404739,
"grad_norm": 0.5127370953559875,
"learning_rate": 0.00018392767970315313,
"loss": 1.0031,
"step": 268
},
{
"epoch": 0.10621915103652517,
"grad_norm": 0.5918512940406799,
"learning_rate": 0.0001838063297997958,
"loss": 0.8006,
"step": 269
},
{
"epoch": 0.10661401776900296,
"grad_norm": 0.5921156406402588,
"learning_rate": 0.00018368456385349334,
"loss": 0.9143,
"step": 270
},
{
"epoch": 0.10700888450148074,
"grad_norm": 0.6525917053222656,
"learning_rate": 0.000183562382468733,
"loss": 1.0162,
"step": 271
},
{
"epoch": 0.10740375123395854,
"grad_norm": 0.5337258577346802,
"learning_rate": 0.00018343978625206452,
"loss": 0.9916,
"step": 272
},
{
"epoch": 0.10779861796643633,
"grad_norm": 0.509904146194458,
"learning_rate": 0.00018331677581209696,
"loss": 0.8978,
"step": 273
},
{
"epoch": 0.10819348469891411,
"grad_norm": 0.45876345038414,
"learning_rate": 0.0001831933517594957,
"loss": 1.0373,
"step": 274
},
{
"epoch": 0.10858835143139191,
"grad_norm": 0.5002173185348511,
"learning_rate": 0.00018306951470697946,
"loss": 0.8874,
"step": 275
},
{
"epoch": 0.10898321816386969,
"grad_norm": 0.44106170535087585,
"learning_rate": 0.00018294526526931718,
"loss": 0.7961,
"step": 276
},
{
"epoch": 0.10937808489634748,
"grad_norm": 0.4849831163883209,
"learning_rate": 0.00018282060406332512,
"loss": 1.0139,
"step": 277
},
{
"epoch": 0.10977295162882528,
"grad_norm": 0.4701422452926636,
"learning_rate": 0.0001826955317078636,
"loss": 1.0458,
"step": 278
},
{
"epoch": 0.11016781836130306,
"grad_norm": 0.5966842174530029,
"learning_rate": 0.00018257004882383412,
"loss": 0.7497,
"step": 279
},
{
"epoch": 0.11056268509378085,
"grad_norm": 0.49655190110206604,
"learning_rate": 0.00018244415603417603,
"loss": 1.2634,
"step": 280
},
{
"epoch": 0.11095755182625863,
"grad_norm": 0.7236825823783875,
"learning_rate": 0.00018231785396386377,
"loss": 1.0645,
"step": 281
},
{
"epoch": 0.11135241855873643,
"grad_norm": 0.522117018699646,
"learning_rate": 0.00018219114323990345,
"loss": 0.8553,
"step": 282
},
{
"epoch": 0.11174728529121422,
"grad_norm": 0.5486408472061157,
"learning_rate": 0.00018206402449132995,
"loss": 1.0859,
"step": 283
},
{
"epoch": 0.112142152023692,
"grad_norm": 0.6336297392845154,
"learning_rate": 0.00018193649834920373,
"loss": 1.0769,
"step": 284
},
{
"epoch": 0.1125370187561698,
"grad_norm": 0.47025516629219055,
"learning_rate": 0.0001818085654466076,
"loss": 0.9662,
"step": 285
},
{
"epoch": 0.11293188548864758,
"grad_norm": 0.5352553725242615,
"learning_rate": 0.00018168022641864377,
"loss": 1.0481,
"step": 286
},
{
"epoch": 0.11332675222112537,
"grad_norm": 0.4911988377571106,
"learning_rate": 0.00018155148190243051,
"loss": 0.888,
"step": 287
},
{
"epoch": 0.11372161895360317,
"grad_norm": 0.4751187264919281,
"learning_rate": 0.00018142233253709916,
"loss": 0.9517,
"step": 288
},
{
"epoch": 0.11411648568608095,
"grad_norm": 0.4718889892101288,
"learning_rate": 0.00018129277896379077,
"loss": 1.028,
"step": 289
},
{
"epoch": 0.11451135241855874,
"grad_norm": 0.5223097205162048,
"learning_rate": 0.00018116282182565311,
"loss": 0.7671,
"step": 290
},
{
"epoch": 0.11490621915103652,
"grad_norm": 0.47218167781829834,
"learning_rate": 0.0001810324617678373,
"loss": 0.982,
"step": 291
},
{
"epoch": 0.11530108588351431,
"grad_norm": 0.5651125907897949,
"learning_rate": 0.00018090169943749476,
"loss": 0.8603,
"step": 292
},
{
"epoch": 0.11569595261599211,
"grad_norm": 0.6196467280387878,
"learning_rate": 0.00018077053548377382,
"loss": 0.9174,
"step": 293
},
{
"epoch": 0.11609081934846989,
"grad_norm": 0.5194737911224365,
"learning_rate": 0.0001806389705578168,
"loss": 1.2105,
"step": 294
},
{
"epoch": 0.11648568608094768,
"grad_norm": 0.46870988607406616,
"learning_rate": 0.0001805070053127563,
"loss": 0.9009,
"step": 295
},
{
"epoch": 0.11688055281342546,
"grad_norm": 0.5096109509468079,
"learning_rate": 0.0001803746404037125,
"loss": 1.0304,
"step": 296
},
{
"epoch": 0.11727541954590326,
"grad_norm": 0.5446627140045166,
"learning_rate": 0.00018024187648778956,
"loss": 0.9724,
"step": 297
},
{
"epoch": 0.11767028627838105,
"grad_norm": 0.45771893858909607,
"learning_rate": 0.00018010871422407236,
"loss": 1.1809,
"step": 298
},
{
"epoch": 0.11806515301085883,
"grad_norm": 0.4939158856868744,
"learning_rate": 0.0001799751542736234,
"loss": 0.9581,
"step": 299
},
{
"epoch": 0.11846001974333663,
"grad_norm": 0.43252983689308167,
"learning_rate": 0.00017984119729947944,
"loss": 0.9829,
"step": 300
},
{
"epoch": 0.11885488647581441,
"grad_norm": 0.5026227831840515,
"learning_rate": 0.00017970684396664813,
"loss": 1.0617,
"step": 301
},
{
"epoch": 0.1192497532082922,
"grad_norm": 0.5750073194503784,
"learning_rate": 0.00017957209494210493,
"loss": 0.9648,
"step": 302
},
{
"epoch": 0.11964461994076998,
"grad_norm": 0.511223316192627,
"learning_rate": 0.0001794369508947894,
"loss": 0.9457,
"step": 303
},
{
"epoch": 0.12003948667324778,
"grad_norm": 0.6075318455696106,
"learning_rate": 0.00017930141249560233,
"loss": 1.1244,
"step": 304
},
{
"epoch": 0.12043435340572557,
"grad_norm": 0.5594468712806702,
"learning_rate": 0.00017916548041740213,
"loss": 0.9975,
"step": 305
},
{
"epoch": 0.12082922013820335,
"grad_norm": 0.45040223002433777,
"learning_rate": 0.0001790291553350016,
"loss": 1.0633,
"step": 306
},
{
"epoch": 0.12122408687068115,
"grad_norm": 0.5200604200363159,
"learning_rate": 0.0001788924379251645,
"loss": 0.8929,
"step": 307
},
{
"epoch": 0.12161895360315893,
"grad_norm": 0.48910826444625854,
"learning_rate": 0.00017875532886660228,
"loss": 0.9944,
"step": 308
},
{
"epoch": 0.12201382033563672,
"grad_norm": 0.5069397687911987,
"learning_rate": 0.0001786178288399706,
"loss": 1.1026,
"step": 309
},
{
"epoch": 0.12240868706811452,
"grad_norm": 0.4457162320613861,
"learning_rate": 0.0001784799385278661,
"loss": 0.8329,
"step": 310
},
{
"epoch": 0.1228035538005923,
"grad_norm": 0.5899550318717957,
"learning_rate": 0.0001783416586148229,
"loss": 0.9725,
"step": 311
},
{
"epoch": 0.12319842053307009,
"grad_norm": 0.48378434777259827,
"learning_rate": 0.00017820298978730921,
"loss": 0.9683,
"step": 312
},
{
"epoch": 0.12359328726554787,
"grad_norm": 0.47537699341773987,
"learning_rate": 0.00017806393273372395,
"loss": 1.0343,
"step": 313
},
{
"epoch": 0.12398815399802567,
"grad_norm": 0.46815434098243713,
"learning_rate": 0.00017792448814439333,
"loss": 1.0695,
"step": 314
},
{
"epoch": 0.12438302073050346,
"grad_norm": 0.4383327066898346,
"learning_rate": 0.00017778465671156743,
"loss": 1.0047,
"step": 315
},
{
"epoch": 0.12477788746298124,
"grad_norm": 0.46698901057243347,
"learning_rate": 0.00017764443912941672,
"loss": 1.0268,
"step": 316
},
{
"epoch": 0.12517275419545904,
"grad_norm": 0.5612544417381287,
"learning_rate": 0.0001775038360940287,
"loss": 0.9519,
"step": 317
},
{
"epoch": 0.12556762092793683,
"grad_norm": 0.5520269274711609,
"learning_rate": 0.00017736284830340436,
"loss": 0.9068,
"step": 318
},
{
"epoch": 0.12596248766041462,
"grad_norm": 0.5094735622406006,
"learning_rate": 0.00017722147645745468,
"loss": 0.9915,
"step": 319
},
{
"epoch": 0.1263573543928924,
"grad_norm": 0.49084824323654175,
"learning_rate": 0.00017707972125799735,
"loss": 0.8411,
"step": 320
},
{
"epoch": 0.12675222112537018,
"grad_norm": 0.47397279739379883,
"learning_rate": 0.00017693758340875306,
"loss": 0.9266,
"step": 321
},
{
"epoch": 0.12714708785784798,
"grad_norm": 0.3962591588497162,
"learning_rate": 0.00017679506361534215,
"loss": 0.9885,
"step": 322
},
{
"epoch": 0.12754195459032577,
"grad_norm": 0.48955652117729187,
"learning_rate": 0.000176652162585281,
"loss": 1.0869,
"step": 323
},
{
"epoch": 0.12793682132280354,
"grad_norm": 0.6018503308296204,
"learning_rate": 0.00017650888102797868,
"loss": 0.8733,
"step": 324
},
{
"epoch": 0.12833168805528133,
"grad_norm": 0.6061957478523254,
"learning_rate": 0.00017636521965473323,
"loss": 1.2883,
"step": 325
},
{
"epoch": 0.12872655478775913,
"grad_norm": 0.47981151938438416,
"learning_rate": 0.00017622117917872823,
"loss": 0.9246,
"step": 326
},
{
"epoch": 0.12912142152023692,
"grad_norm": 0.5347411632537842,
"learning_rate": 0.00017607676031502933,
"loss": 1.0827,
"step": 327
},
{
"epoch": 0.12951628825271472,
"grad_norm": 0.7249376773834229,
"learning_rate": 0.0001759319637805806,
"loss": 1.0677,
"step": 328
},
{
"epoch": 0.12991115498519248,
"grad_norm": 0.5934639573097229,
"learning_rate": 0.00017578679029420092,
"loss": 0.8046,
"step": 329
},
{
"epoch": 0.13030602171767028,
"grad_norm": 0.6128519773483276,
"learning_rate": 0.00017564124057658056,
"loss": 0.8474,
"step": 330
},
{
"epoch": 0.13070088845014807,
"grad_norm": 0.49019843339920044,
"learning_rate": 0.0001754953153502775,
"loss": 0.9108,
"step": 331
},
{
"epoch": 0.13109575518262587,
"grad_norm": 0.548611044883728,
"learning_rate": 0.0001753490153397139,
"loss": 0.9954,
"step": 332
},
{
"epoch": 0.13149062191510366,
"grad_norm": 0.49729061126708984,
"learning_rate": 0.00017520234127117243,
"loss": 0.9943,
"step": 333
},
{
"epoch": 0.13188548864758143,
"grad_norm": 0.47645774483680725,
"learning_rate": 0.00017505529387279277,
"loss": 0.8234,
"step": 334
},
{
"epoch": 0.13228035538005922,
"grad_norm": 0.5200782418251038,
"learning_rate": 0.0001749078738745679,
"loss": 1.0991,
"step": 335
},
{
"epoch": 0.13267522211253702,
"grad_norm": 0.4647184908390045,
"learning_rate": 0.0001747600820083405,
"loss": 1.0169,
"step": 336
},
{
"epoch": 0.1330700888450148,
"grad_norm": 0.4864305853843689,
"learning_rate": 0.00017461191900779936,
"loss": 0.9525,
"step": 337
},
{
"epoch": 0.1334649555774926,
"grad_norm": 0.5017113089561462,
"learning_rate": 0.00017446338560847568,
"loss": 0.807,
"step": 338
},
{
"epoch": 0.13385982230997037,
"grad_norm": 0.5566651821136475,
"learning_rate": 0.00017431448254773944,
"loss": 1.2096,
"step": 339
},
{
"epoch": 0.13425468904244817,
"grad_norm": 0.48729124665260315,
"learning_rate": 0.00017416521056479577,
"loss": 0.84,
"step": 340
},
{
"epoch": 0.13464955577492596,
"grad_norm": 0.48902615904808044,
"learning_rate": 0.00017401557040068124,
"loss": 0.9068,
"step": 341
},
{
"epoch": 0.13504442250740376,
"grad_norm": 0.5371021628379822,
"learning_rate": 0.00017386556279826021,
"loss": 1.0875,
"step": 342
},
{
"epoch": 0.13543928923988155,
"grad_norm": 0.5122295618057251,
"learning_rate": 0.00017371518850222112,
"loss": 0.928,
"step": 343
},
{
"epoch": 0.13583415597235932,
"grad_norm": 0.5144253373146057,
"learning_rate": 0.00017356444825907273,
"loss": 1.1201,
"step": 344
},
{
"epoch": 0.1362290227048371,
"grad_norm": 0.6197713017463684,
"learning_rate": 0.00017341334281714064,
"loss": 1.0366,
"step": 345
},
{
"epoch": 0.1366238894373149,
"grad_norm": 0.5059978365898132,
"learning_rate": 0.00017326187292656333,
"loss": 1.0132,
"step": 346
},
{
"epoch": 0.1370187561697927,
"grad_norm": 0.45369940996170044,
"learning_rate": 0.00017311003933928847,
"loss": 1.0436,
"step": 347
},
{
"epoch": 0.1374136229022705,
"grad_norm": 0.5087475180625916,
"learning_rate": 0.00017295784280906934,
"loss": 0.9475,
"step": 348
},
{
"epoch": 0.13780848963474826,
"grad_norm": 0.48209476470947266,
"learning_rate": 0.00017280528409146094,
"loss": 1.1108,
"step": 349
},
{
"epoch": 0.13820335636722605,
"grad_norm": 0.5897043943405151,
"learning_rate": 0.00017265236394381633,
"loss": 1.0758,
"step": 350
},
{
"epoch": 0.13859822309970385,
"grad_norm": 0.4946494996547699,
"learning_rate": 0.00017249908312528276,
"loss": 0.9829,
"step": 351
},
{
"epoch": 0.13899308983218164,
"grad_norm": 0.49029871821403503,
"learning_rate": 0.00017234544239679806,
"loss": 0.8431,
"step": 352
},
{
"epoch": 0.13938795656465944,
"grad_norm": 0.5330137610435486,
"learning_rate": 0.00017219144252108673,
"loss": 1.13,
"step": 353
},
{
"epoch": 0.1397828232971372,
"grad_norm": 0.47816064953804016,
"learning_rate": 0.00017203708426265614,
"loss": 1.0986,
"step": 354
},
{
"epoch": 0.140177690029615,
"grad_norm": 0.537811815738678,
"learning_rate": 0.00017188236838779295,
"loss": 1.0599,
"step": 355
},
{
"epoch": 0.140177690029615,
"eval_loss": 0.9808822274208069,
"eval_runtime": 61.6088,
"eval_samples_per_second": 17.319,
"eval_steps_per_second": 8.668,
"step": 355
},
{
"epoch": 0.1405725567620928,
"grad_norm": 0.5210056304931641,
"learning_rate": 0.000171727295664559,
"loss": 1.1635,
"step": 356
},
{
"epoch": 0.1409674234945706,
"grad_norm": 0.5472628474235535,
"learning_rate": 0.00017157186686278766,
"loss": 1.2106,
"step": 357
},
{
"epoch": 0.14136229022704838,
"grad_norm": 0.459087073802948,
"learning_rate": 0.00017141608275408006,
"loss": 1.0337,
"step": 358
},
{
"epoch": 0.14175715695952615,
"grad_norm": 0.41874152421951294,
"learning_rate": 0.00017125994411180124,
"loss": 0.8032,
"step": 359
},
{
"epoch": 0.14215202369200394,
"grad_norm": 0.4521096348762512,
"learning_rate": 0.0001711034517110761,
"loss": 0.9549,
"step": 360
},
{
"epoch": 0.14254689042448174,
"grad_norm": 0.48767751455307007,
"learning_rate": 0.00017094660632878582,
"loss": 0.9779,
"step": 361
},
{
"epoch": 0.14294175715695953,
"grad_norm": 0.4864053428173065,
"learning_rate": 0.00017078940874356392,
"loss": 0.7642,
"step": 362
},
{
"epoch": 0.14333662388943733,
"grad_norm": 0.46765899658203125,
"learning_rate": 0.00017063185973579232,
"loss": 1.0457,
"step": 363
},
{
"epoch": 0.1437314906219151,
"grad_norm": 0.4682892858982086,
"learning_rate": 0.00017047396008759754,
"loss": 0.974,
"step": 364
},
{
"epoch": 0.1441263573543929,
"grad_norm": 0.4890439212322235,
"learning_rate": 0.00017031571058284678,
"loss": 0.9047,
"step": 365
},
{
"epoch": 0.14452122408687068,
"grad_norm": 0.4934488832950592,
"learning_rate": 0.00017015711200714414,
"loss": 1.1,
"step": 366
},
{
"epoch": 0.14491609081934848,
"grad_norm": 0.5664051175117493,
"learning_rate": 0.00016999816514782647,
"loss": 1.0985,
"step": 367
},
{
"epoch": 0.14531095755182627,
"grad_norm": 0.5750765800476074,
"learning_rate": 0.00016983887079395974,
"loss": 0.936,
"step": 368
},
{
"epoch": 0.14570582428430404,
"grad_norm": 0.49568259716033936,
"learning_rate": 0.00016967922973633494,
"loss": 1.1489,
"step": 369
},
{
"epoch": 0.14610069101678183,
"grad_norm": 0.5348165035247803,
"learning_rate": 0.00016951924276746425,
"loss": 1.1904,
"step": 370
},
{
"epoch": 0.14649555774925963,
"grad_norm": 0.4326549470424652,
"learning_rate": 0.00016935891068157704,
"loss": 0.8516,
"step": 371
},
{
"epoch": 0.14689042448173742,
"grad_norm": 0.5540488362312317,
"learning_rate": 0.000169198234274616,
"loss": 1.0405,
"step": 372
},
{
"epoch": 0.1472852912142152,
"grad_norm": 0.5236465930938721,
"learning_rate": 0.00016903721434423306,
"loss": 0.9151,
"step": 373
},
{
"epoch": 0.14768015794669298,
"grad_norm": 0.5312307476997375,
"learning_rate": 0.00016887585168978562,
"loss": 1.1763,
"step": 374
},
{
"epoch": 0.14807502467917077,
"grad_norm": 0.4888836145401001,
"learning_rate": 0.0001687141471123324,
"loss": 1.0494,
"step": 375
},
{
"epoch": 0.14846989141164857,
"grad_norm": 0.49629417061805725,
"learning_rate": 0.00016855210141462963,
"loss": 0.697,
"step": 376
},
{
"epoch": 0.14886475814412636,
"grad_norm": 0.5097388029098511,
"learning_rate": 0.0001683897154011269,
"loss": 1.0308,
"step": 377
},
{
"epoch": 0.14925962487660416,
"grad_norm": 0.4330967664718628,
"learning_rate": 0.0001682269898779632,
"loss": 0.8466,
"step": 378
},
{
"epoch": 0.14965449160908192,
"grad_norm": 0.4622458219528198,
"learning_rate": 0.00016806392565296311,
"loss": 0.849,
"step": 379
},
{
"epoch": 0.15004935834155972,
"grad_norm": 0.4904235303401947,
"learning_rate": 0.00016790052353563253,
"loss": 1.0324,
"step": 380
},
{
"epoch": 0.1504442250740375,
"grad_norm": 0.5324286222457886,
"learning_rate": 0.00016773678433715475,
"loss": 0.905,
"step": 381
},
{
"epoch": 0.1508390918065153,
"grad_norm": 0.5701109766960144,
"learning_rate": 0.00016757270887038654,
"loss": 1.1105,
"step": 382
},
{
"epoch": 0.1512339585389931,
"grad_norm": 0.45347732305526733,
"learning_rate": 0.00016740829794985394,
"loss": 0.875,
"step": 383
},
{
"epoch": 0.15162882527147087,
"grad_norm": 0.4763396084308624,
"learning_rate": 0.00016724355239174833,
"loss": 0.9732,
"step": 384
},
{
"epoch": 0.15202369200394866,
"grad_norm": 0.4246227443218231,
"learning_rate": 0.00016707847301392236,
"loss": 0.9235,
"step": 385
},
{
"epoch": 0.15241855873642646,
"grad_norm": 0.6034351587295532,
"learning_rate": 0.00016691306063588583,
"loss": 0.8553,
"step": 386
},
{
"epoch": 0.15281342546890425,
"grad_norm": 0.48924902081489563,
"learning_rate": 0.0001667473160788017,
"loss": 0.967,
"step": 387
},
{
"epoch": 0.15320829220138205,
"grad_norm": 0.5340859889984131,
"learning_rate": 0.00016658124016548197,
"loss": 1.1052,
"step": 388
},
{
"epoch": 0.1536031589338598,
"grad_norm": 0.5492063760757446,
"learning_rate": 0.0001664148337203836,
"loss": 1.1952,
"step": 389
},
{
"epoch": 0.1539980256663376,
"grad_norm": 0.4856433868408203,
"learning_rate": 0.00016624809756960444,
"loss": 0.9994,
"step": 390
},
{
"epoch": 0.1543928923988154,
"grad_norm": 0.4147414565086365,
"learning_rate": 0.00016608103254087906,
"loss": 0.9976,
"step": 391
},
{
"epoch": 0.1547877591312932,
"grad_norm": 0.5175687074661255,
"learning_rate": 0.00016591363946357474,
"loss": 1.0245,
"step": 392
},
{
"epoch": 0.155182625863771,
"grad_norm": 0.620985209941864,
"learning_rate": 0.00016574591916868728,
"loss": 1.1981,
"step": 393
},
{
"epoch": 0.15557749259624876,
"grad_norm": 0.42090892791748047,
"learning_rate": 0.00016557787248883696,
"loss": 0.9171,
"step": 394
},
{
"epoch": 0.15597235932872655,
"grad_norm": 0.6142613291740417,
"learning_rate": 0.00016540950025826422,
"loss": 1.0901,
"step": 395
},
{
"epoch": 0.15636722606120435,
"grad_norm": 0.6569430232048035,
"learning_rate": 0.00016524080331282577,
"loss": 1.0362,
"step": 396
},
{
"epoch": 0.15676209279368214,
"grad_norm": 0.5111309885978699,
"learning_rate": 0.00016507178248999024,
"loss": 0.9666,
"step": 397
},
{
"epoch": 0.15715695952615993,
"grad_norm": 0.5645838975906372,
"learning_rate": 0.00016490243862883413,
"loss": 0.9295,
"step": 398
},
{
"epoch": 0.1575518262586377,
"grad_norm": 0.46207594871520996,
"learning_rate": 0.00016473277257003757,
"loss": 1.0462,
"step": 399
},
{
"epoch": 0.1579466929911155,
"grad_norm": 0.5260053873062134,
"learning_rate": 0.00016456278515588024,
"loss": 0.9745,
"step": 400
},
{
"epoch": 0.1583415597235933,
"grad_norm": 0.5953394174575806,
"learning_rate": 0.00016439247723023712,
"loss": 1.0208,
"step": 401
},
{
"epoch": 0.15873642645607108,
"grad_norm": 0.5679177641868591,
"learning_rate": 0.00016422184963857432,
"loss": 0.8679,
"step": 402
},
{
"epoch": 0.15913129318854888,
"grad_norm": 0.47382932901382446,
"learning_rate": 0.00016405090322794483,
"loss": 0.9579,
"step": 403
},
{
"epoch": 0.15952615992102664,
"grad_norm": 0.49775707721710205,
"learning_rate": 0.00016387963884698448,
"loss": 1.2399,
"step": 404
},
{
"epoch": 0.15992102665350444,
"grad_norm": 0.49022993445396423,
"learning_rate": 0.00016370805734590747,
"loss": 1.0984,
"step": 405
},
{
"epoch": 0.16031589338598223,
"grad_norm": 0.5563966035842896,
"learning_rate": 0.00016353615957650236,
"loss": 0.9777,
"step": 406
},
{
"epoch": 0.16071076011846003,
"grad_norm": 0.46516212821006775,
"learning_rate": 0.00016336394639212783,
"loss": 0.8274,
"step": 407
},
{
"epoch": 0.16110562685093782,
"grad_norm": 0.4698953926563263,
"learning_rate": 0.00016319141864770827,
"loss": 0.9419,
"step": 408
},
{
"epoch": 0.1615004935834156,
"grad_norm": 0.4373069405555725,
"learning_rate": 0.00016301857719972976,
"loss": 0.8235,
"step": 409
},
{
"epoch": 0.16189536031589338,
"grad_norm": 0.6190903186798096,
"learning_rate": 0.00016284542290623567,
"loss": 0.9092,
"step": 410
},
{
"epoch": 0.16229022704837118,
"grad_norm": 0.5729885101318359,
"learning_rate": 0.0001626719566268224,
"loss": 0.9563,
"step": 411
},
{
"epoch": 0.16268509378084897,
"grad_norm": 0.5004550814628601,
"learning_rate": 0.00016249817922263517,
"loss": 1.1011,
"step": 412
},
{
"epoch": 0.16307996051332677,
"grad_norm": 0.4848381578922272,
"learning_rate": 0.0001623240915563638,
"loss": 1.0606,
"step": 413
},
{
"epoch": 0.16347482724580453,
"grad_norm": 0.5314562916755676,
"learning_rate": 0.00016214969449223824,
"loss": 0.7504,
"step": 414
},
{
"epoch": 0.16386969397828233,
"grad_norm": 0.5033890604972839,
"learning_rate": 0.00016197498889602448,
"loss": 1.0749,
"step": 415
},
{
"epoch": 0.16426456071076012,
"grad_norm": 0.5094935894012451,
"learning_rate": 0.0001617999756350202,
"loss": 0.8851,
"step": 416
},
{
"epoch": 0.16465942744323792,
"grad_norm": 0.4637800455093384,
"learning_rate": 0.00016162465557805034,
"loss": 0.8553,
"step": 417
},
{
"epoch": 0.16505429417571568,
"grad_norm": 0.43399858474731445,
"learning_rate": 0.00016144902959546286,
"loss": 0.9576,
"step": 418
},
{
"epoch": 0.16544916090819348,
"grad_norm": 0.5109582543373108,
"learning_rate": 0.00016127309855912457,
"loss": 1.1576,
"step": 419
},
{
"epoch": 0.16584402764067127,
"grad_norm": 0.5534142851829529,
"learning_rate": 0.00016109686334241655,
"loss": 0.7701,
"step": 420
},
{
"epoch": 0.16623889437314907,
"grad_norm": 0.46211493015289307,
"learning_rate": 0.00016092032482023,
"loss": 0.9263,
"step": 421
},
{
"epoch": 0.16663376110562686,
"grad_norm": 0.5489344596862793,
"learning_rate": 0.00016074348386896177,
"loss": 1.0165,
"step": 422
},
{
"epoch": 0.16702862783810463,
"grad_norm": 0.4893771708011627,
"learning_rate": 0.0001605663413665102,
"loss": 1.0284,
"step": 423
},
{
"epoch": 0.16742349457058242,
"grad_norm": 0.46742960810661316,
"learning_rate": 0.00016038889819227045,
"loss": 1.0603,
"step": 424
},
{
"epoch": 0.16781836130306022,
"grad_norm": 0.6020311713218689,
"learning_rate": 0.00016021115522713047,
"loss": 0.8771,
"step": 425
},
{
"epoch": 0.168213228035538,
"grad_norm": 0.5273780822753906,
"learning_rate": 0.00016003311335346636,
"loss": 0.9648,
"step": 426
},
{
"epoch": 0.1686080947680158,
"grad_norm": 0.5141377449035645,
"learning_rate": 0.00015985477345513817,
"loss": 0.8773,
"step": 427
},
{
"epoch": 0.16900296150049357,
"grad_norm": 0.4852812588214874,
"learning_rate": 0.00015967613641748542,
"loss": 0.7764,
"step": 428
},
{
"epoch": 0.16939782823297136,
"grad_norm": 0.43280699849128723,
"learning_rate": 0.0001594972031273228,
"loss": 0.7407,
"step": 429
},
{
"epoch": 0.16979269496544916,
"grad_norm": 0.5069910883903503,
"learning_rate": 0.00015931797447293552,
"loss": 0.9313,
"step": 430
},
{
"epoch": 0.17018756169792695,
"grad_norm": 0.5123517513275146,
"learning_rate": 0.00015913845134407533,
"loss": 1.0705,
"step": 431
},
{
"epoch": 0.17058242843040475,
"grad_norm": 0.724429190158844,
"learning_rate": 0.00015895863463195558,
"loss": 0.9353,
"step": 432
},
{
"epoch": 0.17097729516288251,
"grad_norm": 0.5428094267845154,
"learning_rate": 0.00015877852522924732,
"loss": 1.1227,
"step": 433
},
{
"epoch": 0.1713721618953603,
"grad_norm": 0.6102519631385803,
"learning_rate": 0.00015859812403007443,
"loss": 0.9353,
"step": 434
},
{
"epoch": 0.1717670286278381,
"grad_norm": 0.5151787400245667,
"learning_rate": 0.00015841743193000944,
"loss": 0.9646,
"step": 435
},
{
"epoch": 0.1721618953603159,
"grad_norm": 0.6272695064544678,
"learning_rate": 0.00015823644982606905,
"loss": 0.8384,
"step": 436
},
{
"epoch": 0.1725567620927937,
"grad_norm": 0.49354809522628784,
"learning_rate": 0.00015805517861670952,
"loss": 0.7855,
"step": 437
},
{
"epoch": 0.17295162882527146,
"grad_norm": 0.47559288144111633,
"learning_rate": 0.0001578736192018224,
"loss": 0.9591,
"step": 438
},
{
"epoch": 0.17334649555774925,
"grad_norm": 0.5615376830101013,
"learning_rate": 0.00015769177248273008,
"loss": 1.1537,
"step": 439
},
{
"epoch": 0.17374136229022705,
"grad_norm": 0.5301774144172668,
"learning_rate": 0.00015750963936218105,
"loss": 0.7773,
"step": 440
},
{
"epoch": 0.17413622902270484,
"grad_norm": 0.5083664059638977,
"learning_rate": 0.0001573272207443457,
"loss": 0.9705,
"step": 441
},
{
"epoch": 0.17453109575518264,
"grad_norm": 0.5633112788200378,
"learning_rate": 0.00015714451753481168,
"loss": 1.0109,
"step": 442
},
{
"epoch": 0.1749259624876604,
"grad_norm": 0.5243581533432007,
"learning_rate": 0.00015696153064057947,
"loss": 1.0258,
"step": 443
},
{
"epoch": 0.1753208292201382,
"grad_norm": 0.6054911613464355,
"learning_rate": 0.0001567782609700579,
"loss": 1.0102,
"step": 444
},
{
"epoch": 0.175715695952616,
"grad_norm": 0.5889274477958679,
"learning_rate": 0.00015659470943305955,
"loss": 1.1549,
"step": 445
},
{
"epoch": 0.17611056268509379,
"grad_norm": 0.5765202045440674,
"learning_rate": 0.0001564108769407962,
"loss": 0.7791,
"step": 446
},
{
"epoch": 0.17650542941757158,
"grad_norm": 0.5080841779708862,
"learning_rate": 0.0001562267644058746,
"loss": 0.9962,
"step": 447
},
{
"epoch": 0.17690029615004935,
"grad_norm": 0.5185093879699707,
"learning_rate": 0.00015604237274229147,
"loss": 1.1927,
"step": 448
},
{
"epoch": 0.17729516288252714,
"grad_norm": 0.5385391116142273,
"learning_rate": 0.00015585770286542945,
"loss": 1.0555,
"step": 449
},
{
"epoch": 0.17769002961500494,
"grad_norm": 0.6289413571357727,
"learning_rate": 0.00015567275569205218,
"loss": 1.0431,
"step": 450
},
{
"epoch": 0.17808489634748273,
"grad_norm": 0.6271052956581116,
"learning_rate": 0.0001554875321402999,
"loss": 1.0078,
"step": 451
},
{
"epoch": 0.17847976307996052,
"grad_norm": 0.6165266633033752,
"learning_rate": 0.00015530203312968502,
"loss": 0.9761,
"step": 452
},
{
"epoch": 0.1788746298124383,
"grad_norm": 0.4909960627555847,
"learning_rate": 0.00015511625958108719,
"loss": 1.061,
"step": 453
},
{
"epoch": 0.17926949654491608,
"grad_norm": 0.5358797311782837,
"learning_rate": 0.00015493021241674918,
"loss": 1.0878,
"step": 454
},
{
"epoch": 0.17966436327739388,
"grad_norm": 0.6802231073379517,
"learning_rate": 0.000154743892560272,
"loss": 1.0068,
"step": 455
},
{
"epoch": 0.18005923000987167,
"grad_norm": 0.6705336570739746,
"learning_rate": 0.00015455730093661034,
"loss": 1.0845,
"step": 456
},
{
"epoch": 0.18045409674234947,
"grad_norm": 0.43329593539237976,
"learning_rate": 0.0001543704384720681,
"loss": 0.8636,
"step": 457
},
{
"epoch": 0.18084896347482723,
"grad_norm": 0.5168460011482239,
"learning_rate": 0.0001541833060942937,
"loss": 0.7497,
"step": 458
},
{
"epoch": 0.18124383020730503,
"grad_norm": 0.4275985062122345,
"learning_rate": 0.0001539959047322755,
"loss": 0.9042,
"step": 459
},
{
"epoch": 0.18163869693978282,
"grad_norm": 0.44853127002716064,
"learning_rate": 0.00015380823531633729,
"loss": 0.9091,
"step": 460
},
{
"epoch": 0.18203356367226062,
"grad_norm": 0.4683418571949005,
"learning_rate": 0.00015362029877813332,
"loss": 0.8174,
"step": 461
},
{
"epoch": 0.1824284304047384,
"grad_norm": 0.4813165068626404,
"learning_rate": 0.00015343209605064422,
"loss": 0.7648,
"step": 462
},
{
"epoch": 0.18282329713721618,
"grad_norm": 0.47031188011169434,
"learning_rate": 0.00015324362806817186,
"loss": 0.8667,
"step": 463
},
{
"epoch": 0.18321816386969397,
"grad_norm": 0.5744631886482239,
"learning_rate": 0.00015305489576633504,
"loss": 0.8219,
"step": 464
},
{
"epoch": 0.18361303060217177,
"grad_norm": 0.48958727717399597,
"learning_rate": 0.00015286590008206465,
"loss": 1.018,
"step": 465
},
{
"epoch": 0.18400789733464956,
"grad_norm": 0.5247324109077454,
"learning_rate": 0.00015267664195359917,
"loss": 0.8933,
"step": 466
},
{
"epoch": 0.18440276406712736,
"grad_norm": 0.545970618724823,
"learning_rate": 0.00015248712232047992,
"loss": 1.0508,
"step": 467
},
{
"epoch": 0.18479763079960512,
"grad_norm": 0.5140126347541809,
"learning_rate": 0.0001522973421235464,
"loss": 0.7581,
"step": 468
},
{
"epoch": 0.18519249753208292,
"grad_norm": 0.4570896625518799,
"learning_rate": 0.00015210730230493162,
"loss": 1.0665,
"step": 469
},
{
"epoch": 0.1855873642645607,
"grad_norm": 0.43096470832824707,
"learning_rate": 0.00015191700380805752,
"loss": 0.8313,
"step": 470
},
{
"epoch": 0.1859822309970385,
"grad_norm": 0.460275799036026,
"learning_rate": 0.00015172644757763015,
"loss": 0.9575,
"step": 471
},
{
"epoch": 0.1863770977295163,
"grad_norm": 0.4163447618484497,
"learning_rate": 0.00015153563455963499,
"loss": 0.8838,
"step": 472
},
{
"epoch": 0.18677196446199407,
"grad_norm": 0.542241096496582,
"learning_rate": 0.0001513445657013324,
"loss": 0.8143,
"step": 473
},
{
"epoch": 0.18716683119447186,
"grad_norm": 0.6614589691162109,
"learning_rate": 0.00015115324195125274,
"loss": 0.9645,
"step": 474
},
{
"epoch": 0.18756169792694966,
"grad_norm": 0.4719527065753937,
"learning_rate": 0.00015096166425919175,
"loss": 0.9894,
"step": 475
},
{
"epoch": 0.18795656465942745,
"grad_norm": 0.4972122013568878,
"learning_rate": 0.0001507698335762059,
"loss": 0.8865,
"step": 476
},
{
"epoch": 0.18835143139190524,
"grad_norm": 0.5407273769378662,
"learning_rate": 0.00015057775085460749,
"loss": 0.8714,
"step": 477
},
{
"epoch": 0.188746298124383,
"grad_norm": 0.4692353308200836,
"learning_rate": 0.00015038541704796003,
"loss": 0.9357,
"step": 478
},
{
"epoch": 0.1891411648568608,
"grad_norm": 0.4748722314834595,
"learning_rate": 0.00015019283311107367,
"loss": 1.0376,
"step": 479
},
{
"epoch": 0.1895360315893386,
"grad_norm": 0.4381348490715027,
"learning_rate": 0.00015000000000000001,
"loss": 1.0059,
"step": 480
},
{
"epoch": 0.1899308983218164,
"grad_norm": 0.48987191915512085,
"learning_rate": 0.0001498069186720279,
"loss": 0.8901,
"step": 481
},
{
"epoch": 0.1903257650542942,
"grad_norm": 0.5566233396530151,
"learning_rate": 0.0001496135900856782,
"loss": 1.1464,
"step": 482
},
{
"epoch": 0.19072063178677195,
"grad_norm": 0.5021251440048218,
"learning_rate": 0.00014942001520069947,
"loss": 1.0947,
"step": 483
},
{
"epoch": 0.19111549851924975,
"grad_norm": 0.5681881904602051,
"learning_rate": 0.00014922619497806277,
"loss": 1.0981,
"step": 484
},
{
"epoch": 0.19151036525172754,
"grad_norm": 0.5182890892028809,
"learning_rate": 0.00014903213037995724,
"loss": 1.1017,
"step": 485
},
{
"epoch": 0.19190523198420534,
"grad_norm": 0.4682919979095459,
"learning_rate": 0.0001488378223697851,
"loss": 0.6508,
"step": 486
},
{
"epoch": 0.19230009871668313,
"grad_norm": 0.4784727096557617,
"learning_rate": 0.00014864327191215702,
"loss": 0.874,
"step": 487
},
{
"epoch": 0.1926949654491609,
"grad_norm": 0.5247599482536316,
"learning_rate": 0.00014844847997288717,
"loss": 1.1797,
"step": 488
},
{
"epoch": 0.1930898321816387,
"grad_norm": 0.48195531964302063,
"learning_rate": 0.00014825344751898863,
"loss": 1.0463,
"step": 489
},
{
"epoch": 0.1934846989141165,
"grad_norm": 0.5093549489974976,
"learning_rate": 0.00014805817551866838,
"loss": 1.0409,
"step": 490
},
{
"epoch": 0.19387956564659428,
"grad_norm": 0.5454416275024414,
"learning_rate": 0.00014786266494132267,
"loss": 1.035,
"step": 491
},
{
"epoch": 0.19427443237907208,
"grad_norm": 0.49301639199256897,
"learning_rate": 0.00014766691675753202,
"loss": 1.1046,
"step": 492
},
{
"epoch": 0.19466929911154984,
"grad_norm": 0.4534429907798767,
"learning_rate": 0.00014747093193905657,
"loss": 0.9061,
"step": 493
},
{
"epoch": 0.19506416584402764,
"grad_norm": 0.599082887172699,
"learning_rate": 0.00014727471145883127,
"loss": 1.0882,
"step": 494
},
{
"epoch": 0.19545903257650543,
"grad_norm": 0.5382128953933716,
"learning_rate": 0.00014707825629096084,
"loss": 1.0369,
"step": 495
},
{
"epoch": 0.19585389930898323,
"grad_norm": 0.5784068703651428,
"learning_rate": 0.00014688156741071514,
"loss": 0.9614,
"step": 496
},
{
"epoch": 0.19624876604146102,
"grad_norm": 0.5423576235771179,
"learning_rate": 0.00014668464579452425,
"loss": 0.9217,
"step": 497
},
{
"epoch": 0.1966436327739388,
"grad_norm": 0.47836440801620483,
"learning_rate": 0.00014648749241997363,
"loss": 0.918,
"step": 498
},
{
"epoch": 0.19703849950641658,
"grad_norm": 0.5091026425361633,
"learning_rate": 0.00014629010826579928,
"loss": 1.0415,
"step": 499
},
{
"epoch": 0.19743336623889438,
"grad_norm": 0.553871214389801,
"learning_rate": 0.00014609249431188278,
"loss": 0.8315,
"step": 500
},
{
"epoch": 0.19782823297137217,
"grad_norm": 0.4480036199092865,
"learning_rate": 0.00014589465153924672,
"loss": 0.9431,
"step": 501
},
{
"epoch": 0.19822309970384996,
"grad_norm": 0.5340292453765869,
"learning_rate": 0.00014569658093004935,
"loss": 0.8736,
"step": 502
},
{
"epoch": 0.19861796643632773,
"grad_norm": 0.5622652173042297,
"learning_rate": 0.0001454982834675802,
"loss": 1.1759,
"step": 503
},
{
"epoch": 0.19901283316880553,
"grad_norm": 0.5641468167304993,
"learning_rate": 0.00014529976013625482,
"loss": 0.9721,
"step": 504
},
{
"epoch": 0.19940769990128332,
"grad_norm": 0.49418380856513977,
"learning_rate": 0.00014510101192161018,
"loss": 0.8389,
"step": 505
},
{
"epoch": 0.19980256663376111,
"grad_norm": 0.47910451889038086,
"learning_rate": 0.0001449020398102996,
"loss": 0.9339,
"step": 506
},
{
"epoch": 0.20019743336623888,
"grad_norm": 0.5156650543212891,
"learning_rate": 0.00014470284479008782,
"loss": 0.9458,
"step": 507
},
{
"epoch": 0.20059230009871667,
"grad_norm": 0.4681549072265625,
"learning_rate": 0.00014450342784984633,
"loss": 0.8954,
"step": 508
},
{
"epoch": 0.20098716683119447,
"grad_norm": 0.5173560380935669,
"learning_rate": 0.00014430378997954817,
"loss": 1.0272,
"step": 509
},
{
"epoch": 0.20138203356367226,
"grad_norm": 0.5966143012046814,
"learning_rate": 0.00014410393217026318,
"loss": 0.8915,
"step": 510
},
{
"epoch": 0.20177690029615006,
"grad_norm": 0.5026108026504517,
"learning_rate": 0.00014390385541415308,
"loss": 0.9169,
"step": 511
},
{
"epoch": 0.20217176702862782,
"grad_norm": 0.4867290258407593,
"learning_rate": 0.00014370356070446654,
"loss": 1.133,
"step": 512
},
{
"epoch": 0.20256663376110562,
"grad_norm": 0.4962225556373596,
"learning_rate": 0.00014350304903553416,
"loss": 0.9498,
"step": 513
},
{
"epoch": 0.2029615004935834,
"grad_norm": 0.4522517919540405,
"learning_rate": 0.00014330232140276366,
"loss": 0.8796,
"step": 514
},
{
"epoch": 0.2033563672260612,
"grad_norm": 0.4583631455898285,
"learning_rate": 0.00014310137880263482,
"loss": 0.9822,
"step": 515
},
{
"epoch": 0.203751233958539,
"grad_norm": 0.5668156147003174,
"learning_rate": 0.00014290022223269463,
"loss": 0.8197,
"step": 516
},
{
"epoch": 0.20414610069101677,
"grad_norm": 0.47949913144111633,
"learning_rate": 0.0001426988526915523,
"loss": 0.9435,
"step": 517
},
{
"epoch": 0.20454096742349456,
"grad_norm": 0.497577965259552,
"learning_rate": 0.00014249727117887425,
"loss": 0.8928,
"step": 518
},
{
"epoch": 0.20493583415597236,
"grad_norm": 0.4772360622882843,
"learning_rate": 0.0001422954786953793,
"loss": 1.0171,
"step": 519
},
{
"epoch": 0.20533070088845015,
"grad_norm": 0.5616466999053955,
"learning_rate": 0.0001420934762428335,
"loss": 0.9387,
"step": 520
},
{
"epoch": 0.20572556762092795,
"grad_norm": 0.6362606287002563,
"learning_rate": 0.00014189126482404532,
"loss": 0.7291,
"step": 521
},
{
"epoch": 0.2061204343534057,
"grad_norm": 0.4801354706287384,
"learning_rate": 0.00014168884544286053,
"loss": 0.7701,
"step": 522
},
{
"epoch": 0.2065153010858835,
"grad_norm": 0.45586976408958435,
"learning_rate": 0.0001414862191041574,
"loss": 0.9861,
"step": 523
},
{
"epoch": 0.2069101678183613,
"grad_norm": 0.5662283301353455,
"learning_rate": 0.00014128338681384153,
"loss": 0.9726,
"step": 524
},
{
"epoch": 0.2073050345508391,
"grad_norm": 0.5532451868057251,
"learning_rate": 0.00014108034957884094,
"loss": 1.2107,
"step": 525
},
{
"epoch": 0.2076999012833169,
"grad_norm": 0.5659233331680298,
"learning_rate": 0.0001408771084071012,
"loss": 0.9739,
"step": 526
},
{
"epoch": 0.20809476801579466,
"grad_norm": 0.5252379775047302,
"learning_rate": 0.00014067366430758004,
"loss": 0.9744,
"step": 527
},
{
"epoch": 0.20848963474827245,
"grad_norm": 0.6081037521362305,
"learning_rate": 0.0001404700182902428,
"loss": 1.0524,
"step": 528
},
{
"epoch": 0.20888450148075025,
"grad_norm": 0.6748234033584595,
"learning_rate": 0.0001402661713660571,
"loss": 0.9468,
"step": 529
},
{
"epoch": 0.20927936821322804,
"grad_norm": 0.4868841767311096,
"learning_rate": 0.00014006212454698797,
"loss": 1.0739,
"step": 530
},
{
"epoch": 0.20967423494570583,
"grad_norm": 0.5020860433578491,
"learning_rate": 0.00013985787884599282,
"loss": 0.9586,
"step": 531
},
{
"epoch": 0.2100691016781836,
"grad_norm": 0.579229474067688,
"learning_rate": 0.00013965343527701628,
"loss": 0.8937,
"step": 532
},
{
"epoch": 0.2104639684106614,
"grad_norm": 0.48879000544548035,
"learning_rate": 0.00013944879485498538,
"loss": 0.956,
"step": 533
},
{
"epoch": 0.2108588351431392,
"grad_norm": 0.5132958292961121,
"learning_rate": 0.00013924395859580432,
"loss": 0.8762,
"step": 534
},
{
"epoch": 0.21125370187561698,
"grad_norm": 0.5426986217498779,
"learning_rate": 0.00013903892751634947,
"loss": 1.0018,
"step": 535
},
{
"epoch": 0.21164856860809478,
"grad_norm": 0.5325112342834473,
"learning_rate": 0.0001388337026344645,
"loss": 0.9227,
"step": 536
},
{
"epoch": 0.21204343534057254,
"grad_norm": 0.5148372650146484,
"learning_rate": 0.000138628284968955,
"loss": 1.0617,
"step": 537
},
{
"epoch": 0.21243830207305034,
"grad_norm": 0.4726095199584961,
"learning_rate": 0.00013842267553958371,
"loss": 0.9038,
"step": 538
},
{
"epoch": 0.21283316880552813,
"grad_norm": 0.504017174243927,
"learning_rate": 0.00013821687536706533,
"loss": 1.0946,
"step": 539
},
{
"epoch": 0.21322803553800593,
"grad_norm": 0.483732670545578,
"learning_rate": 0.00013801088547306148,
"loss": 0.7506,
"step": 540
},
{
"epoch": 0.21362290227048372,
"grad_norm": 0.49207037687301636,
"learning_rate": 0.00013780470688017562,
"loss": 0.8905,
"step": 541
},
{
"epoch": 0.2140177690029615,
"grad_norm": 0.5545893311500549,
"learning_rate": 0.00013759834061194794,
"loss": 0.808,
"step": 542
},
{
"epoch": 0.21441263573543928,
"grad_norm": 0.657805323600769,
"learning_rate": 0.00013739178769285032,
"loss": 0.8566,
"step": 543
},
{
"epoch": 0.21480750246791708,
"grad_norm": 0.5868344902992249,
"learning_rate": 0.00013718504914828135,
"loss": 0.9001,
"step": 544
},
{
"epoch": 0.21520236920039487,
"grad_norm": 0.4816092550754547,
"learning_rate": 0.00013697812600456093,
"loss": 0.993,
"step": 545
},
{
"epoch": 0.21559723593287267,
"grad_norm": 0.5755246877670288,
"learning_rate": 0.00013677101928892554,
"loss": 1.1376,
"step": 546
},
{
"epoch": 0.21599210266535043,
"grad_norm": 0.5235198736190796,
"learning_rate": 0.0001365637300295229,
"loss": 1.036,
"step": 547
},
{
"epoch": 0.21638696939782823,
"grad_norm": 0.4904315173625946,
"learning_rate": 0.00013635625925540696,
"loss": 1.0033,
"step": 548
},
{
"epoch": 0.21678183613030602,
"grad_norm": 0.49426376819610596,
"learning_rate": 0.00013614860799653276,
"loss": 1.0455,
"step": 549
},
{
"epoch": 0.21717670286278382,
"grad_norm": 0.5230404734611511,
"learning_rate": 0.00013594077728375128,
"loss": 0.8619,
"step": 550
},
{
"epoch": 0.2175715695952616,
"grad_norm": 0.4252125918865204,
"learning_rate": 0.0001357327681488045,
"loss": 0.552,
"step": 551
},
{
"epoch": 0.21796643632773938,
"grad_norm": 0.6176772117614746,
"learning_rate": 0.00013552458162432003,
"loss": 0.9374,
"step": 552
},
{
"epoch": 0.21836130306021717,
"grad_norm": 0.476182222366333,
"learning_rate": 0.00013531621874380613,
"loss": 0.9189,
"step": 553
},
{
"epoch": 0.21875616979269497,
"grad_norm": 0.5183379054069519,
"learning_rate": 0.00013510768054164653,
"loss": 0.9177,
"step": 554
},
{
"epoch": 0.21915103652517276,
"grad_norm": 0.5251573920249939,
"learning_rate": 0.00013489896805309542,
"loss": 0.8619,
"step": 555
},
{
"epoch": 0.21954590325765055,
"grad_norm": 0.508030116558075,
"learning_rate": 0.00013469008231427207,
"loss": 1.032,
"step": 556
},
{
"epoch": 0.21994076999012832,
"grad_norm": 0.5486171245574951,
"learning_rate": 0.00013448102436215592,
"loss": 0.8481,
"step": 557
},
{
"epoch": 0.22033563672260612,
"grad_norm": 0.5298491716384888,
"learning_rate": 0.00013427179523458127,
"loss": 0.7748,
"step": 558
},
{
"epoch": 0.2207305034550839,
"grad_norm": 0.6214480996131897,
"learning_rate": 0.00013406239597023225,
"loss": 1.0314,
"step": 559
},
{
"epoch": 0.2211253701875617,
"grad_norm": 0.5228508710861206,
"learning_rate": 0.00013385282760863758,
"loss": 0.9916,
"step": 560
},
{
"epoch": 0.2215202369200395,
"grad_norm": 0.5693901777267456,
"learning_rate": 0.00013364309119016538,
"loss": 1.0026,
"step": 561
},
{
"epoch": 0.22191510365251726,
"grad_norm": 0.4827022850513458,
"learning_rate": 0.0001334331877560182,
"loss": 0.8068,
"step": 562
},
{
"epoch": 0.22230997038499506,
"grad_norm": 0.4529504179954529,
"learning_rate": 0.00013322311834822756,
"loss": 0.8318,
"step": 563
},
{
"epoch": 0.22270483711747285,
"grad_norm": 0.5850614309310913,
"learning_rate": 0.00013301288400964902,
"loss": 0.8946,
"step": 564
},
{
"epoch": 0.22309970384995065,
"grad_norm": 0.5779694318771362,
"learning_rate": 0.0001328024857839569,
"loss": 0.7861,
"step": 565
},
{
"epoch": 0.22349457058242844,
"grad_norm": 0.5600647926330566,
"learning_rate": 0.00013259192471563912,
"loss": 0.7921,
"step": 566
},
{
"epoch": 0.2238894373149062,
"grad_norm": 0.5468648076057434,
"learning_rate": 0.00013238120184999195,
"loss": 0.7732,
"step": 567
},
{
"epoch": 0.224284304047384,
"grad_norm": 0.619243323802948,
"learning_rate": 0.00013217031823311488,
"loss": 1.2012,
"step": 568
},
{
"epoch": 0.2246791707798618,
"grad_norm": 0.5478827953338623,
"learning_rate": 0.00013195927491190554,
"loss": 1.0279,
"step": 569
},
{
"epoch": 0.2250740375123396,
"grad_norm": 0.520038366317749,
"learning_rate": 0.00013174807293405428,
"loss": 0.8806,
"step": 570
},
{
"epoch": 0.2254689042448174,
"grad_norm": 0.46674588322639465,
"learning_rate": 0.00013153671334803905,
"loss": 0.7596,
"step": 571
},
{
"epoch": 0.22586377097729515,
"grad_norm": 0.510772705078125,
"learning_rate": 0.0001313251972031203,
"loss": 0.873,
"step": 572
},
{
"epoch": 0.22625863770977295,
"grad_norm": 0.4360644221305847,
"learning_rate": 0.00013111352554933563,
"loss": 0.9622,
"step": 573
},
{
"epoch": 0.22665350444225074,
"grad_norm": 0.4816182255744934,
"learning_rate": 0.00013090169943749476,
"loss": 1.0487,
"step": 574
},
{
"epoch": 0.22704837117472854,
"grad_norm": 0.48583704233169556,
"learning_rate": 0.000130689719919174,
"loss": 0.8041,
"step": 575
},
{
"epoch": 0.22744323790720633,
"grad_norm": 0.5958275198936462,
"learning_rate": 0.00013047758804671136,
"loss": 0.9466,
"step": 576
},
{
"epoch": 0.2278381046396841,
"grad_norm": 0.4736819863319397,
"learning_rate": 0.00013026530487320113,
"loss": 0.9483,
"step": 577
},
{
"epoch": 0.2282329713721619,
"grad_norm": 0.5173781514167786,
"learning_rate": 0.00013005287145248878,
"loss": 0.8428,
"step": 578
},
{
"epoch": 0.22862783810463969,
"grad_norm": 0.44686052203178406,
"learning_rate": 0.00012984028883916552,
"loss": 1.009,
"step": 579
},
{
"epoch": 0.22902270483711748,
"grad_norm": 0.5016387701034546,
"learning_rate": 0.00012962755808856342,
"loss": 1.0226,
"step": 580
},
{
"epoch": 0.22941757156959527,
"grad_norm": 0.6307184100151062,
"learning_rate": 0.0001294146802567497,
"loss": 0.9179,
"step": 581
},
{
"epoch": 0.22981243830207304,
"grad_norm": 0.43451613187789917,
"learning_rate": 0.0001292016564005219,
"loss": 0.9468,
"step": 582
},
{
"epoch": 0.23020730503455084,
"grad_norm": 0.5214070081710815,
"learning_rate": 0.00012898848757740246,
"loss": 0.9226,
"step": 583
},
{
"epoch": 0.23060217176702863,
"grad_norm": 0.6036335825920105,
"learning_rate": 0.00012877517484563344,
"loss": 0.9585,
"step": 584
},
{
"epoch": 0.23099703849950642,
"grad_norm": 0.5829451084136963,
"learning_rate": 0.00012856171926417133,
"loss": 1.2637,
"step": 585
},
{
"epoch": 0.23139190523198422,
"grad_norm": 0.5726488828659058,
"learning_rate": 0.0001283481218926818,
"loss": 0.9223,
"step": 586
},
{
"epoch": 0.23178677196446199,
"grad_norm": 0.501338005065918,
"learning_rate": 0.0001281343837915344,
"loss": 0.8892,
"step": 587
},
{
"epoch": 0.23218163869693978,
"grad_norm": 0.5073863863945007,
"learning_rate": 0.00012792050602179725,
"loss": 0.8541,
"step": 588
},
{
"epoch": 0.23257650542941757,
"grad_norm": 0.6474023461341858,
"learning_rate": 0.00012770648964523194,
"loss": 1.1276,
"step": 589
},
{
"epoch": 0.23297137216189537,
"grad_norm": 0.48500892519950867,
"learning_rate": 0.00012749233572428804,
"loss": 1.0324,
"step": 590
},
{
"epoch": 0.23336623889437316,
"grad_norm": 0.5808454155921936,
"learning_rate": 0.00012727804532209803,
"loss": 1.0817,
"step": 591
},
{
"epoch": 0.23376110562685093,
"grad_norm": 0.49606916308403015,
"learning_rate": 0.0001270636195024719,
"loss": 0.9889,
"step": 592
},
{
"epoch": 0.23415597235932872,
"grad_norm": 0.6760202050209045,
"learning_rate": 0.00012684905932989186,
"loss": 1.1171,
"step": 593
},
{
"epoch": 0.23455083909180652,
"grad_norm": 0.44945040345191956,
"learning_rate": 0.00012663436586950714,
"loss": 0.9308,
"step": 594
},
{
"epoch": 0.2349457058242843,
"grad_norm": 0.6141867637634277,
"learning_rate": 0.00012641954018712863,
"loss": 1.1256,
"step": 595
},
{
"epoch": 0.2353405725567621,
"grad_norm": 0.5368825197219849,
"learning_rate": 0.0001262045833492236,
"loss": 1.0181,
"step": 596
},
{
"epoch": 0.23573543928923987,
"grad_norm": 0.5402265787124634,
"learning_rate": 0.00012598949642291047,
"loss": 1.2135,
"step": 597
},
{
"epoch": 0.23613030602171767,
"grad_norm": 0.5296156406402588,
"learning_rate": 0.00012577428047595344,
"loss": 0.8361,
"step": 598
},
{
"epoch": 0.23652517275419546,
"grad_norm": 0.4741549491882324,
"learning_rate": 0.00012555893657675718,
"loss": 0.9022,
"step": 599
},
{
"epoch": 0.23692003948667326,
"grad_norm": 0.4711611866950989,
"learning_rate": 0.0001253434657943616,
"loss": 1.0165,
"step": 600
},
{
"epoch": 0.23731490621915102,
"grad_norm": 0.4527783989906311,
"learning_rate": 0.00012512786919843648,
"loss": 0.8954,
"step": 601
},
{
"epoch": 0.23770977295162882,
"grad_norm": 0.5278099775314331,
"learning_rate": 0.0001249121478592762,
"loss": 1.0807,
"step": 602
},
{
"epoch": 0.2381046396841066,
"grad_norm": 0.45427364110946655,
"learning_rate": 0.00012469630284779438,
"loss": 0.9407,
"step": 603
},
{
"epoch": 0.2384995064165844,
"grad_norm": 0.5772978663444519,
"learning_rate": 0.00012448033523551865,
"loss": 1.2734,
"step": 604
},
{
"epoch": 0.2388943731490622,
"grad_norm": 0.5643810033798218,
"learning_rate": 0.00012426424609458518,
"loss": 1.0125,
"step": 605
},
{
"epoch": 0.23928923988153997,
"grad_norm": 0.5393880009651184,
"learning_rate": 0.0001240480364977335,
"loss": 0.8613,
"step": 606
},
{
"epoch": 0.23968410661401776,
"grad_norm": 0.49924036860466003,
"learning_rate": 0.0001238317075183011,
"loss": 0.9862,
"step": 607
},
{
"epoch": 0.24007897334649556,
"grad_norm": 0.4382825493812561,
"learning_rate": 0.00012361526023021822,
"loss": 0.8692,
"step": 608
},
{
"epoch": 0.24047384007897335,
"grad_norm": 0.5655290484428406,
"learning_rate": 0.00012339869570800232,
"loss": 0.9122,
"step": 609
},
{
"epoch": 0.24086870681145114,
"grad_norm": 0.5245158076286316,
"learning_rate": 0.00012318201502675285,
"loss": 0.8347,
"step": 610
},
{
"epoch": 0.2412635735439289,
"grad_norm": 0.5116402506828308,
"learning_rate": 0.00012296521926214596,
"loss": 1.0317,
"step": 611
},
{
"epoch": 0.2416584402764067,
"grad_norm": 0.4878910779953003,
"learning_rate": 0.00012274830949042908,
"loss": 0.8947,
"step": 612
},
{
"epoch": 0.2420533070088845,
"grad_norm": 0.5890231132507324,
"learning_rate": 0.00012253128678841568,
"loss": 1.0683,
"step": 613
},
{
"epoch": 0.2424481737413623,
"grad_norm": 0.4757651388645172,
"learning_rate": 0.00012231415223347972,
"loss": 0.9816,
"step": 614
},
{
"epoch": 0.2428430404738401,
"grad_norm": 0.5415259599685669,
"learning_rate": 0.0001220969069035506,
"loss": 0.9583,
"step": 615
},
{
"epoch": 0.24323790720631785,
"grad_norm": 0.5389688014984131,
"learning_rate": 0.0001218795518771075,
"loss": 0.7091,
"step": 616
},
{
"epoch": 0.24363277393879565,
"grad_norm": 0.8125389814376831,
"learning_rate": 0.00012166208823317427,
"loss": 1.0042,
"step": 617
},
{
"epoch": 0.24402764067127344,
"grad_norm": 0.5463893413543701,
"learning_rate": 0.0001214445170513139,
"loss": 1.0685,
"step": 618
},
{
"epoch": 0.24442250740375124,
"grad_norm": 0.4813181161880493,
"learning_rate": 0.0001212268394116233,
"loss": 0.8353,
"step": 619
},
{
"epoch": 0.24481737413622903,
"grad_norm": 0.4448351263999939,
"learning_rate": 0.00012100905639472779,
"loss": 1.0261,
"step": 620
},
{
"epoch": 0.2452122408687068,
"grad_norm": 0.509598433971405,
"learning_rate": 0.00012079116908177593,
"loss": 1.0274,
"step": 621
},
{
"epoch": 0.2456071076011846,
"grad_norm": 0.5108718276023865,
"learning_rate": 0.00012057317855443395,
"loss": 0.8464,
"step": 622
},
{
"epoch": 0.2460019743336624,
"grad_norm": 0.4307985007762909,
"learning_rate": 0.00012035508589488053,
"loss": 0.9281,
"step": 623
},
{
"epoch": 0.24639684106614018,
"grad_norm": 0.5565152764320374,
"learning_rate": 0.00012013689218580132,
"loss": 0.9916,
"step": 624
},
{
"epoch": 0.24679170779861798,
"grad_norm": 0.7218130826950073,
"learning_rate": 0.0001199185985103836,
"loss": 1.0378,
"step": 625
},
{
"epoch": 0.24718657453109574,
"grad_norm": 0.5790725350379944,
"learning_rate": 0.00011970020595231101,
"loss": 1.0886,
"step": 626
},
{
"epoch": 0.24758144126357354,
"grad_norm": 0.5239992141723633,
"learning_rate": 0.000119481715595758,
"loss": 1.0763,
"step": 627
},
{
"epoch": 0.24797630799605133,
"grad_norm": 0.5147425532341003,
"learning_rate": 0.00011926312852538455,
"loss": 1.1421,
"step": 628
},
{
"epoch": 0.24837117472852913,
"grad_norm": 0.6474115252494812,
"learning_rate": 0.0001190444458263307,
"loss": 0.8813,
"step": 629
},
{
"epoch": 0.24876604146100692,
"grad_norm": 0.46953749656677246,
"learning_rate": 0.00011882566858421135,
"loss": 0.6636,
"step": 630
},
{
"epoch": 0.2491609081934847,
"grad_norm": 0.6197345852851868,
"learning_rate": 0.00011860679788511064,
"loss": 0.8935,
"step": 631
},
{
"epoch": 0.24955577492596248,
"grad_norm": 0.5545244812965393,
"learning_rate": 0.00011838783481557664,
"loss": 0.7358,
"step": 632
},
{
"epoch": 0.24995064165844028,
"grad_norm": 0.5150088667869568,
"learning_rate": 0.00011816878046261615,
"loss": 0.8935,
"step": 633
},
{
"epoch": 0.25034550839091807,
"grad_norm": 0.5777970552444458,
"learning_rate": 0.00011794963591368893,
"loss": 0.9984,
"step": 634
},
{
"epoch": 0.25074037512339586,
"grad_norm": 0.6271523237228394,
"learning_rate": 0.00011773040225670256,
"loss": 1.0425,
"step": 635
},
{
"epoch": 0.25113524185587366,
"grad_norm": 0.5507854223251343,
"learning_rate": 0.00011751108058000706,
"loss": 1.017,
"step": 636
},
{
"epoch": 0.25153010858835145,
"grad_norm": 0.5012635588645935,
"learning_rate": 0.00011729167197238935,
"loss": 1.0421,
"step": 637
},
{
"epoch": 0.25192497532082925,
"grad_norm": 0.5927073955535889,
"learning_rate": 0.0001170721775230679,
"loss": 0.8634,
"step": 638
},
{
"epoch": 0.252319842053307,
"grad_norm": 0.5467817783355713,
"learning_rate": 0.0001168525983216873,
"loss": 0.828,
"step": 639
},
{
"epoch": 0.2527147087857848,
"grad_norm": 0.5477814078330994,
"learning_rate": 0.00011663293545831302,
"loss": 0.8878,
"step": 640
},
{
"epoch": 0.2531095755182626,
"grad_norm": 0.5831199884414673,
"learning_rate": 0.00011641319002342568,
"loss": 0.9146,
"step": 641
},
{
"epoch": 0.25350444225074037,
"grad_norm": 0.4861442446708679,
"learning_rate": 0.00011619336310791586,
"loss": 0.9527,
"step": 642
},
{
"epoch": 0.25389930898321816,
"grad_norm": 0.4937233030796051,
"learning_rate": 0.00011597345580307875,
"loss": 0.8631,
"step": 643
},
{
"epoch": 0.25429417571569596,
"grad_norm": 0.5312657952308655,
"learning_rate": 0.00011575346920060846,
"loss": 1.0702,
"step": 644
},
{
"epoch": 0.25468904244817375,
"grad_norm": 0.5473558306694031,
"learning_rate": 0.00011553340439259286,
"loss": 0.8867,
"step": 645
},
{
"epoch": 0.25508390918065155,
"grad_norm": 0.5094785094261169,
"learning_rate": 0.00011531326247150803,
"loss": 0.9356,
"step": 646
},
{
"epoch": 0.25547877591312934,
"grad_norm": 0.4709779620170593,
"learning_rate": 0.00011509304453021288,
"loss": 0.8547,
"step": 647
},
{
"epoch": 0.2558736426456071,
"grad_norm": 0.5478555560112,
"learning_rate": 0.00011487275166194367,
"loss": 1.013,
"step": 648
},
{
"epoch": 0.2562685093780849,
"grad_norm": 0.5569909811019897,
"learning_rate": 0.00011465238496030868,
"loss": 0.9655,
"step": 649
},
{
"epoch": 0.25666337611056267,
"grad_norm": 0.6182945370674133,
"learning_rate": 0.00011443194551928266,
"loss": 1.0553,
"step": 650
},
{
"epoch": 0.25705824284304046,
"grad_norm": 0.49789944291114807,
"learning_rate": 0.00011421143443320155,
"loss": 1.0104,
"step": 651
},
{
"epoch": 0.25745310957551826,
"grad_norm": 0.5329926013946533,
"learning_rate": 0.00011399085279675687,
"loss": 1.1935,
"step": 652
},
{
"epoch": 0.25784797630799605,
"grad_norm": 0.6216697692871094,
"learning_rate": 0.0001137702017049904,
"loss": 0.9502,
"step": 653
},
{
"epoch": 0.25824284304047385,
"grad_norm": 0.46455976366996765,
"learning_rate": 0.00011354948225328877,
"loss": 1.0217,
"step": 654
},
{
"epoch": 0.25863770977295164,
"grad_norm": 0.5468802452087402,
"learning_rate": 0.0001133286955373779,
"loss": 0.9669,
"step": 655
},
{
"epoch": 0.25903257650542943,
"grad_norm": 0.6737267971038818,
"learning_rate": 0.00011310784265331769,
"loss": 0.828,
"step": 656
},
{
"epoch": 0.25942744323790723,
"grad_norm": 0.5070087909698486,
"learning_rate": 0.00011288692469749649,
"loss": 0.9463,
"step": 657
},
{
"epoch": 0.25982230997038497,
"grad_norm": 0.5430976152420044,
"learning_rate": 0.0001126659427666257,
"loss": 0.9986,
"step": 658
},
{
"epoch": 0.26021717670286276,
"grad_norm": 0.4886017143726349,
"learning_rate": 0.00011244489795773432,
"loss": 1.0702,
"step": 659
},
{
"epoch": 0.26061204343534056,
"grad_norm": 0.4727013111114502,
"learning_rate": 0.00011222379136816345,
"loss": 0.9179,
"step": 660
},
{
"epoch": 0.26100691016781835,
"grad_norm": 0.48751479387283325,
"learning_rate": 0.00011200262409556097,
"loss": 0.9176,
"step": 661
},
{
"epoch": 0.26140177690029615,
"grad_norm": 0.511191189289093,
"learning_rate": 0.00011178139723787597,
"loss": 1.0286,
"step": 662
},
{
"epoch": 0.26179664363277394,
"grad_norm": 0.48295167088508606,
"learning_rate": 0.00011156011189335332,
"loss": 0.9306,
"step": 663
},
{
"epoch": 0.26219151036525173,
"grad_norm": 0.4702792167663574,
"learning_rate": 0.00011133876916052821,
"loss": 0.9666,
"step": 664
},
{
"epoch": 0.26258637709772953,
"grad_norm": 0.5391889810562134,
"learning_rate": 0.00011111737013822088,
"loss": 0.9043,
"step": 665
},
{
"epoch": 0.2629812438302073,
"grad_norm": 0.6235263347625732,
"learning_rate": 0.00011089591592553082,
"loss": 1.0091,
"step": 666
},
{
"epoch": 0.2633761105626851,
"grad_norm": 0.5476643443107605,
"learning_rate": 0.00011067440762183164,
"loss": 0.9399,
"step": 667
},
{
"epoch": 0.26377097729516286,
"grad_norm": 0.42856502532958984,
"learning_rate": 0.00011045284632676536,
"loss": 0.8856,
"step": 668
},
{
"epoch": 0.26416584402764065,
"grad_norm": 0.6507248878479004,
"learning_rate": 0.00011023123314023717,
"loss": 1.0047,
"step": 669
},
{
"epoch": 0.26456071076011844,
"grad_norm": 0.5612763166427612,
"learning_rate": 0.00011000956916240985,
"loss": 0.9552,
"step": 670
},
{
"epoch": 0.26495557749259624,
"grad_norm": 0.6156942844390869,
"learning_rate": 0.00010978785549369823,
"loss": 0.7834,
"step": 671
},
{
"epoch": 0.26535044422507403,
"grad_norm": 0.5807977914810181,
"learning_rate": 0.00010956609323476399,
"loss": 1.0304,
"step": 672
},
{
"epoch": 0.26574531095755183,
"grad_norm": 0.4886478781700134,
"learning_rate": 0.00010934428348650986,
"loss": 1.0725,
"step": 673
},
{
"epoch": 0.2661401776900296,
"grad_norm": 0.6015396118164062,
"learning_rate": 0.00010912242735007441,
"loss": 0.9459,
"step": 674
},
{
"epoch": 0.2665350444225074,
"grad_norm": 0.5361436605453491,
"learning_rate": 0.0001089005259268265,
"loss": 1.0034,
"step": 675
},
{
"epoch": 0.2669299111549852,
"grad_norm": 0.5702396035194397,
"learning_rate": 0.00010867858031835975,
"loss": 1.1566,
"step": 676
},
{
"epoch": 0.267324777887463,
"grad_norm": 0.5034629702568054,
"learning_rate": 0.00010845659162648723,
"loss": 0.9551,
"step": 677
},
{
"epoch": 0.26771964461994074,
"grad_norm": 0.49666622281074524,
"learning_rate": 0.00010823456095323579,
"loss": 0.9059,
"step": 678
},
{
"epoch": 0.26811451135241854,
"grad_norm": 0.5842538475990295,
"learning_rate": 0.00010801248940084074,
"loss": 0.8893,
"step": 679
},
{
"epoch": 0.26850937808489633,
"grad_norm": 0.5735609531402588,
"learning_rate": 0.00010779037807174033,
"loss": 0.983,
"step": 680
},
{
"epoch": 0.2689042448173741,
"grad_norm": 0.5265182256698608,
"learning_rate": 0.00010756822806857028,
"loss": 0.8681,
"step": 681
},
{
"epoch": 0.2692991115498519,
"grad_norm": 0.5432460308074951,
"learning_rate": 0.00010734604049415822,
"loss": 0.7613,
"step": 682
},
{
"epoch": 0.2696939782823297,
"grad_norm": 0.5309749245643616,
"learning_rate": 0.00010712381645151844,
"loss": 1.1094,
"step": 683
},
{
"epoch": 0.2700888450148075,
"grad_norm": 0.588448703289032,
"learning_rate": 0.00010690155704384615,
"loss": 1.082,
"step": 684
},
{
"epoch": 0.2704837117472853,
"grad_norm": 0.4991750717163086,
"learning_rate": 0.00010667926337451217,
"loss": 0.8744,
"step": 685
},
{
"epoch": 0.2708785784797631,
"grad_norm": 0.4775620698928833,
"learning_rate": 0.0001064569365470574,
"loss": 1.025,
"step": 686
},
{
"epoch": 0.2712734452122409,
"grad_norm": 0.5142999291419983,
"learning_rate": 0.00010623457766518736,
"loss": 0.9704,
"step": 687
},
{
"epoch": 0.27166831194471863,
"grad_norm": 0.48825737833976746,
"learning_rate": 0.00010601218783276672,
"loss": 1.122,
"step": 688
},
{
"epoch": 0.2720631786771964,
"grad_norm": 0.5517759919166565,
"learning_rate": 0.00010578976815381372,
"loss": 0.8859,
"step": 689
},
{
"epoch": 0.2724580454096742,
"grad_norm": 0.5661524534225464,
"learning_rate": 0.00010556731973249485,
"loss": 0.89,
"step": 690
},
{
"epoch": 0.272852912142152,
"grad_norm": 0.5802194476127625,
"learning_rate": 0.00010534484367311923,
"loss": 0.8048,
"step": 691
},
{
"epoch": 0.2732477788746298,
"grad_norm": 0.5603414177894592,
"learning_rate": 0.00010512234108013319,
"loss": 1.0869,
"step": 692
},
{
"epoch": 0.2736426456071076,
"grad_norm": 0.5277503728866577,
"learning_rate": 0.00010489981305811487,
"loss": 0.7287,
"step": 693
},
{
"epoch": 0.2740375123395854,
"grad_norm": 0.6045374870300293,
"learning_rate": 0.00010467726071176853,
"loss": 0.9417,
"step": 694
},
{
"epoch": 0.2744323790720632,
"grad_norm": 0.6021410822868347,
"learning_rate": 0.00010445468514591925,
"loss": 1.1399,
"step": 695
},
{
"epoch": 0.274827245804541,
"grad_norm": 0.5544920563697815,
"learning_rate": 0.00010423208746550732,
"loss": 0.7279,
"step": 696
},
{
"epoch": 0.2752221125370188,
"grad_norm": 0.5612072348594666,
"learning_rate": 0.00010400946877558293,
"loss": 0.8715,
"step": 697
},
{
"epoch": 0.2756169792694965,
"grad_norm": 0.5820972323417664,
"learning_rate": 0.00010378683018130047,
"loss": 0.9456,
"step": 698
},
{
"epoch": 0.2760118460019743,
"grad_norm": 0.43969717621803284,
"learning_rate": 0.0001035641727879131,
"loss": 0.8192,
"step": 699
},
{
"epoch": 0.2764067127344521,
"grad_norm": 0.6528908610343933,
"learning_rate": 0.00010334149770076747,
"loss": 0.9742,
"step": 700
},
{
"epoch": 0.2768015794669299,
"grad_norm": 0.4668010473251343,
"learning_rate": 0.00010311880602529794,
"loss": 0.9471,
"step": 701
},
{
"epoch": 0.2771964461994077,
"grad_norm": 0.5474753379821777,
"learning_rate": 0.0001028960988670212,
"loss": 0.9224,
"step": 702
},
{
"epoch": 0.2775913129318855,
"grad_norm": 0.5672122240066528,
"learning_rate": 0.00010267337733153089,
"loss": 0.8441,
"step": 703
},
{
"epoch": 0.2779861796643633,
"grad_norm": 0.5570011138916016,
"learning_rate": 0.00010245064252449201,
"loss": 0.9897,
"step": 704
},
{
"epoch": 0.2783810463968411,
"grad_norm": 0.5113812685012817,
"learning_rate": 0.0001022278955516354,
"loss": 1.0066,
"step": 705
},
{
"epoch": 0.2787759131293189,
"grad_norm": 0.48175865411758423,
"learning_rate": 0.00010200513751875227,
"loss": 0.9212,
"step": 706
},
{
"epoch": 0.27917077986179667,
"grad_norm": 0.5643198490142822,
"learning_rate": 0.00010178236953168885,
"loss": 1.2371,
"step": 707
},
{
"epoch": 0.2795656465942744,
"grad_norm": 0.5115451216697693,
"learning_rate": 0.00010155959269634068,
"loss": 0.9701,
"step": 708
},
{
"epoch": 0.2799605133267522,
"grad_norm": 0.5253522396087646,
"learning_rate": 0.00010133680811864727,
"loss": 1.0685,
"step": 709
},
{
"epoch": 0.28035538005923,
"grad_norm": 0.7160750031471252,
"learning_rate": 0.00010111401690458654,
"loss": 1.0837,
"step": 710
},
{
"epoch": 0.28035538005923,
"eval_loss": 0.9457208514213562,
"eval_runtime": 61.5761,
"eval_samples_per_second": 17.328,
"eval_steps_per_second": 8.672,
"step": 710
}
],
"logging_steps": 1,
"max_steps": 1420,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 355,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.602799288203346e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}