{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.28035538005923, "eval_steps": 355, "global_step": 710, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003948667324777887, "grad_norm": 0.5502959489822388, "learning_rate": 2e-05, "loss": 1.0032, "step": 1 }, { "epoch": 0.0003948667324777887, "eval_loss": 1.3028720617294312, "eval_runtime": 63.423, "eval_samples_per_second": 16.824, "eval_steps_per_second": 8.42, "step": 1 }, { "epoch": 0.0007897334649555774, "grad_norm": 0.5348024368286133, "learning_rate": 4e-05, "loss": 1.2158, "step": 2 }, { "epoch": 0.0011846001974333662, "grad_norm": 0.5212297439575195, "learning_rate": 6e-05, "loss": 1.2107, "step": 3 }, { "epoch": 0.0015794669299111549, "grad_norm": 0.5010500550270081, "learning_rate": 8e-05, "loss": 1.7374, "step": 4 }, { "epoch": 0.0019743336623889436, "grad_norm": 0.566511869430542, "learning_rate": 0.0001, "loss": 1.263, "step": 5 }, { "epoch": 0.0023692003948667323, "grad_norm": 0.558596134185791, "learning_rate": 0.00012, "loss": 1.1253, "step": 6 }, { "epoch": 0.002764067127344521, "grad_norm": 0.525932788848877, "learning_rate": 0.00014, "loss": 1.155, "step": 7 }, { "epoch": 0.0031589338598223098, "grad_norm": 0.5322596430778503, "learning_rate": 0.00016, "loss": 1.173, "step": 8 }, { "epoch": 0.003553800592300099, "grad_norm": 0.5490784049034119, "learning_rate": 0.00018, "loss": 1.1944, "step": 9 }, { "epoch": 0.003948667324777887, "grad_norm": 0.5888460278511047, "learning_rate": 0.0002, "loss": 1.1346, "step": 10 }, { "epoch": 0.004343534057255676, "grad_norm": 0.5463979840278625, "learning_rate": 0.0001999997517831015, "loss": 1.2648, "step": 11 }, { "epoch": 0.004738400789733465, "grad_norm": 0.6944459676742554, "learning_rate": 0.00019999900713363826, "loss": 1.1832, "step": 12 }, { "epoch": 0.005133267522211254, "grad_norm": 0.675365149974823, "learning_rate": 0.0001999977660553069, "loss": 1.1111, "step": 13 }, { "epoch": 0.005528134254689042, "grad_norm": 0.6740691065788269, "learning_rate": 0.00019999602855426865, "loss": 1.0508, "step": 14 }, { "epoch": 0.005923000987166831, "grad_norm": 0.6092358827590942, "learning_rate": 0.00019999379463914898, "loss": 1.048, "step": 15 }, { "epoch": 0.0063178677196446195, "grad_norm": 0.5579732656478882, "learning_rate": 0.0001999910643210378, "loss": 0.9692, "step": 16 }, { "epoch": 0.006712734452122409, "grad_norm": 0.5955824851989746, "learning_rate": 0.0001999878376134894, "loss": 1.3142, "step": 17 }, { "epoch": 0.007107601184600198, "grad_norm": 0.49893510341644287, "learning_rate": 0.00019998411453252217, "loss": 1.211, "step": 18 }, { "epoch": 0.007502467917077986, "grad_norm": 0.6234785914421082, "learning_rate": 0.0001999798950966188, "loss": 1.0711, "step": 19 }, { "epoch": 0.007897334649555774, "grad_norm": 0.5721188187599182, "learning_rate": 0.0001999751793267259, "loss": 0.9827, "step": 20 }, { "epoch": 0.008292201382033564, "grad_norm": 0.566277801990509, "learning_rate": 0.00019996996724625426, "loss": 0.9768, "step": 21 }, { "epoch": 0.008687068114511353, "grad_norm": 0.5721827745437622, "learning_rate": 0.0001999642588810784, "loss": 1.0156, "step": 22 }, { "epoch": 0.009081934846989142, "grad_norm": 0.4582698941230774, "learning_rate": 0.00019995805425953648, "loss": 1.0947, "step": 23 }, { "epoch": 0.00947680157946693, "grad_norm": 0.5519172549247742, "learning_rate": 0.00019995135341243042, "loss": 0.9691, "step": 24 }, { "epoch": 0.009871668311944718, "grad_norm": 0.5251504182815552, "learning_rate": 0.00019994415637302547, "loss": 1.1051, "step": 25 }, { "epoch": 0.010266535044422508, "grad_norm": 0.49342384934425354, "learning_rate": 0.00019993646317705016, "loss": 1.1886, "step": 26 }, { "epoch": 0.010661401776900297, "grad_norm": 0.458482027053833, "learning_rate": 0.0001999282738626961, "loss": 0.8504, "step": 27 }, { "epoch": 0.011056268509378084, "grad_norm": 0.5016602873802185, "learning_rate": 0.00019991958847061784, "loss": 0.8907, "step": 28 }, { "epoch": 0.011451135241855873, "grad_norm": 0.5376136302947998, "learning_rate": 0.0001999104070439326, "loss": 1.1895, "step": 29 }, { "epoch": 0.011846001974333662, "grad_norm": 0.5124022364616394, "learning_rate": 0.00019990072962822007, "loss": 1.2014, "step": 30 }, { "epoch": 0.012240868706811452, "grad_norm": 0.4343184232711792, "learning_rate": 0.0001998905562715222, "loss": 1.1406, "step": 31 }, { "epoch": 0.012635735439289239, "grad_norm": 0.5123298764228821, "learning_rate": 0.00019987988702434303, "loss": 1.1627, "step": 32 }, { "epoch": 0.013030602171767028, "grad_norm": 0.48822933435440063, "learning_rate": 0.00019986872193964827, "loss": 1.0454, "step": 33 }, { "epoch": 0.013425468904244817, "grad_norm": 0.501602053642273, "learning_rate": 0.00019985706107286514, "loss": 1.144, "step": 34 }, { "epoch": 0.013820335636722606, "grad_norm": 0.5616022348403931, "learning_rate": 0.00019984490448188218, "loss": 1.1829, "step": 35 }, { "epoch": 0.014215202369200396, "grad_norm": 0.5326923727989197, "learning_rate": 0.00019983225222704878, "loss": 0.9466, "step": 36 }, { "epoch": 0.014610069101678183, "grad_norm": 0.3960028886795044, "learning_rate": 0.000199819104371175, "loss": 0.8513, "step": 37 }, { "epoch": 0.015004935834155972, "grad_norm": 0.5138500928878784, "learning_rate": 0.00019980546097953132, "loss": 1.02, "step": 38 }, { "epoch": 0.015399802566633761, "grad_norm": 0.47606444358825684, "learning_rate": 0.00019979132211984805, "loss": 1.0354, "step": 39 }, { "epoch": 0.01579466929911155, "grad_norm": 0.4528456926345825, "learning_rate": 0.00019977668786231534, "loss": 0.8414, "step": 40 }, { "epoch": 0.01618953603158934, "grad_norm": 0.5217841863632202, "learning_rate": 0.00019976155827958252, "loss": 1.2372, "step": 41 }, { "epoch": 0.016584402764067127, "grad_norm": 0.511803925037384, "learning_rate": 0.000199745933446758, "loss": 0.8319, "step": 42 }, { "epoch": 0.016979269496544915, "grad_norm": 0.5257598757743835, "learning_rate": 0.00019972981344140874, "loss": 0.9624, "step": 43 }, { "epoch": 0.017374136229022705, "grad_norm": 0.5112103819847107, "learning_rate": 0.00019971319834355983, "loss": 1.123, "step": 44 }, { "epoch": 0.017769002961500493, "grad_norm": 0.5637938976287842, "learning_rate": 0.00019969608823569433, "loss": 1.2229, "step": 45 }, { "epoch": 0.018163869693978284, "grad_norm": 0.46877309679985046, "learning_rate": 0.0001996784832027525, "loss": 0.9595, "step": 46 }, { "epoch": 0.01855873642645607, "grad_norm": 0.45058727264404297, "learning_rate": 0.00019966038333213177, "loss": 1.0821, "step": 47 }, { "epoch": 0.01895360315893386, "grad_norm": 0.4382037818431854, "learning_rate": 0.00019964178871368594, "loss": 0.9997, "step": 48 }, { "epoch": 0.01934846989141165, "grad_norm": 0.44654494524002075, "learning_rate": 0.000199622699439725, "loss": 1.0338, "step": 49 }, { "epoch": 0.019743336623889437, "grad_norm": 0.5067604184150696, "learning_rate": 0.00019960311560501454, "loss": 1.1121, "step": 50 }, { "epoch": 0.020138203356367228, "grad_norm": 0.4290701746940613, "learning_rate": 0.0001995830373067754, "loss": 1.0287, "step": 51 }, { "epoch": 0.020533070088845015, "grad_norm": 0.5244402885437012, "learning_rate": 0.00019956246464468294, "loss": 1.1912, "step": 52 }, { "epoch": 0.020927936821322803, "grad_norm": 0.4742845892906189, "learning_rate": 0.0001995413977208669, "loss": 1.0139, "step": 53 }, { "epoch": 0.021322803553800593, "grad_norm": 0.47367361187934875, "learning_rate": 0.00019951983663991056, "loss": 1.0935, "step": 54 }, { "epoch": 0.02171767028627838, "grad_norm": 0.4405689835548401, "learning_rate": 0.00019949778150885042, "loss": 1.2933, "step": 55 }, { "epoch": 0.02211253701875617, "grad_norm": 0.39893534779548645, "learning_rate": 0.0001994752324371756, "loss": 0.83, "step": 56 }, { "epoch": 0.02250740375123396, "grad_norm": 0.4917256832122803, "learning_rate": 0.00019945218953682734, "loss": 1.0927, "step": 57 }, { "epoch": 0.022902270483711747, "grad_norm": 0.453832745552063, "learning_rate": 0.00019942865292219838, "loss": 0.964, "step": 58 }, { "epoch": 0.023297137216189538, "grad_norm": 0.4970617890357971, "learning_rate": 0.00019940462271013238, "loss": 1.0414, "step": 59 }, { "epoch": 0.023692003948667325, "grad_norm": 0.5314046144485474, "learning_rate": 0.0001993800990199235, "loss": 1.2126, "step": 60 }, { "epoch": 0.024086870681145112, "grad_norm": 0.5307350754737854, "learning_rate": 0.00019935508197331555, "loss": 1.0771, "step": 61 }, { "epoch": 0.024481737413622903, "grad_norm": 0.495712548494339, "learning_rate": 0.0001993295716945017, "loss": 1.0686, "step": 62 }, { "epoch": 0.02487660414610069, "grad_norm": 0.5226410627365112, "learning_rate": 0.00019930356831012353, "loss": 1.0349, "step": 63 }, { "epoch": 0.025271470878578478, "grad_norm": 0.4592258334159851, "learning_rate": 0.00019927707194927066, "loss": 0.9929, "step": 64 }, { "epoch": 0.02566633761105627, "grad_norm": 0.486198753118515, "learning_rate": 0.00019925008274347995, "loss": 1.0707, "step": 65 }, { "epoch": 0.026061204343534056, "grad_norm": 0.5763838291168213, "learning_rate": 0.00019922260082673497, "loss": 0.8451, "step": 66 }, { "epoch": 0.026456071076011847, "grad_norm": 0.5000368356704712, "learning_rate": 0.00019919462633546519, "loss": 0.8953, "step": 67 }, { "epoch": 0.026850937808489635, "grad_norm": 0.5193626284599304, "learning_rate": 0.0001991661594085455, "loss": 0.8959, "step": 68 }, { "epoch": 0.027245804540967422, "grad_norm": 0.5146979689598083, "learning_rate": 0.00019913720018729532, "loss": 1.1425, "step": 69 }, { "epoch": 0.027640671273445213, "grad_norm": 0.4941859543323517, "learning_rate": 0.000199107748815478, "loss": 1.1127, "step": 70 }, { "epoch": 0.028035538005923, "grad_norm": 0.5475146770477295, "learning_rate": 0.00019907780543930014, "loss": 1.0152, "step": 71 }, { "epoch": 0.02843040473840079, "grad_norm": 0.6136332154273987, "learning_rate": 0.00019904737020741075, "loss": 1.1167, "step": 72 }, { "epoch": 0.02882527147087858, "grad_norm": 0.531680703163147, "learning_rate": 0.00019901644327090064, "loss": 1.0022, "step": 73 }, { "epoch": 0.029220138203356366, "grad_norm": 0.5442494750022888, "learning_rate": 0.00019898502478330152, "loss": 0.8863, "step": 74 }, { "epoch": 0.029615004935834157, "grad_norm": 0.4275268018245697, "learning_rate": 0.00019895311490058542, "loss": 0.845, "step": 75 }, { "epoch": 0.030009871668311944, "grad_norm": 0.5767403841018677, "learning_rate": 0.00019892071378116376, "loss": 1.13, "step": 76 }, { "epoch": 0.030404738400789732, "grad_norm": 0.5110602974891663, "learning_rate": 0.00019888782158588667, "loss": 1.0784, "step": 77 }, { "epoch": 0.030799605133267523, "grad_norm": 0.5388869643211365, "learning_rate": 0.00019885443847804211, "loss": 1.0197, "step": 78 }, { "epoch": 0.03119447186574531, "grad_norm": 0.5769171118736267, "learning_rate": 0.00019882056462335512, "loss": 0.8093, "step": 79 }, { "epoch": 0.0315893385982231, "grad_norm": 0.3854546546936035, "learning_rate": 0.00019878620018998696, "loss": 0.7723, "step": 80 }, { "epoch": 0.031984205330700885, "grad_norm": 0.4594631791114807, "learning_rate": 0.00019875134534853427, "loss": 0.978, "step": 81 }, { "epoch": 0.03237907206317868, "grad_norm": 0.5577380061149597, "learning_rate": 0.0001987160002720283, "loss": 1.0064, "step": 82 }, { "epoch": 0.03277393879565647, "grad_norm": 0.4823514223098755, "learning_rate": 0.00019868016513593391, "loss": 0.9228, "step": 83 }, { "epoch": 0.033168805528134254, "grad_norm": 0.5669511556625366, "learning_rate": 0.0001986438401181489, "loss": 1.2223, "step": 84 }, { "epoch": 0.03356367226061204, "grad_norm": 0.48681461811065674, "learning_rate": 0.00019860702539900287, "loss": 1.0993, "step": 85 }, { "epoch": 0.03395853899308983, "grad_norm": 0.47141095995903015, "learning_rate": 0.00019856972116125653, "loss": 1.1599, "step": 86 }, { "epoch": 0.03435340572556762, "grad_norm": 0.5382753610610962, "learning_rate": 0.00019853192759010076, "loss": 1.1186, "step": 87 }, { "epoch": 0.03474827245804541, "grad_norm": 0.592026948928833, "learning_rate": 0.00019849364487315558, "loss": 1.0947, "step": 88 }, { "epoch": 0.0351431391905232, "grad_norm": 0.42034783959388733, "learning_rate": 0.00019845487320046935, "loss": 0.9649, "step": 89 }, { "epoch": 0.035538005923000986, "grad_norm": 0.4590117633342743, "learning_rate": 0.0001984156127645178, "loss": 0.997, "step": 90 }, { "epoch": 0.03593287265547877, "grad_norm": 0.5288587212562561, "learning_rate": 0.00019837586376020294, "loss": 1.2129, "step": 91 }, { "epoch": 0.03632773938795657, "grad_norm": 0.4397427439689636, "learning_rate": 0.0001983356263848523, "loss": 0.9819, "step": 92 }, { "epoch": 0.036722606120434355, "grad_norm": 0.4406636357307434, "learning_rate": 0.00019829490083821778, "loss": 1.074, "step": 93 }, { "epoch": 0.03711747285291214, "grad_norm": 0.4988841116428375, "learning_rate": 0.0001982536873224748, "loss": 0.9614, "step": 94 }, { "epoch": 0.03751233958538993, "grad_norm": 0.4320489466190338, "learning_rate": 0.00019821198604222113, "loss": 0.9872, "step": 95 }, { "epoch": 0.03790720631786772, "grad_norm": 0.4227694272994995, "learning_rate": 0.0001981697972044761, "loss": 1.0866, "step": 96 }, { "epoch": 0.03830207305034551, "grad_norm": 0.449147492647171, "learning_rate": 0.00019812712101867922, "loss": 1.0443, "step": 97 }, { "epoch": 0.0386969397828233, "grad_norm": 0.7097704410552979, "learning_rate": 0.00019808395769668963, "loss": 0.9615, "step": 98 }, { "epoch": 0.039091806515301086, "grad_norm": 0.4379878640174866, "learning_rate": 0.0001980403074527846, "loss": 1.0132, "step": 99 }, { "epoch": 0.039486673247778874, "grad_norm": 0.47281649708747864, "learning_rate": 0.0001979961705036587, "loss": 0.9845, "step": 100 }, { "epoch": 0.03988153998025666, "grad_norm": 0.424258291721344, "learning_rate": 0.00019795154706842266, "loss": 1.0192, "step": 101 }, { "epoch": 0.040276406712734455, "grad_norm": 0.5341431498527527, "learning_rate": 0.00019790643736860227, "loss": 0.9863, "step": 102 }, { "epoch": 0.04067127344521224, "grad_norm": 0.5369569659233093, "learning_rate": 0.00019786084162813733, "loss": 1.0572, "step": 103 }, { "epoch": 0.04106614017769003, "grad_norm": 0.5228739380836487, "learning_rate": 0.00019781476007338058, "loss": 0.8181, "step": 104 }, { "epoch": 0.04146100691016782, "grad_norm": 0.48687776923179626, "learning_rate": 0.00019776819293309633, "loss": 1.0303, "step": 105 }, { "epoch": 0.041855873642645605, "grad_norm": 0.6160632967948914, "learning_rate": 0.00019772114043845965, "loss": 0.9719, "step": 106 }, { "epoch": 0.04225074037512339, "grad_norm": 0.42757055163383484, "learning_rate": 0.00019767360282305508, "loss": 0.8688, "step": 107 }, { "epoch": 0.04264560710760119, "grad_norm": 0.5323183536529541, "learning_rate": 0.0001976255803228753, "loss": 1.0611, "step": 108 }, { "epoch": 0.043040473840078974, "grad_norm": 0.45643237233161926, "learning_rate": 0.00019757707317632028, "loss": 0.9245, "step": 109 }, { "epoch": 0.04343534057255676, "grad_norm": 0.51936936378479, "learning_rate": 0.0001975280816241959, "loss": 1.1352, "step": 110 }, { "epoch": 0.04383020730503455, "grad_norm": 0.49822020530700684, "learning_rate": 0.0001974786059097128, "loss": 0.9481, "step": 111 }, { "epoch": 0.04422507403751234, "grad_norm": 0.5198648571968079, "learning_rate": 0.0001974286462784851, "loss": 1.1848, "step": 112 }, { "epoch": 0.04461994076999013, "grad_norm": 0.9258118271827698, "learning_rate": 0.0001973782029785293, "loss": 1.1156, "step": 113 }, { "epoch": 0.04501480750246792, "grad_norm": 0.5064953565597534, "learning_rate": 0.00019732727626026305, "loss": 0.9965, "step": 114 }, { "epoch": 0.045409674234945706, "grad_norm": 0.4941990375518799, "learning_rate": 0.00019727586637650373, "loss": 1.1318, "step": 115 }, { "epoch": 0.04580454096742349, "grad_norm": 0.61434006690979, "learning_rate": 0.0001972239735824674, "loss": 1.0637, "step": 116 }, { "epoch": 0.04619940769990128, "grad_norm": 0.53554368019104, "learning_rate": 0.0001971715981357674, "loss": 0.8824, "step": 117 }, { "epoch": 0.046594274432379075, "grad_norm": 0.505577802658081, "learning_rate": 0.0001971187402964132, "loss": 0.9145, "step": 118 }, { "epoch": 0.04698914116485686, "grad_norm": 0.557715654373169, "learning_rate": 0.00019706540032680893, "loss": 0.9495, "step": 119 }, { "epoch": 0.04738400789733465, "grad_norm": 0.5071070194244385, "learning_rate": 0.00019701157849175228, "loss": 0.9492, "step": 120 }, { "epoch": 0.04777887462981244, "grad_norm": 0.4534873068332672, "learning_rate": 0.00019695727505843297, "loss": 1.1968, "step": 121 }, { "epoch": 0.048173741362290225, "grad_norm": 0.46414080262184143, "learning_rate": 0.00019690249029643162, "loss": 0.8883, "step": 122 }, { "epoch": 0.04856860809476802, "grad_norm": 0.43917688727378845, "learning_rate": 0.00019684722447771834, "loss": 1.0213, "step": 123 }, { "epoch": 0.048963474827245806, "grad_norm": 0.46896979212760925, "learning_rate": 0.00019679147787665126, "loss": 0.8508, "step": 124 }, { "epoch": 0.049358341559723594, "grad_norm": 0.457086443901062, "learning_rate": 0.0001967352507699754, "loss": 1.1217, "step": 125 }, { "epoch": 0.04975320829220138, "grad_norm": 0.44028928875923157, "learning_rate": 0.0001966785434368211, "loss": 1.0044, "step": 126 }, { "epoch": 0.05014807502467917, "grad_norm": 0.47712892293930054, "learning_rate": 0.00019662135615870275, "loss": 0.993, "step": 127 }, { "epoch": 0.050542941757156956, "grad_norm": 0.5953882932662964, "learning_rate": 0.00019656368921951734, "loss": 1.2092, "step": 128 }, { "epoch": 0.05093780848963475, "grad_norm": 0.4725169837474823, "learning_rate": 0.00019650554290554298, "loss": 0.8518, "step": 129 }, { "epoch": 0.05133267522211254, "grad_norm": 0.4115935266017914, "learning_rate": 0.00019644691750543767, "loss": 0.86, "step": 130 }, { "epoch": 0.051727541954590325, "grad_norm": 0.5736876726150513, "learning_rate": 0.0001963878133102377, "loss": 0.8093, "step": 131 }, { "epoch": 0.05212240868706811, "grad_norm": 0.4820341169834137, "learning_rate": 0.00019632823061335627, "loss": 1.1891, "step": 132 }, { "epoch": 0.0525172754195459, "grad_norm": 0.43938153982162476, "learning_rate": 0.00019626816971058205, "loss": 0.7104, "step": 133 }, { "epoch": 0.052912142152023695, "grad_norm": 0.4602479040622711, "learning_rate": 0.00019620763090007762, "loss": 0.906, "step": 134 }, { "epoch": 0.05330700888450148, "grad_norm": 0.567136824131012, "learning_rate": 0.0001961466144823781, "loss": 0.9195, "step": 135 }, { "epoch": 0.05370187561697927, "grad_norm": 0.4483197331428528, "learning_rate": 0.00019608512076038962, "loss": 0.913, "step": 136 }, { "epoch": 0.05409674234945706, "grad_norm": 0.42070272564888, "learning_rate": 0.00019602315003938782, "loss": 1.1745, "step": 137 }, { "epoch": 0.054491609081934844, "grad_norm": 0.5072475671768188, "learning_rate": 0.00019596070262701626, "loss": 0.9904, "step": 138 }, { "epoch": 0.05488647581441264, "grad_norm": 0.47331702709198, "learning_rate": 0.00019589777883328505, "loss": 1.1526, "step": 139 }, { "epoch": 0.055281342546890426, "grad_norm": 0.5382581949234009, "learning_rate": 0.00019583437897056915, "loss": 0.866, "step": 140 }, { "epoch": 0.05567620927936821, "grad_norm": 0.41280797123908997, "learning_rate": 0.0001957705033536069, "loss": 0.8771, "step": 141 }, { "epoch": 0.056071076011846, "grad_norm": 0.4384588301181793, "learning_rate": 0.00019570615229949842, "loss": 1.1457, "step": 142 }, { "epoch": 0.05646594274432379, "grad_norm": 0.3916715681552887, "learning_rate": 0.00019564132612770414, "loss": 0.832, "step": 143 }, { "epoch": 0.05686080947680158, "grad_norm": 0.5212041139602661, "learning_rate": 0.00019557602516004306, "loss": 0.9689, "step": 144 }, { "epoch": 0.05725567620927937, "grad_norm": 0.5447223782539368, "learning_rate": 0.00019551024972069126, "loss": 1.2021, "step": 145 }, { "epoch": 0.05765054294175716, "grad_norm": 0.5747576355934143, "learning_rate": 0.00019544400013618023, "loss": 1.0035, "step": 146 }, { "epoch": 0.058045409674234945, "grad_norm": 0.48325222730636597, "learning_rate": 0.00019537727673539536, "loss": 1.0858, "step": 147 }, { "epoch": 0.05844027640671273, "grad_norm": 0.5126092433929443, "learning_rate": 0.00019531007984957408, "loss": 0.8908, "step": 148 }, { "epoch": 0.05883514313919053, "grad_norm": 0.4576544761657715, "learning_rate": 0.0001952424098123045, "loss": 1.0389, "step": 149 }, { "epoch": 0.059230009871668314, "grad_norm": 0.43132367730140686, "learning_rate": 0.00019517426695952358, "loss": 0.8228, "step": 150 }, { "epoch": 0.0596248766041461, "grad_norm": 0.44958174228668213, "learning_rate": 0.00019510565162951537, "loss": 1.0578, "step": 151 }, { "epoch": 0.06001974333662389, "grad_norm": 0.4115462899208069, "learning_rate": 0.00019503656416290963, "loss": 1.1849, "step": 152 }, { "epoch": 0.060414610069101676, "grad_norm": 0.5049015879631042, "learning_rate": 0.0001949670049026799, "loss": 1.0552, "step": 153 }, { "epoch": 0.060809476801579464, "grad_norm": 0.48052188754081726, "learning_rate": 0.00019489697419414182, "loss": 0.9705, "step": 154 }, { "epoch": 0.06120434353405726, "grad_norm": 0.5106899738311768, "learning_rate": 0.00019482647238495152, "loss": 0.9298, "step": 155 }, { "epoch": 0.061599210266535045, "grad_norm": 0.49583542346954346, "learning_rate": 0.00019475549982510382, "loss": 1.1084, "step": 156 }, { "epoch": 0.06199407699901283, "grad_norm": 0.5219290256500244, "learning_rate": 0.00019468405686693044, "loss": 1.0565, "step": 157 }, { "epoch": 0.06238894373149062, "grad_norm": 0.5849049687385559, "learning_rate": 0.00019461214386509842, "loss": 0.9961, "step": 158 }, { "epoch": 0.06278381046396841, "grad_norm": 0.5726532936096191, "learning_rate": 0.00019453976117660818, "loss": 1.0036, "step": 159 }, { "epoch": 0.0631786771964462, "grad_norm": 0.5158294439315796, "learning_rate": 0.0001944669091607919, "loss": 1.0737, "step": 160 }, { "epoch": 0.06357354392892399, "grad_norm": 0.48837384581565857, "learning_rate": 0.00019439358817931152, "loss": 1.0415, "step": 161 }, { "epoch": 0.06396841066140177, "grad_norm": 0.5121546387672424, "learning_rate": 0.00019431979859615726, "loss": 1.1167, "step": 162 }, { "epoch": 0.06436327739387956, "grad_norm": 0.6468896269798279, "learning_rate": 0.00019424554077764546, "loss": 0.944, "step": 163 }, { "epoch": 0.06475814412635736, "grad_norm": 0.4869466722011566, "learning_rate": 0.00019417081509241714, "loss": 1.0122, "step": 164 }, { "epoch": 0.06515301085883514, "grad_norm": 0.43559664487838745, "learning_rate": 0.00019409562191143577, "loss": 1.101, "step": 165 }, { "epoch": 0.06554787759131293, "grad_norm": 0.46581968665122986, "learning_rate": 0.00019401996160798573, "loss": 0.9572, "step": 166 }, { "epoch": 0.06594274432379071, "grad_norm": 0.5070847868919373, "learning_rate": 0.00019394383455767034, "loss": 1.0415, "step": 167 }, { "epoch": 0.06633761105626851, "grad_norm": 0.427746057510376, "learning_rate": 0.00019386724113841, "loss": 0.9545, "step": 168 }, { "epoch": 0.0667324777887463, "grad_norm": 0.44796499609947205, "learning_rate": 0.00019379018173044037, "loss": 0.9902, "step": 169 }, { "epoch": 0.06712734452122408, "grad_norm": 0.46470651030540466, "learning_rate": 0.00019371265671631037, "loss": 0.8932, "step": 170 }, { "epoch": 0.06752221125370188, "grad_norm": 0.524315595626831, "learning_rate": 0.00019363466648088034, "loss": 1.1012, "step": 171 }, { "epoch": 0.06791707798617966, "grad_norm": 0.39339789748191833, "learning_rate": 0.0001935562114113202, "loss": 0.9966, "step": 172 }, { "epoch": 0.06831194471865745, "grad_norm": 0.4902956783771515, "learning_rate": 0.00019347729189710743, "loss": 1.0936, "step": 173 }, { "epoch": 0.06870681145113525, "grad_norm": 0.44631102681159973, "learning_rate": 0.00019339790833002515, "loss": 1.0011, "step": 174 }, { "epoch": 0.06910167818361303, "grad_norm": 0.48202449083328247, "learning_rate": 0.00019331806110416027, "loss": 0.9386, "step": 175 }, { "epoch": 0.06949654491609082, "grad_norm": 0.5927445292472839, "learning_rate": 0.00019323775061590135, "loss": 0.919, "step": 176 }, { "epoch": 0.0698914116485686, "grad_norm": 0.5132244229316711, "learning_rate": 0.0001931569772639368, "loss": 0.8995, "step": 177 }, { "epoch": 0.0702862783810464, "grad_norm": 0.3917968273162842, "learning_rate": 0.00019307574144925287, "loss": 0.9831, "step": 178 }, { "epoch": 0.07068114511352419, "grad_norm": 0.4451232850551605, "learning_rate": 0.00019299404357513158, "loss": 1.0076, "step": 179 }, { "epoch": 0.07107601184600197, "grad_norm": 0.462495893239975, "learning_rate": 0.00019291188404714878, "loss": 1.1291, "step": 180 }, { "epoch": 0.07147087857847977, "grad_norm": 0.4120655953884125, "learning_rate": 0.0001928292632731721, "loss": 0.8429, "step": 181 }, { "epoch": 0.07186574531095755, "grad_norm": 0.5940248370170593, "learning_rate": 0.00019274618166335912, "loss": 1.078, "step": 182 }, { "epoch": 0.07226061204343534, "grad_norm": 0.46714073419570923, "learning_rate": 0.00019266263963015488, "loss": 0.9423, "step": 183 }, { "epoch": 0.07265547877591313, "grad_norm": 0.41042840480804443, "learning_rate": 0.00019257863758829035, "loss": 0.8366, "step": 184 }, { "epoch": 0.07305034550839092, "grad_norm": 0.5920013785362244, "learning_rate": 0.00019249417595478002, "loss": 1.084, "step": 185 }, { "epoch": 0.07344521224086871, "grad_norm": 0.5364437699317932, "learning_rate": 0.00019240925514892, "loss": 1.0667, "step": 186 }, { "epoch": 0.07384007897334649, "grad_norm": 0.5053632855415344, "learning_rate": 0.00019232387559228587, "loss": 0.9369, "step": 187 }, { "epoch": 0.07423494570582428, "grad_norm": 0.4612145721912384, "learning_rate": 0.0001922380377087306, "loss": 1.0796, "step": 188 }, { "epoch": 0.07462981243830208, "grad_norm": 0.5499488711357117, "learning_rate": 0.00019215174192438247, "loss": 1.2071, "step": 189 }, { "epoch": 0.07502467917077986, "grad_norm": 0.5043527483940125, "learning_rate": 0.00019206498866764288, "loss": 1.111, "step": 190 }, { "epoch": 0.07541954590325765, "grad_norm": 0.557327926158905, "learning_rate": 0.00019197777836918437, "loss": 1.0387, "step": 191 }, { "epoch": 0.07581441263573543, "grad_norm": 0.5045757293701172, "learning_rate": 0.0001918901114619483, "loss": 1.02, "step": 192 }, { "epoch": 0.07620927936821323, "grad_norm": 0.5515264868736267, "learning_rate": 0.00019180198838114282, "loss": 0.8964, "step": 193 }, { "epoch": 0.07660414610069102, "grad_norm": 0.4991328716278076, "learning_rate": 0.00019171340956424074, "loss": 0.9792, "step": 194 }, { "epoch": 0.0769990128331688, "grad_norm": 0.3965533971786499, "learning_rate": 0.00019162437545097719, "loss": 0.9692, "step": 195 }, { "epoch": 0.0773938795656466, "grad_norm": 0.49761366844177246, "learning_rate": 0.0001915348864833476, "loss": 0.9203, "step": 196 }, { "epoch": 0.07778874629812438, "grad_norm": 0.4468221962451935, "learning_rate": 0.00019144494310560544, "loss": 0.8878, "step": 197 }, { "epoch": 0.07818361303060217, "grad_norm": 0.4550800025463104, "learning_rate": 0.0001913545457642601, "loss": 0.8622, "step": 198 }, { "epoch": 0.07857847976307997, "grad_norm": 0.4845464825630188, "learning_rate": 0.00019126369490807447, "loss": 0.9628, "step": 199 }, { "epoch": 0.07897334649555775, "grad_norm": 0.4576549232006073, "learning_rate": 0.00019117239098806295, "loss": 1.0311, "step": 200 }, { "epoch": 0.07936821322803554, "grad_norm": 0.5760570764541626, "learning_rate": 0.00019108063445748904, "loss": 1.1729, "step": 201 }, { "epoch": 0.07976307996051332, "grad_norm": 0.6804947257041931, "learning_rate": 0.00019098842577186314, "loss": 0.8829, "step": 202 }, { "epoch": 0.08015794669299112, "grad_norm": 0.4532022476196289, "learning_rate": 0.00019089576538894036, "loss": 1.0826, "step": 203 }, { "epoch": 0.08055281342546891, "grad_norm": 0.5325609445571899, "learning_rate": 0.00019080265376871815, "loss": 1.0122, "step": 204 }, { "epoch": 0.08094768015794669, "grad_norm": 0.46676936745643616, "learning_rate": 0.00019070909137343408, "loss": 1.0449, "step": 205 }, { "epoch": 0.08134254689042449, "grad_norm": 0.4907085597515106, "learning_rate": 0.00019061507866756347, "loss": 0.9381, "step": 206 }, { "epoch": 0.08173741362290227, "grad_norm": 0.5306389927864075, "learning_rate": 0.0001905206161178172, "loss": 1.0909, "step": 207 }, { "epoch": 0.08213228035538006, "grad_norm": 0.48803043365478516, "learning_rate": 0.00019042570419313925, "loss": 1.0608, "step": 208 }, { "epoch": 0.08252714708785784, "grad_norm": 0.4253416061401367, "learning_rate": 0.0001903303433647045, "loss": 1.0037, "step": 209 }, { "epoch": 0.08292201382033564, "grad_norm": 0.5864118337631226, "learning_rate": 0.00019023453410591635, "loss": 0.8971, "step": 210 }, { "epoch": 0.08331688055281343, "grad_norm": 0.5264946222305298, "learning_rate": 0.00019013827689240436, "loss": 1.0425, "step": 211 }, { "epoch": 0.08371174728529121, "grad_norm": 0.5967885255813599, "learning_rate": 0.00019004157220202185, "loss": 0.8929, "step": 212 }, { "epoch": 0.084106614017769, "grad_norm": 0.5387664437294006, "learning_rate": 0.00018994442051484356, "loss": 0.9526, "step": 213 }, { "epoch": 0.08450148075024679, "grad_norm": 0.4813781976699829, "learning_rate": 0.00018984682231316333, "loss": 0.9496, "step": 214 }, { "epoch": 0.08489634748272458, "grad_norm": 0.513137698173523, "learning_rate": 0.0001897487780814916, "loss": 0.9249, "step": 215 }, { "epoch": 0.08529121421520237, "grad_norm": 0.585263192653656, "learning_rate": 0.00018965028830655309, "loss": 1.123, "step": 216 }, { "epoch": 0.08568608094768015, "grad_norm": 0.45392006635665894, "learning_rate": 0.00018955135347728432, "loss": 0.9507, "step": 217 }, { "epoch": 0.08608094768015795, "grad_norm": 0.7591621279716492, "learning_rate": 0.00018945197408483123, "loss": 0.8529, "step": 218 }, { "epoch": 0.08647581441263573, "grad_norm": 0.49809765815734863, "learning_rate": 0.0001893521506225467, "loss": 1.0025, "step": 219 }, { "epoch": 0.08687068114511352, "grad_norm": 0.4396488666534424, "learning_rate": 0.00018925188358598813, "loss": 0.9019, "step": 220 }, { "epoch": 0.08726554787759132, "grad_norm": 0.5315730571746826, "learning_rate": 0.000189151173472915, "loss": 1.0926, "step": 221 }, { "epoch": 0.0876604146100691, "grad_norm": 0.5290179252624512, "learning_rate": 0.00018905002078328632, "loss": 1.1491, "step": 222 }, { "epoch": 0.08805528134254689, "grad_norm": 0.41243505477905273, "learning_rate": 0.0001889484260192582, "loss": 0.7891, "step": 223 }, { "epoch": 0.08845014807502467, "grad_norm": 0.4659099280834198, "learning_rate": 0.0001888463896851815, "loss": 1.0976, "step": 224 }, { "epoch": 0.08884501480750247, "grad_norm": 0.5485631823539734, "learning_rate": 0.00018874391228759893, "loss": 0.8974, "step": 225 }, { "epoch": 0.08923988153998026, "grad_norm": 0.5084119439125061, "learning_rate": 0.000188640994335243, "loss": 1.0882, "step": 226 }, { "epoch": 0.08963474827245804, "grad_norm": 0.4861229956150055, "learning_rate": 0.0001885376363390332, "loss": 1.0262, "step": 227 }, { "epoch": 0.09002961500493584, "grad_norm": 0.5560560822486877, "learning_rate": 0.00018843383881207357, "loss": 0.9285, "step": 228 }, { "epoch": 0.09042448173741362, "grad_norm": 0.5360885858535767, "learning_rate": 0.00018832960226965008, "loss": 1.06, "step": 229 }, { "epoch": 0.09081934846989141, "grad_norm": 0.43957361578941345, "learning_rate": 0.0001882249272292282, "loss": 0.9748, "step": 230 }, { "epoch": 0.0912142152023692, "grad_norm": 0.4349263906478882, "learning_rate": 0.00018811981421045014, "loss": 0.9549, "step": 231 }, { "epoch": 0.09160908193484699, "grad_norm": 0.44644349813461304, "learning_rate": 0.0001880142637351325, "loss": 0.8965, "step": 232 }, { "epoch": 0.09200394866732478, "grad_norm": 0.482573539018631, "learning_rate": 0.0001879082763272635, "loss": 0.9798, "step": 233 }, { "epoch": 0.09239881539980256, "grad_norm": 0.4720049798488617, "learning_rate": 0.00018780185251300046, "loss": 0.8558, "step": 234 }, { "epoch": 0.09279368213228036, "grad_norm": 0.4637092053890228, "learning_rate": 0.00018769499282066717, "loss": 1.02, "step": 235 }, { "epoch": 0.09318854886475815, "grad_norm": 0.42427390813827515, "learning_rate": 0.00018758769778075122, "loss": 0.992, "step": 236 }, { "epoch": 0.09358341559723593, "grad_norm": 0.5138596892356873, "learning_rate": 0.00018747996792590148, "loss": 0.7596, "step": 237 }, { "epoch": 0.09397828232971372, "grad_norm": 0.4327022433280945, "learning_rate": 0.00018737180379092537, "loss": 1.1874, "step": 238 }, { "epoch": 0.0943731490621915, "grad_norm": 0.43842098116874695, "learning_rate": 0.00018726320591278616, "loss": 1.1122, "step": 239 }, { "epoch": 0.0947680157946693, "grad_norm": 0.4516022205352783, "learning_rate": 0.0001871541748306005, "loss": 0.8585, "step": 240 }, { "epoch": 0.0951628825271471, "grad_norm": 0.48878902196884155, "learning_rate": 0.00018704471108563548, "loss": 1.0806, "step": 241 }, { "epoch": 0.09555774925962487, "grad_norm": 0.5217946767807007, "learning_rate": 0.0001869348152213061, "loss": 0.9486, "step": 242 }, { "epoch": 0.09595261599210267, "grad_norm": 0.5032205581665039, "learning_rate": 0.00018682448778317262, "loss": 0.8841, "step": 243 }, { "epoch": 0.09634748272458045, "grad_norm": 0.5018641948699951, "learning_rate": 0.00018671372931893773, "loss": 0.7681, "step": 244 }, { "epoch": 0.09674234945705824, "grad_norm": 0.4859049618244171, "learning_rate": 0.00018660254037844388, "loss": 1.0615, "step": 245 }, { "epoch": 0.09713721618953604, "grad_norm": 0.5423186421394348, "learning_rate": 0.0001864909215136705, "loss": 0.9082, "step": 246 }, { "epoch": 0.09753208292201382, "grad_norm": 0.6605743169784546, "learning_rate": 0.0001863788732787314, "loss": 1.0483, "step": 247 }, { "epoch": 0.09792694965449161, "grad_norm": 0.5038883090019226, "learning_rate": 0.0001862663962298719, "loss": 0.9848, "step": 248 }, { "epoch": 0.0983218163869694, "grad_norm": 0.4320213496685028, "learning_rate": 0.00018615349092546604, "loss": 0.8254, "step": 249 }, { "epoch": 0.09871668311944719, "grad_norm": 0.48691487312316895, "learning_rate": 0.00018604015792601396, "loss": 1.0443, "step": 250 }, { "epoch": 0.09911154985192498, "grad_norm": 0.5005807876586914, "learning_rate": 0.0001859263977941389, "loss": 0.8591, "step": 251 }, { "epoch": 0.09950641658440276, "grad_norm": 0.5520577430725098, "learning_rate": 0.0001858122110945847, "loss": 1.0758, "step": 252 }, { "epoch": 0.09990128331688056, "grad_norm": 0.44724610447883606, "learning_rate": 0.00018569759839421265, "loss": 1.0458, "step": 253 }, { "epoch": 0.10029615004935834, "grad_norm": 0.5659752488136292, "learning_rate": 0.00018558256026199896, "loss": 1.0763, "step": 254 }, { "epoch": 0.10069101678183613, "grad_norm": 0.4626811146736145, "learning_rate": 0.00018546709726903178, "loss": 0.9823, "step": 255 }, { "epoch": 0.10108588351431391, "grad_norm": 0.46605491638183594, "learning_rate": 0.00018535120998850848, "loss": 1.2862, "step": 256 }, { "epoch": 0.1014807502467917, "grad_norm": 0.6146165132522583, "learning_rate": 0.00018523489899573262, "loss": 1.0654, "step": 257 }, { "epoch": 0.1018756169792695, "grad_norm": 0.4941771328449249, "learning_rate": 0.00018511816486811134, "loss": 0.6966, "step": 258 }, { "epoch": 0.10227048371174728, "grad_norm": 0.4850773811340332, "learning_rate": 0.00018500100818515222, "loss": 1.0579, "step": 259 }, { "epoch": 0.10266535044422508, "grad_norm": 0.4935731589794159, "learning_rate": 0.00018488342952846073, "loss": 0.916, "step": 260 }, { "epoch": 0.10306021717670286, "grad_norm": 0.46898818016052246, "learning_rate": 0.000184765429481737, "loss": 1.0551, "step": 261 }, { "epoch": 0.10345508390918065, "grad_norm": 0.4352802038192749, "learning_rate": 0.00018464700863077312, "loss": 0.9019, "step": 262 }, { "epoch": 0.10384995064165845, "grad_norm": 0.5294659733772278, "learning_rate": 0.0001845281675634503, "loss": 1.0661, "step": 263 }, { "epoch": 0.10424481737413623, "grad_norm": 0.5499119758605957, "learning_rate": 0.00018440890686973572, "loss": 1.0584, "step": 264 }, { "epoch": 0.10463968410661402, "grad_norm": 0.4448906183242798, "learning_rate": 0.0001842892271416797, "loss": 0.9614, "step": 265 }, { "epoch": 0.1050345508390918, "grad_norm": 0.5477500557899475, "learning_rate": 0.00018416912897341295, "loss": 0.8138, "step": 266 }, { "epoch": 0.1054294175715696, "grad_norm": 0.46556559205055237, "learning_rate": 0.00018404861296114337, "loss": 0.9528, "step": 267 }, { "epoch": 0.10582428430404739, "grad_norm": 0.5127370953559875, "learning_rate": 0.00018392767970315313, "loss": 1.0031, "step": 268 }, { "epoch": 0.10621915103652517, "grad_norm": 0.5918512940406799, "learning_rate": 0.0001838063297997958, "loss": 0.8006, "step": 269 }, { "epoch": 0.10661401776900296, "grad_norm": 0.5921156406402588, "learning_rate": 0.00018368456385349334, "loss": 0.9143, "step": 270 }, { "epoch": 0.10700888450148074, "grad_norm": 0.6525917053222656, "learning_rate": 0.000183562382468733, "loss": 1.0162, "step": 271 }, { "epoch": 0.10740375123395854, "grad_norm": 0.5337258577346802, "learning_rate": 0.00018343978625206452, "loss": 0.9916, "step": 272 }, { "epoch": 0.10779861796643633, "grad_norm": 0.509904146194458, "learning_rate": 0.00018331677581209696, "loss": 0.8978, "step": 273 }, { "epoch": 0.10819348469891411, "grad_norm": 0.45876345038414, "learning_rate": 0.0001831933517594957, "loss": 1.0373, "step": 274 }, { "epoch": 0.10858835143139191, "grad_norm": 0.5002173185348511, "learning_rate": 0.00018306951470697946, "loss": 0.8874, "step": 275 }, { "epoch": 0.10898321816386969, "grad_norm": 0.44106170535087585, "learning_rate": 0.00018294526526931718, "loss": 0.7961, "step": 276 }, { "epoch": 0.10937808489634748, "grad_norm": 0.4849831163883209, "learning_rate": 0.00018282060406332512, "loss": 1.0139, "step": 277 }, { "epoch": 0.10977295162882528, "grad_norm": 0.4701422452926636, "learning_rate": 0.0001826955317078636, "loss": 1.0458, "step": 278 }, { "epoch": 0.11016781836130306, "grad_norm": 0.5966842174530029, "learning_rate": 0.00018257004882383412, "loss": 0.7497, "step": 279 }, { "epoch": 0.11056268509378085, "grad_norm": 0.49655190110206604, "learning_rate": 0.00018244415603417603, "loss": 1.2634, "step": 280 }, { "epoch": 0.11095755182625863, "grad_norm": 0.7236825823783875, "learning_rate": 0.00018231785396386377, "loss": 1.0645, "step": 281 }, { "epoch": 0.11135241855873643, "grad_norm": 0.522117018699646, "learning_rate": 0.00018219114323990345, "loss": 0.8553, "step": 282 }, { "epoch": 0.11174728529121422, "grad_norm": 0.5486408472061157, "learning_rate": 0.00018206402449132995, "loss": 1.0859, "step": 283 }, { "epoch": 0.112142152023692, "grad_norm": 0.6336297392845154, "learning_rate": 0.00018193649834920373, "loss": 1.0769, "step": 284 }, { "epoch": 0.1125370187561698, "grad_norm": 0.47025516629219055, "learning_rate": 0.0001818085654466076, "loss": 0.9662, "step": 285 }, { "epoch": 0.11293188548864758, "grad_norm": 0.5352553725242615, "learning_rate": 0.00018168022641864377, "loss": 1.0481, "step": 286 }, { "epoch": 0.11332675222112537, "grad_norm": 0.4911988377571106, "learning_rate": 0.00018155148190243051, "loss": 0.888, "step": 287 }, { "epoch": 0.11372161895360317, "grad_norm": 0.4751187264919281, "learning_rate": 0.00018142233253709916, "loss": 0.9517, "step": 288 }, { "epoch": 0.11411648568608095, "grad_norm": 0.4718889892101288, "learning_rate": 0.00018129277896379077, "loss": 1.028, "step": 289 }, { "epoch": 0.11451135241855874, "grad_norm": 0.5223097205162048, "learning_rate": 0.00018116282182565311, "loss": 0.7671, "step": 290 }, { "epoch": 0.11490621915103652, "grad_norm": 0.47218167781829834, "learning_rate": 0.0001810324617678373, "loss": 0.982, "step": 291 }, { "epoch": 0.11530108588351431, "grad_norm": 0.5651125907897949, "learning_rate": 0.00018090169943749476, "loss": 0.8603, "step": 292 }, { "epoch": 0.11569595261599211, "grad_norm": 0.6196467280387878, "learning_rate": 0.00018077053548377382, "loss": 0.9174, "step": 293 }, { "epoch": 0.11609081934846989, "grad_norm": 0.5194737911224365, "learning_rate": 0.0001806389705578168, "loss": 1.2105, "step": 294 }, { "epoch": 0.11648568608094768, "grad_norm": 0.46870988607406616, "learning_rate": 0.0001805070053127563, "loss": 0.9009, "step": 295 }, { "epoch": 0.11688055281342546, "grad_norm": 0.5096109509468079, "learning_rate": 0.0001803746404037125, "loss": 1.0304, "step": 296 }, { "epoch": 0.11727541954590326, "grad_norm": 0.5446627140045166, "learning_rate": 0.00018024187648778956, "loss": 0.9724, "step": 297 }, { "epoch": 0.11767028627838105, "grad_norm": 0.45771893858909607, "learning_rate": 0.00018010871422407236, "loss": 1.1809, "step": 298 }, { "epoch": 0.11806515301085883, "grad_norm": 0.4939158856868744, "learning_rate": 0.0001799751542736234, "loss": 0.9581, "step": 299 }, { "epoch": 0.11846001974333663, "grad_norm": 0.43252983689308167, "learning_rate": 0.00017984119729947944, "loss": 0.9829, "step": 300 }, { "epoch": 0.11885488647581441, "grad_norm": 0.5026227831840515, "learning_rate": 0.00017970684396664813, "loss": 1.0617, "step": 301 }, { "epoch": 0.1192497532082922, "grad_norm": 0.5750073194503784, "learning_rate": 0.00017957209494210493, "loss": 0.9648, "step": 302 }, { "epoch": 0.11964461994076998, "grad_norm": 0.511223316192627, "learning_rate": 0.0001794369508947894, "loss": 0.9457, "step": 303 }, { "epoch": 0.12003948667324778, "grad_norm": 0.6075318455696106, "learning_rate": 0.00017930141249560233, "loss": 1.1244, "step": 304 }, { "epoch": 0.12043435340572557, "grad_norm": 0.5594468712806702, "learning_rate": 0.00017916548041740213, "loss": 0.9975, "step": 305 }, { "epoch": 0.12082922013820335, "grad_norm": 0.45040223002433777, "learning_rate": 0.0001790291553350016, "loss": 1.0633, "step": 306 }, { "epoch": 0.12122408687068115, "grad_norm": 0.5200604200363159, "learning_rate": 0.0001788924379251645, "loss": 0.8929, "step": 307 }, { "epoch": 0.12161895360315893, "grad_norm": 0.48910826444625854, "learning_rate": 0.00017875532886660228, "loss": 0.9944, "step": 308 }, { "epoch": 0.12201382033563672, "grad_norm": 0.5069397687911987, "learning_rate": 0.0001786178288399706, "loss": 1.1026, "step": 309 }, { "epoch": 0.12240868706811452, "grad_norm": 0.4457162320613861, "learning_rate": 0.0001784799385278661, "loss": 0.8329, "step": 310 }, { "epoch": 0.1228035538005923, "grad_norm": 0.5899550318717957, "learning_rate": 0.0001783416586148229, "loss": 0.9725, "step": 311 }, { "epoch": 0.12319842053307009, "grad_norm": 0.48378434777259827, "learning_rate": 0.00017820298978730921, "loss": 0.9683, "step": 312 }, { "epoch": 0.12359328726554787, "grad_norm": 0.47537699341773987, "learning_rate": 0.00017806393273372395, "loss": 1.0343, "step": 313 }, { "epoch": 0.12398815399802567, "grad_norm": 0.46815434098243713, "learning_rate": 0.00017792448814439333, "loss": 1.0695, "step": 314 }, { "epoch": 0.12438302073050346, "grad_norm": 0.4383327066898346, "learning_rate": 0.00017778465671156743, "loss": 1.0047, "step": 315 }, { "epoch": 0.12477788746298124, "grad_norm": 0.46698901057243347, "learning_rate": 0.00017764443912941672, "loss": 1.0268, "step": 316 }, { "epoch": 0.12517275419545904, "grad_norm": 0.5612544417381287, "learning_rate": 0.0001775038360940287, "loss": 0.9519, "step": 317 }, { "epoch": 0.12556762092793683, "grad_norm": 0.5520269274711609, "learning_rate": 0.00017736284830340436, "loss": 0.9068, "step": 318 }, { "epoch": 0.12596248766041462, "grad_norm": 0.5094735622406006, "learning_rate": 0.00017722147645745468, "loss": 0.9915, "step": 319 }, { "epoch": 0.1263573543928924, "grad_norm": 0.49084824323654175, "learning_rate": 0.00017707972125799735, "loss": 0.8411, "step": 320 }, { "epoch": 0.12675222112537018, "grad_norm": 0.47397279739379883, "learning_rate": 0.00017693758340875306, "loss": 0.9266, "step": 321 }, { "epoch": 0.12714708785784798, "grad_norm": 0.3962591588497162, "learning_rate": 0.00017679506361534215, "loss": 0.9885, "step": 322 }, { "epoch": 0.12754195459032577, "grad_norm": 0.48955652117729187, "learning_rate": 0.000176652162585281, "loss": 1.0869, "step": 323 }, { "epoch": 0.12793682132280354, "grad_norm": 0.6018503308296204, "learning_rate": 0.00017650888102797868, "loss": 0.8733, "step": 324 }, { "epoch": 0.12833168805528133, "grad_norm": 0.6061957478523254, "learning_rate": 0.00017636521965473323, "loss": 1.2883, "step": 325 }, { "epoch": 0.12872655478775913, "grad_norm": 0.47981151938438416, "learning_rate": 0.00017622117917872823, "loss": 0.9246, "step": 326 }, { "epoch": 0.12912142152023692, "grad_norm": 0.5347411632537842, "learning_rate": 0.00017607676031502933, "loss": 1.0827, "step": 327 }, { "epoch": 0.12951628825271472, "grad_norm": 0.7249376773834229, "learning_rate": 0.0001759319637805806, "loss": 1.0677, "step": 328 }, { "epoch": 0.12991115498519248, "grad_norm": 0.5934639573097229, "learning_rate": 0.00017578679029420092, "loss": 0.8046, "step": 329 }, { "epoch": 0.13030602171767028, "grad_norm": 0.6128519773483276, "learning_rate": 0.00017564124057658056, "loss": 0.8474, "step": 330 }, { "epoch": 0.13070088845014807, "grad_norm": 0.49019843339920044, "learning_rate": 0.0001754953153502775, "loss": 0.9108, "step": 331 }, { "epoch": 0.13109575518262587, "grad_norm": 0.548611044883728, "learning_rate": 0.0001753490153397139, "loss": 0.9954, "step": 332 }, { "epoch": 0.13149062191510366, "grad_norm": 0.49729061126708984, "learning_rate": 0.00017520234127117243, "loss": 0.9943, "step": 333 }, { "epoch": 0.13188548864758143, "grad_norm": 0.47645774483680725, "learning_rate": 0.00017505529387279277, "loss": 0.8234, "step": 334 }, { "epoch": 0.13228035538005922, "grad_norm": 0.5200782418251038, "learning_rate": 0.0001749078738745679, "loss": 1.0991, "step": 335 }, { "epoch": 0.13267522211253702, "grad_norm": 0.4647184908390045, "learning_rate": 0.0001747600820083405, "loss": 1.0169, "step": 336 }, { "epoch": 0.1330700888450148, "grad_norm": 0.4864305853843689, "learning_rate": 0.00017461191900779936, "loss": 0.9525, "step": 337 }, { "epoch": 0.1334649555774926, "grad_norm": 0.5017113089561462, "learning_rate": 0.00017446338560847568, "loss": 0.807, "step": 338 }, { "epoch": 0.13385982230997037, "grad_norm": 0.5566651821136475, "learning_rate": 0.00017431448254773944, "loss": 1.2096, "step": 339 }, { "epoch": 0.13425468904244817, "grad_norm": 0.48729124665260315, "learning_rate": 0.00017416521056479577, "loss": 0.84, "step": 340 }, { "epoch": 0.13464955577492596, "grad_norm": 0.48902615904808044, "learning_rate": 0.00017401557040068124, "loss": 0.9068, "step": 341 }, { "epoch": 0.13504442250740376, "grad_norm": 0.5371021628379822, "learning_rate": 0.00017386556279826021, "loss": 1.0875, "step": 342 }, { "epoch": 0.13543928923988155, "grad_norm": 0.5122295618057251, "learning_rate": 0.00017371518850222112, "loss": 0.928, "step": 343 }, { "epoch": 0.13583415597235932, "grad_norm": 0.5144253373146057, "learning_rate": 0.00017356444825907273, "loss": 1.1201, "step": 344 }, { "epoch": 0.1362290227048371, "grad_norm": 0.6197713017463684, "learning_rate": 0.00017341334281714064, "loss": 1.0366, "step": 345 }, { "epoch": 0.1366238894373149, "grad_norm": 0.5059978365898132, "learning_rate": 0.00017326187292656333, "loss": 1.0132, "step": 346 }, { "epoch": 0.1370187561697927, "grad_norm": 0.45369940996170044, "learning_rate": 0.00017311003933928847, "loss": 1.0436, "step": 347 }, { "epoch": 0.1374136229022705, "grad_norm": 0.5087475180625916, "learning_rate": 0.00017295784280906934, "loss": 0.9475, "step": 348 }, { "epoch": 0.13780848963474826, "grad_norm": 0.48209476470947266, "learning_rate": 0.00017280528409146094, "loss": 1.1108, "step": 349 }, { "epoch": 0.13820335636722605, "grad_norm": 0.5897043943405151, "learning_rate": 0.00017265236394381633, "loss": 1.0758, "step": 350 }, { "epoch": 0.13859822309970385, "grad_norm": 0.4946494996547699, "learning_rate": 0.00017249908312528276, "loss": 0.9829, "step": 351 }, { "epoch": 0.13899308983218164, "grad_norm": 0.49029871821403503, "learning_rate": 0.00017234544239679806, "loss": 0.8431, "step": 352 }, { "epoch": 0.13938795656465944, "grad_norm": 0.5330137610435486, "learning_rate": 0.00017219144252108673, "loss": 1.13, "step": 353 }, { "epoch": 0.1397828232971372, "grad_norm": 0.47816064953804016, "learning_rate": 0.00017203708426265614, "loss": 1.0986, "step": 354 }, { "epoch": 0.140177690029615, "grad_norm": 0.537811815738678, "learning_rate": 0.00017188236838779295, "loss": 1.0599, "step": 355 }, { "epoch": 0.140177690029615, "eval_loss": 0.9808822274208069, "eval_runtime": 61.6088, "eval_samples_per_second": 17.319, "eval_steps_per_second": 8.668, "step": 355 }, { "epoch": 0.1405725567620928, "grad_norm": 0.5210056304931641, "learning_rate": 0.000171727295664559, "loss": 1.1635, "step": 356 }, { "epoch": 0.1409674234945706, "grad_norm": 0.5472628474235535, "learning_rate": 0.00017157186686278766, "loss": 1.2106, "step": 357 }, { "epoch": 0.14136229022704838, "grad_norm": 0.459087073802948, "learning_rate": 0.00017141608275408006, "loss": 1.0337, "step": 358 }, { "epoch": 0.14175715695952615, "grad_norm": 0.41874152421951294, "learning_rate": 0.00017125994411180124, "loss": 0.8032, "step": 359 }, { "epoch": 0.14215202369200394, "grad_norm": 0.4521096348762512, "learning_rate": 0.0001711034517110761, "loss": 0.9549, "step": 360 }, { "epoch": 0.14254689042448174, "grad_norm": 0.48767751455307007, "learning_rate": 0.00017094660632878582, "loss": 0.9779, "step": 361 }, { "epoch": 0.14294175715695953, "grad_norm": 0.4864053428173065, "learning_rate": 0.00017078940874356392, "loss": 0.7642, "step": 362 }, { "epoch": 0.14333662388943733, "grad_norm": 0.46765899658203125, "learning_rate": 0.00017063185973579232, "loss": 1.0457, "step": 363 }, { "epoch": 0.1437314906219151, "grad_norm": 0.4682892858982086, "learning_rate": 0.00017047396008759754, "loss": 0.974, "step": 364 }, { "epoch": 0.1441263573543929, "grad_norm": 0.4890439212322235, "learning_rate": 0.00017031571058284678, "loss": 0.9047, "step": 365 }, { "epoch": 0.14452122408687068, "grad_norm": 0.4934488832950592, "learning_rate": 0.00017015711200714414, "loss": 1.1, "step": 366 }, { "epoch": 0.14491609081934848, "grad_norm": 0.5664051175117493, "learning_rate": 0.00016999816514782647, "loss": 1.0985, "step": 367 }, { "epoch": 0.14531095755182627, "grad_norm": 0.5750765800476074, "learning_rate": 0.00016983887079395974, "loss": 0.936, "step": 368 }, { "epoch": 0.14570582428430404, "grad_norm": 0.49568259716033936, "learning_rate": 0.00016967922973633494, "loss": 1.1489, "step": 369 }, { "epoch": 0.14610069101678183, "grad_norm": 0.5348165035247803, "learning_rate": 0.00016951924276746425, "loss": 1.1904, "step": 370 }, { "epoch": 0.14649555774925963, "grad_norm": 0.4326549470424652, "learning_rate": 0.00016935891068157704, "loss": 0.8516, "step": 371 }, { "epoch": 0.14689042448173742, "grad_norm": 0.5540488362312317, "learning_rate": 0.000169198234274616, "loss": 1.0405, "step": 372 }, { "epoch": 0.1472852912142152, "grad_norm": 0.5236465930938721, "learning_rate": 0.00016903721434423306, "loss": 0.9151, "step": 373 }, { "epoch": 0.14768015794669298, "grad_norm": 0.5312307476997375, "learning_rate": 0.00016887585168978562, "loss": 1.1763, "step": 374 }, { "epoch": 0.14807502467917077, "grad_norm": 0.4888836145401001, "learning_rate": 0.0001687141471123324, "loss": 1.0494, "step": 375 }, { "epoch": 0.14846989141164857, "grad_norm": 0.49629417061805725, "learning_rate": 0.00016855210141462963, "loss": 0.697, "step": 376 }, { "epoch": 0.14886475814412636, "grad_norm": 0.5097388029098511, "learning_rate": 0.0001683897154011269, "loss": 1.0308, "step": 377 }, { "epoch": 0.14925962487660416, "grad_norm": 0.4330967664718628, "learning_rate": 0.0001682269898779632, "loss": 0.8466, "step": 378 }, { "epoch": 0.14965449160908192, "grad_norm": 0.4622458219528198, "learning_rate": 0.00016806392565296311, "loss": 0.849, "step": 379 }, { "epoch": 0.15004935834155972, "grad_norm": 0.4904235303401947, "learning_rate": 0.00016790052353563253, "loss": 1.0324, "step": 380 }, { "epoch": 0.1504442250740375, "grad_norm": 0.5324286222457886, "learning_rate": 0.00016773678433715475, "loss": 0.905, "step": 381 }, { "epoch": 0.1508390918065153, "grad_norm": 0.5701109766960144, "learning_rate": 0.00016757270887038654, "loss": 1.1105, "step": 382 }, { "epoch": 0.1512339585389931, "grad_norm": 0.45347732305526733, "learning_rate": 0.00016740829794985394, "loss": 0.875, "step": 383 }, { "epoch": 0.15162882527147087, "grad_norm": 0.4763396084308624, "learning_rate": 0.00016724355239174833, "loss": 0.9732, "step": 384 }, { "epoch": 0.15202369200394866, "grad_norm": 0.4246227443218231, "learning_rate": 0.00016707847301392236, "loss": 0.9235, "step": 385 }, { "epoch": 0.15241855873642646, "grad_norm": 0.6034351587295532, "learning_rate": 0.00016691306063588583, "loss": 0.8553, "step": 386 }, { "epoch": 0.15281342546890425, "grad_norm": 0.48924902081489563, "learning_rate": 0.0001667473160788017, "loss": 0.967, "step": 387 }, { "epoch": 0.15320829220138205, "grad_norm": 0.5340859889984131, "learning_rate": 0.00016658124016548197, "loss": 1.1052, "step": 388 }, { "epoch": 0.1536031589338598, "grad_norm": 0.5492063760757446, "learning_rate": 0.0001664148337203836, "loss": 1.1952, "step": 389 }, { "epoch": 0.1539980256663376, "grad_norm": 0.4856433868408203, "learning_rate": 0.00016624809756960444, "loss": 0.9994, "step": 390 }, { "epoch": 0.1543928923988154, "grad_norm": 0.4147414565086365, "learning_rate": 0.00016608103254087906, "loss": 0.9976, "step": 391 }, { "epoch": 0.1547877591312932, "grad_norm": 0.5175687074661255, "learning_rate": 0.00016591363946357474, "loss": 1.0245, "step": 392 }, { "epoch": 0.155182625863771, "grad_norm": 0.620985209941864, "learning_rate": 0.00016574591916868728, "loss": 1.1981, "step": 393 }, { "epoch": 0.15557749259624876, "grad_norm": 0.42090892791748047, "learning_rate": 0.00016557787248883696, "loss": 0.9171, "step": 394 }, { "epoch": 0.15597235932872655, "grad_norm": 0.6142613291740417, "learning_rate": 0.00016540950025826422, "loss": 1.0901, "step": 395 }, { "epoch": 0.15636722606120435, "grad_norm": 0.6569430232048035, "learning_rate": 0.00016524080331282577, "loss": 1.0362, "step": 396 }, { "epoch": 0.15676209279368214, "grad_norm": 0.5111309885978699, "learning_rate": 0.00016507178248999024, "loss": 0.9666, "step": 397 }, { "epoch": 0.15715695952615993, "grad_norm": 0.5645838975906372, "learning_rate": 0.00016490243862883413, "loss": 0.9295, "step": 398 }, { "epoch": 0.1575518262586377, "grad_norm": 0.46207594871520996, "learning_rate": 0.00016473277257003757, "loss": 1.0462, "step": 399 }, { "epoch": 0.1579466929911155, "grad_norm": 0.5260053873062134, "learning_rate": 0.00016456278515588024, "loss": 0.9745, "step": 400 }, { "epoch": 0.1583415597235933, "grad_norm": 0.5953394174575806, "learning_rate": 0.00016439247723023712, "loss": 1.0208, "step": 401 }, { "epoch": 0.15873642645607108, "grad_norm": 0.5679177641868591, "learning_rate": 0.00016422184963857432, "loss": 0.8679, "step": 402 }, { "epoch": 0.15913129318854888, "grad_norm": 0.47382932901382446, "learning_rate": 0.00016405090322794483, "loss": 0.9579, "step": 403 }, { "epoch": 0.15952615992102664, "grad_norm": 0.49775707721710205, "learning_rate": 0.00016387963884698448, "loss": 1.2399, "step": 404 }, { "epoch": 0.15992102665350444, "grad_norm": 0.49022993445396423, "learning_rate": 0.00016370805734590747, "loss": 1.0984, "step": 405 }, { "epoch": 0.16031589338598223, "grad_norm": 0.5563966035842896, "learning_rate": 0.00016353615957650236, "loss": 0.9777, "step": 406 }, { "epoch": 0.16071076011846003, "grad_norm": 0.46516212821006775, "learning_rate": 0.00016336394639212783, "loss": 0.8274, "step": 407 }, { "epoch": 0.16110562685093782, "grad_norm": 0.4698953926563263, "learning_rate": 0.00016319141864770827, "loss": 0.9419, "step": 408 }, { "epoch": 0.1615004935834156, "grad_norm": 0.4373069405555725, "learning_rate": 0.00016301857719972976, "loss": 0.8235, "step": 409 }, { "epoch": 0.16189536031589338, "grad_norm": 0.6190903186798096, "learning_rate": 0.00016284542290623567, "loss": 0.9092, "step": 410 }, { "epoch": 0.16229022704837118, "grad_norm": 0.5729885101318359, "learning_rate": 0.0001626719566268224, "loss": 0.9563, "step": 411 }, { "epoch": 0.16268509378084897, "grad_norm": 0.5004550814628601, "learning_rate": 0.00016249817922263517, "loss": 1.1011, "step": 412 }, { "epoch": 0.16307996051332677, "grad_norm": 0.4848381578922272, "learning_rate": 0.0001623240915563638, "loss": 1.0606, "step": 413 }, { "epoch": 0.16347482724580453, "grad_norm": 0.5314562916755676, "learning_rate": 0.00016214969449223824, "loss": 0.7504, "step": 414 }, { "epoch": 0.16386969397828233, "grad_norm": 0.5033890604972839, "learning_rate": 0.00016197498889602448, "loss": 1.0749, "step": 415 }, { "epoch": 0.16426456071076012, "grad_norm": 0.5094935894012451, "learning_rate": 0.0001617999756350202, "loss": 0.8851, "step": 416 }, { "epoch": 0.16465942744323792, "grad_norm": 0.4637800455093384, "learning_rate": 0.00016162465557805034, "loss": 0.8553, "step": 417 }, { "epoch": 0.16505429417571568, "grad_norm": 0.43399858474731445, "learning_rate": 0.00016144902959546286, "loss": 0.9576, "step": 418 }, { "epoch": 0.16544916090819348, "grad_norm": 0.5109582543373108, "learning_rate": 0.00016127309855912457, "loss": 1.1576, "step": 419 }, { "epoch": 0.16584402764067127, "grad_norm": 0.5534142851829529, "learning_rate": 0.00016109686334241655, "loss": 0.7701, "step": 420 }, { "epoch": 0.16623889437314907, "grad_norm": 0.46211493015289307, "learning_rate": 0.00016092032482023, "loss": 0.9263, "step": 421 }, { "epoch": 0.16663376110562686, "grad_norm": 0.5489344596862793, "learning_rate": 0.00016074348386896177, "loss": 1.0165, "step": 422 }, { "epoch": 0.16702862783810463, "grad_norm": 0.4893771708011627, "learning_rate": 0.0001605663413665102, "loss": 1.0284, "step": 423 }, { "epoch": 0.16742349457058242, "grad_norm": 0.46742960810661316, "learning_rate": 0.00016038889819227045, "loss": 1.0603, "step": 424 }, { "epoch": 0.16781836130306022, "grad_norm": 0.6020311713218689, "learning_rate": 0.00016021115522713047, "loss": 0.8771, "step": 425 }, { "epoch": 0.168213228035538, "grad_norm": 0.5273780822753906, "learning_rate": 0.00016003311335346636, "loss": 0.9648, "step": 426 }, { "epoch": 0.1686080947680158, "grad_norm": 0.5141377449035645, "learning_rate": 0.00015985477345513817, "loss": 0.8773, "step": 427 }, { "epoch": 0.16900296150049357, "grad_norm": 0.4852812588214874, "learning_rate": 0.00015967613641748542, "loss": 0.7764, "step": 428 }, { "epoch": 0.16939782823297136, "grad_norm": 0.43280699849128723, "learning_rate": 0.0001594972031273228, "loss": 0.7407, "step": 429 }, { "epoch": 0.16979269496544916, "grad_norm": 0.5069910883903503, "learning_rate": 0.00015931797447293552, "loss": 0.9313, "step": 430 }, { "epoch": 0.17018756169792695, "grad_norm": 0.5123517513275146, "learning_rate": 0.00015913845134407533, "loss": 1.0705, "step": 431 }, { "epoch": 0.17058242843040475, "grad_norm": 0.724429190158844, "learning_rate": 0.00015895863463195558, "loss": 0.9353, "step": 432 }, { "epoch": 0.17097729516288251, "grad_norm": 0.5428094267845154, "learning_rate": 0.00015877852522924732, "loss": 1.1227, "step": 433 }, { "epoch": 0.1713721618953603, "grad_norm": 0.6102519631385803, "learning_rate": 0.00015859812403007443, "loss": 0.9353, "step": 434 }, { "epoch": 0.1717670286278381, "grad_norm": 0.5151787400245667, "learning_rate": 0.00015841743193000944, "loss": 0.9646, "step": 435 }, { "epoch": 0.1721618953603159, "grad_norm": 0.6272695064544678, "learning_rate": 0.00015823644982606905, "loss": 0.8384, "step": 436 }, { "epoch": 0.1725567620927937, "grad_norm": 0.49354809522628784, "learning_rate": 0.00015805517861670952, "loss": 0.7855, "step": 437 }, { "epoch": 0.17295162882527146, "grad_norm": 0.47559288144111633, "learning_rate": 0.0001578736192018224, "loss": 0.9591, "step": 438 }, { "epoch": 0.17334649555774925, "grad_norm": 0.5615376830101013, "learning_rate": 0.00015769177248273008, "loss": 1.1537, "step": 439 }, { "epoch": 0.17374136229022705, "grad_norm": 0.5301774144172668, "learning_rate": 0.00015750963936218105, "loss": 0.7773, "step": 440 }, { "epoch": 0.17413622902270484, "grad_norm": 0.5083664059638977, "learning_rate": 0.0001573272207443457, "loss": 0.9705, "step": 441 }, { "epoch": 0.17453109575518264, "grad_norm": 0.5633112788200378, "learning_rate": 0.00015714451753481168, "loss": 1.0109, "step": 442 }, { "epoch": 0.1749259624876604, "grad_norm": 0.5243581533432007, "learning_rate": 0.00015696153064057947, "loss": 1.0258, "step": 443 }, { "epoch": 0.1753208292201382, "grad_norm": 0.6054911613464355, "learning_rate": 0.0001567782609700579, "loss": 1.0102, "step": 444 }, { "epoch": 0.175715695952616, "grad_norm": 0.5889274477958679, "learning_rate": 0.00015659470943305955, "loss": 1.1549, "step": 445 }, { "epoch": 0.17611056268509379, "grad_norm": 0.5765202045440674, "learning_rate": 0.0001564108769407962, "loss": 0.7791, "step": 446 }, { "epoch": 0.17650542941757158, "grad_norm": 0.5080841779708862, "learning_rate": 0.0001562267644058746, "loss": 0.9962, "step": 447 }, { "epoch": 0.17690029615004935, "grad_norm": 0.5185093879699707, "learning_rate": 0.00015604237274229147, "loss": 1.1927, "step": 448 }, { "epoch": 0.17729516288252714, "grad_norm": 0.5385391116142273, "learning_rate": 0.00015585770286542945, "loss": 1.0555, "step": 449 }, { "epoch": 0.17769002961500494, "grad_norm": 0.6289413571357727, "learning_rate": 0.00015567275569205218, "loss": 1.0431, "step": 450 }, { "epoch": 0.17808489634748273, "grad_norm": 0.6271052956581116, "learning_rate": 0.0001554875321402999, "loss": 1.0078, "step": 451 }, { "epoch": 0.17847976307996052, "grad_norm": 0.6165266633033752, "learning_rate": 0.00015530203312968502, "loss": 0.9761, "step": 452 }, { "epoch": 0.1788746298124383, "grad_norm": 0.4909960627555847, "learning_rate": 0.00015511625958108719, "loss": 1.061, "step": 453 }, { "epoch": 0.17926949654491608, "grad_norm": 0.5358797311782837, "learning_rate": 0.00015493021241674918, "loss": 1.0878, "step": 454 }, { "epoch": 0.17966436327739388, "grad_norm": 0.6802231073379517, "learning_rate": 0.000154743892560272, "loss": 1.0068, "step": 455 }, { "epoch": 0.18005923000987167, "grad_norm": 0.6705336570739746, "learning_rate": 0.00015455730093661034, "loss": 1.0845, "step": 456 }, { "epoch": 0.18045409674234947, "grad_norm": 0.43329593539237976, "learning_rate": 0.0001543704384720681, "loss": 0.8636, "step": 457 }, { "epoch": 0.18084896347482723, "grad_norm": 0.5168460011482239, "learning_rate": 0.0001541833060942937, "loss": 0.7497, "step": 458 }, { "epoch": 0.18124383020730503, "grad_norm": 0.4275985062122345, "learning_rate": 0.0001539959047322755, "loss": 0.9042, "step": 459 }, { "epoch": 0.18163869693978282, "grad_norm": 0.44853127002716064, "learning_rate": 0.00015380823531633729, "loss": 0.9091, "step": 460 }, { "epoch": 0.18203356367226062, "grad_norm": 0.4683418571949005, "learning_rate": 0.00015362029877813332, "loss": 0.8174, "step": 461 }, { "epoch": 0.1824284304047384, "grad_norm": 0.4813165068626404, "learning_rate": 0.00015343209605064422, "loss": 0.7648, "step": 462 }, { "epoch": 0.18282329713721618, "grad_norm": 0.47031188011169434, "learning_rate": 0.00015324362806817186, "loss": 0.8667, "step": 463 }, { "epoch": 0.18321816386969397, "grad_norm": 0.5744631886482239, "learning_rate": 0.00015305489576633504, "loss": 0.8219, "step": 464 }, { "epoch": 0.18361303060217177, "grad_norm": 0.48958727717399597, "learning_rate": 0.00015286590008206465, "loss": 1.018, "step": 465 }, { "epoch": 0.18400789733464956, "grad_norm": 0.5247324109077454, "learning_rate": 0.00015267664195359917, "loss": 0.8933, "step": 466 }, { "epoch": 0.18440276406712736, "grad_norm": 0.545970618724823, "learning_rate": 0.00015248712232047992, "loss": 1.0508, "step": 467 }, { "epoch": 0.18479763079960512, "grad_norm": 0.5140126347541809, "learning_rate": 0.0001522973421235464, "loss": 0.7581, "step": 468 }, { "epoch": 0.18519249753208292, "grad_norm": 0.4570896625518799, "learning_rate": 0.00015210730230493162, "loss": 1.0665, "step": 469 }, { "epoch": 0.1855873642645607, "grad_norm": 0.43096470832824707, "learning_rate": 0.00015191700380805752, "loss": 0.8313, "step": 470 }, { "epoch": 0.1859822309970385, "grad_norm": 0.460275799036026, "learning_rate": 0.00015172644757763015, "loss": 0.9575, "step": 471 }, { "epoch": 0.1863770977295163, "grad_norm": 0.4163447618484497, "learning_rate": 0.00015153563455963499, "loss": 0.8838, "step": 472 }, { "epoch": 0.18677196446199407, "grad_norm": 0.542241096496582, "learning_rate": 0.0001513445657013324, "loss": 0.8143, "step": 473 }, { "epoch": 0.18716683119447186, "grad_norm": 0.6614589691162109, "learning_rate": 0.00015115324195125274, "loss": 0.9645, "step": 474 }, { "epoch": 0.18756169792694966, "grad_norm": 0.4719527065753937, "learning_rate": 0.00015096166425919175, "loss": 0.9894, "step": 475 }, { "epoch": 0.18795656465942745, "grad_norm": 0.4972122013568878, "learning_rate": 0.0001507698335762059, "loss": 0.8865, "step": 476 }, { "epoch": 0.18835143139190524, "grad_norm": 0.5407273769378662, "learning_rate": 0.00015057775085460749, "loss": 0.8714, "step": 477 }, { "epoch": 0.188746298124383, "grad_norm": 0.4692353308200836, "learning_rate": 0.00015038541704796003, "loss": 0.9357, "step": 478 }, { "epoch": 0.1891411648568608, "grad_norm": 0.4748722314834595, "learning_rate": 0.00015019283311107367, "loss": 1.0376, "step": 479 }, { "epoch": 0.1895360315893386, "grad_norm": 0.4381348490715027, "learning_rate": 0.00015000000000000001, "loss": 1.0059, "step": 480 }, { "epoch": 0.1899308983218164, "grad_norm": 0.48987191915512085, "learning_rate": 0.0001498069186720279, "loss": 0.8901, "step": 481 }, { "epoch": 0.1903257650542942, "grad_norm": 0.5566233396530151, "learning_rate": 0.0001496135900856782, "loss": 1.1464, "step": 482 }, { "epoch": 0.19072063178677195, "grad_norm": 0.5021251440048218, "learning_rate": 0.00014942001520069947, "loss": 1.0947, "step": 483 }, { "epoch": 0.19111549851924975, "grad_norm": 0.5681881904602051, "learning_rate": 0.00014922619497806277, "loss": 1.0981, "step": 484 }, { "epoch": 0.19151036525172754, "grad_norm": 0.5182890892028809, "learning_rate": 0.00014903213037995724, "loss": 1.1017, "step": 485 }, { "epoch": 0.19190523198420534, "grad_norm": 0.4682919979095459, "learning_rate": 0.0001488378223697851, "loss": 0.6508, "step": 486 }, { "epoch": 0.19230009871668313, "grad_norm": 0.4784727096557617, "learning_rate": 0.00014864327191215702, "loss": 0.874, "step": 487 }, { "epoch": 0.1926949654491609, "grad_norm": 0.5247599482536316, "learning_rate": 0.00014844847997288717, "loss": 1.1797, "step": 488 }, { "epoch": 0.1930898321816387, "grad_norm": 0.48195531964302063, "learning_rate": 0.00014825344751898863, "loss": 1.0463, "step": 489 }, { "epoch": 0.1934846989141165, "grad_norm": 0.5093549489974976, "learning_rate": 0.00014805817551866838, "loss": 1.0409, "step": 490 }, { "epoch": 0.19387956564659428, "grad_norm": 0.5454416275024414, "learning_rate": 0.00014786266494132267, "loss": 1.035, "step": 491 }, { "epoch": 0.19427443237907208, "grad_norm": 0.49301639199256897, "learning_rate": 0.00014766691675753202, "loss": 1.1046, "step": 492 }, { "epoch": 0.19466929911154984, "grad_norm": 0.4534429907798767, "learning_rate": 0.00014747093193905657, "loss": 0.9061, "step": 493 }, { "epoch": 0.19506416584402764, "grad_norm": 0.599082887172699, "learning_rate": 0.00014727471145883127, "loss": 1.0882, "step": 494 }, { "epoch": 0.19545903257650543, "grad_norm": 0.5382128953933716, "learning_rate": 0.00014707825629096084, "loss": 1.0369, "step": 495 }, { "epoch": 0.19585389930898323, "grad_norm": 0.5784068703651428, "learning_rate": 0.00014688156741071514, "loss": 0.9614, "step": 496 }, { "epoch": 0.19624876604146102, "grad_norm": 0.5423576235771179, "learning_rate": 0.00014668464579452425, "loss": 0.9217, "step": 497 }, { "epoch": 0.1966436327739388, "grad_norm": 0.47836440801620483, "learning_rate": 0.00014648749241997363, "loss": 0.918, "step": 498 }, { "epoch": 0.19703849950641658, "grad_norm": 0.5091026425361633, "learning_rate": 0.00014629010826579928, "loss": 1.0415, "step": 499 }, { "epoch": 0.19743336623889438, "grad_norm": 0.553871214389801, "learning_rate": 0.00014609249431188278, "loss": 0.8315, "step": 500 }, { "epoch": 0.19782823297137217, "grad_norm": 0.4480036199092865, "learning_rate": 0.00014589465153924672, "loss": 0.9431, "step": 501 }, { "epoch": 0.19822309970384996, "grad_norm": 0.5340292453765869, "learning_rate": 0.00014569658093004935, "loss": 0.8736, "step": 502 }, { "epoch": 0.19861796643632773, "grad_norm": 0.5622652173042297, "learning_rate": 0.0001454982834675802, "loss": 1.1759, "step": 503 }, { "epoch": 0.19901283316880553, "grad_norm": 0.5641468167304993, "learning_rate": 0.00014529976013625482, "loss": 0.9721, "step": 504 }, { "epoch": 0.19940769990128332, "grad_norm": 0.49418380856513977, "learning_rate": 0.00014510101192161018, "loss": 0.8389, "step": 505 }, { "epoch": 0.19980256663376111, "grad_norm": 0.47910451889038086, "learning_rate": 0.0001449020398102996, "loss": 0.9339, "step": 506 }, { "epoch": 0.20019743336623888, "grad_norm": 0.5156650543212891, "learning_rate": 0.00014470284479008782, "loss": 0.9458, "step": 507 }, { "epoch": 0.20059230009871667, "grad_norm": 0.4681549072265625, "learning_rate": 0.00014450342784984633, "loss": 0.8954, "step": 508 }, { "epoch": 0.20098716683119447, "grad_norm": 0.5173560380935669, "learning_rate": 0.00014430378997954817, "loss": 1.0272, "step": 509 }, { "epoch": 0.20138203356367226, "grad_norm": 0.5966143012046814, "learning_rate": 0.00014410393217026318, "loss": 0.8915, "step": 510 }, { "epoch": 0.20177690029615006, "grad_norm": 0.5026108026504517, "learning_rate": 0.00014390385541415308, "loss": 0.9169, "step": 511 }, { "epoch": 0.20217176702862782, "grad_norm": 0.4867290258407593, "learning_rate": 0.00014370356070446654, "loss": 1.133, "step": 512 }, { "epoch": 0.20256663376110562, "grad_norm": 0.4962225556373596, "learning_rate": 0.00014350304903553416, "loss": 0.9498, "step": 513 }, { "epoch": 0.2029615004935834, "grad_norm": 0.4522517919540405, "learning_rate": 0.00014330232140276366, "loss": 0.8796, "step": 514 }, { "epoch": 0.2033563672260612, "grad_norm": 0.4583631455898285, "learning_rate": 0.00014310137880263482, "loss": 0.9822, "step": 515 }, { "epoch": 0.203751233958539, "grad_norm": 0.5668156147003174, "learning_rate": 0.00014290022223269463, "loss": 0.8197, "step": 516 }, { "epoch": 0.20414610069101677, "grad_norm": 0.47949913144111633, "learning_rate": 0.0001426988526915523, "loss": 0.9435, "step": 517 }, { "epoch": 0.20454096742349456, "grad_norm": 0.497577965259552, "learning_rate": 0.00014249727117887425, "loss": 0.8928, "step": 518 }, { "epoch": 0.20493583415597236, "grad_norm": 0.4772360622882843, "learning_rate": 0.0001422954786953793, "loss": 1.0171, "step": 519 }, { "epoch": 0.20533070088845015, "grad_norm": 0.5616466999053955, "learning_rate": 0.0001420934762428335, "loss": 0.9387, "step": 520 }, { "epoch": 0.20572556762092795, "grad_norm": 0.6362606287002563, "learning_rate": 0.00014189126482404532, "loss": 0.7291, "step": 521 }, { "epoch": 0.2061204343534057, "grad_norm": 0.4801354706287384, "learning_rate": 0.00014168884544286053, "loss": 0.7701, "step": 522 }, { "epoch": 0.2065153010858835, "grad_norm": 0.45586976408958435, "learning_rate": 0.0001414862191041574, "loss": 0.9861, "step": 523 }, { "epoch": 0.2069101678183613, "grad_norm": 0.5662283301353455, "learning_rate": 0.00014128338681384153, "loss": 0.9726, "step": 524 }, { "epoch": 0.2073050345508391, "grad_norm": 0.5532451868057251, "learning_rate": 0.00014108034957884094, "loss": 1.2107, "step": 525 }, { "epoch": 0.2076999012833169, "grad_norm": 0.5659233331680298, "learning_rate": 0.0001408771084071012, "loss": 0.9739, "step": 526 }, { "epoch": 0.20809476801579466, "grad_norm": 0.5252379775047302, "learning_rate": 0.00014067366430758004, "loss": 0.9744, "step": 527 }, { "epoch": 0.20848963474827245, "grad_norm": 0.6081037521362305, "learning_rate": 0.0001404700182902428, "loss": 1.0524, "step": 528 }, { "epoch": 0.20888450148075025, "grad_norm": 0.6748234033584595, "learning_rate": 0.0001402661713660571, "loss": 0.9468, "step": 529 }, { "epoch": 0.20927936821322804, "grad_norm": 0.4868841767311096, "learning_rate": 0.00014006212454698797, "loss": 1.0739, "step": 530 }, { "epoch": 0.20967423494570583, "grad_norm": 0.5020860433578491, "learning_rate": 0.00013985787884599282, "loss": 0.9586, "step": 531 }, { "epoch": 0.2100691016781836, "grad_norm": 0.579229474067688, "learning_rate": 0.00013965343527701628, "loss": 0.8937, "step": 532 }, { "epoch": 0.2104639684106614, "grad_norm": 0.48879000544548035, "learning_rate": 0.00013944879485498538, "loss": 0.956, "step": 533 }, { "epoch": 0.2108588351431392, "grad_norm": 0.5132958292961121, "learning_rate": 0.00013924395859580432, "loss": 0.8762, "step": 534 }, { "epoch": 0.21125370187561698, "grad_norm": 0.5426986217498779, "learning_rate": 0.00013903892751634947, "loss": 1.0018, "step": 535 }, { "epoch": 0.21164856860809478, "grad_norm": 0.5325112342834473, "learning_rate": 0.0001388337026344645, "loss": 0.9227, "step": 536 }, { "epoch": 0.21204343534057254, "grad_norm": 0.5148372650146484, "learning_rate": 0.000138628284968955, "loss": 1.0617, "step": 537 }, { "epoch": 0.21243830207305034, "grad_norm": 0.4726095199584961, "learning_rate": 0.00013842267553958371, "loss": 0.9038, "step": 538 }, { "epoch": 0.21283316880552813, "grad_norm": 0.504017174243927, "learning_rate": 0.00013821687536706533, "loss": 1.0946, "step": 539 }, { "epoch": 0.21322803553800593, "grad_norm": 0.483732670545578, "learning_rate": 0.00013801088547306148, "loss": 0.7506, "step": 540 }, { "epoch": 0.21362290227048372, "grad_norm": 0.49207037687301636, "learning_rate": 0.00013780470688017562, "loss": 0.8905, "step": 541 }, { "epoch": 0.2140177690029615, "grad_norm": 0.5545893311500549, "learning_rate": 0.00013759834061194794, "loss": 0.808, "step": 542 }, { "epoch": 0.21441263573543928, "grad_norm": 0.657805323600769, "learning_rate": 0.00013739178769285032, "loss": 0.8566, "step": 543 }, { "epoch": 0.21480750246791708, "grad_norm": 0.5868344902992249, "learning_rate": 0.00013718504914828135, "loss": 0.9001, "step": 544 }, { "epoch": 0.21520236920039487, "grad_norm": 0.4816092550754547, "learning_rate": 0.00013697812600456093, "loss": 0.993, "step": 545 }, { "epoch": 0.21559723593287267, "grad_norm": 0.5755246877670288, "learning_rate": 0.00013677101928892554, "loss": 1.1376, "step": 546 }, { "epoch": 0.21599210266535043, "grad_norm": 0.5235198736190796, "learning_rate": 0.0001365637300295229, "loss": 1.036, "step": 547 }, { "epoch": 0.21638696939782823, "grad_norm": 0.4904315173625946, "learning_rate": 0.00013635625925540696, "loss": 1.0033, "step": 548 }, { "epoch": 0.21678183613030602, "grad_norm": 0.49426376819610596, "learning_rate": 0.00013614860799653276, "loss": 1.0455, "step": 549 }, { "epoch": 0.21717670286278382, "grad_norm": 0.5230404734611511, "learning_rate": 0.00013594077728375128, "loss": 0.8619, "step": 550 }, { "epoch": 0.2175715695952616, "grad_norm": 0.4252125918865204, "learning_rate": 0.0001357327681488045, "loss": 0.552, "step": 551 }, { "epoch": 0.21796643632773938, "grad_norm": 0.6176772117614746, "learning_rate": 0.00013552458162432003, "loss": 0.9374, "step": 552 }, { "epoch": 0.21836130306021717, "grad_norm": 0.476182222366333, "learning_rate": 0.00013531621874380613, "loss": 0.9189, "step": 553 }, { "epoch": 0.21875616979269497, "grad_norm": 0.5183379054069519, "learning_rate": 0.00013510768054164653, "loss": 0.9177, "step": 554 }, { "epoch": 0.21915103652517276, "grad_norm": 0.5251573920249939, "learning_rate": 0.00013489896805309542, "loss": 0.8619, "step": 555 }, { "epoch": 0.21954590325765055, "grad_norm": 0.508030116558075, "learning_rate": 0.00013469008231427207, "loss": 1.032, "step": 556 }, { "epoch": 0.21994076999012832, "grad_norm": 0.5486171245574951, "learning_rate": 0.00013448102436215592, "loss": 0.8481, "step": 557 }, { "epoch": 0.22033563672260612, "grad_norm": 0.5298491716384888, "learning_rate": 0.00013427179523458127, "loss": 0.7748, "step": 558 }, { "epoch": 0.2207305034550839, "grad_norm": 0.6214480996131897, "learning_rate": 0.00013406239597023225, "loss": 1.0314, "step": 559 }, { "epoch": 0.2211253701875617, "grad_norm": 0.5228508710861206, "learning_rate": 0.00013385282760863758, "loss": 0.9916, "step": 560 }, { "epoch": 0.2215202369200395, "grad_norm": 0.5693901777267456, "learning_rate": 0.00013364309119016538, "loss": 1.0026, "step": 561 }, { "epoch": 0.22191510365251726, "grad_norm": 0.4827022850513458, "learning_rate": 0.0001334331877560182, "loss": 0.8068, "step": 562 }, { "epoch": 0.22230997038499506, "grad_norm": 0.4529504179954529, "learning_rate": 0.00013322311834822756, "loss": 0.8318, "step": 563 }, { "epoch": 0.22270483711747285, "grad_norm": 0.5850614309310913, "learning_rate": 0.00013301288400964902, "loss": 0.8946, "step": 564 }, { "epoch": 0.22309970384995065, "grad_norm": 0.5779694318771362, "learning_rate": 0.0001328024857839569, "loss": 0.7861, "step": 565 }, { "epoch": 0.22349457058242844, "grad_norm": 0.5600647926330566, "learning_rate": 0.00013259192471563912, "loss": 0.7921, "step": 566 }, { "epoch": 0.2238894373149062, "grad_norm": 0.5468648076057434, "learning_rate": 0.00013238120184999195, "loss": 0.7732, "step": 567 }, { "epoch": 0.224284304047384, "grad_norm": 0.619243323802948, "learning_rate": 0.00013217031823311488, "loss": 1.2012, "step": 568 }, { "epoch": 0.2246791707798618, "grad_norm": 0.5478827953338623, "learning_rate": 0.00013195927491190554, "loss": 1.0279, "step": 569 }, { "epoch": 0.2250740375123396, "grad_norm": 0.520038366317749, "learning_rate": 0.00013174807293405428, "loss": 0.8806, "step": 570 }, { "epoch": 0.2254689042448174, "grad_norm": 0.46674588322639465, "learning_rate": 0.00013153671334803905, "loss": 0.7596, "step": 571 }, { "epoch": 0.22586377097729515, "grad_norm": 0.510772705078125, "learning_rate": 0.0001313251972031203, "loss": 0.873, "step": 572 }, { "epoch": 0.22625863770977295, "grad_norm": 0.4360644221305847, "learning_rate": 0.00013111352554933563, "loss": 0.9622, "step": 573 }, { "epoch": 0.22665350444225074, "grad_norm": 0.4816182255744934, "learning_rate": 0.00013090169943749476, "loss": 1.0487, "step": 574 }, { "epoch": 0.22704837117472854, "grad_norm": 0.48583704233169556, "learning_rate": 0.000130689719919174, "loss": 0.8041, "step": 575 }, { "epoch": 0.22744323790720633, "grad_norm": 0.5958275198936462, "learning_rate": 0.00013047758804671136, "loss": 0.9466, "step": 576 }, { "epoch": 0.2278381046396841, "grad_norm": 0.4736819863319397, "learning_rate": 0.00013026530487320113, "loss": 0.9483, "step": 577 }, { "epoch": 0.2282329713721619, "grad_norm": 0.5173781514167786, "learning_rate": 0.00013005287145248878, "loss": 0.8428, "step": 578 }, { "epoch": 0.22862783810463969, "grad_norm": 0.44686052203178406, "learning_rate": 0.00012984028883916552, "loss": 1.009, "step": 579 }, { "epoch": 0.22902270483711748, "grad_norm": 0.5016387701034546, "learning_rate": 0.00012962755808856342, "loss": 1.0226, "step": 580 }, { "epoch": 0.22941757156959527, "grad_norm": 0.6307184100151062, "learning_rate": 0.0001294146802567497, "loss": 0.9179, "step": 581 }, { "epoch": 0.22981243830207304, "grad_norm": 0.43451613187789917, "learning_rate": 0.0001292016564005219, "loss": 0.9468, "step": 582 }, { "epoch": 0.23020730503455084, "grad_norm": 0.5214070081710815, "learning_rate": 0.00012898848757740246, "loss": 0.9226, "step": 583 }, { "epoch": 0.23060217176702863, "grad_norm": 0.6036335825920105, "learning_rate": 0.00012877517484563344, "loss": 0.9585, "step": 584 }, { "epoch": 0.23099703849950642, "grad_norm": 0.5829451084136963, "learning_rate": 0.00012856171926417133, "loss": 1.2637, "step": 585 }, { "epoch": 0.23139190523198422, "grad_norm": 0.5726488828659058, "learning_rate": 0.0001283481218926818, "loss": 0.9223, "step": 586 }, { "epoch": 0.23178677196446199, "grad_norm": 0.501338005065918, "learning_rate": 0.0001281343837915344, "loss": 0.8892, "step": 587 }, { "epoch": 0.23218163869693978, "grad_norm": 0.5073863863945007, "learning_rate": 0.00012792050602179725, "loss": 0.8541, "step": 588 }, { "epoch": 0.23257650542941757, "grad_norm": 0.6474023461341858, "learning_rate": 0.00012770648964523194, "loss": 1.1276, "step": 589 }, { "epoch": 0.23297137216189537, "grad_norm": 0.48500892519950867, "learning_rate": 0.00012749233572428804, "loss": 1.0324, "step": 590 }, { "epoch": 0.23336623889437316, "grad_norm": 0.5808454155921936, "learning_rate": 0.00012727804532209803, "loss": 1.0817, "step": 591 }, { "epoch": 0.23376110562685093, "grad_norm": 0.49606916308403015, "learning_rate": 0.0001270636195024719, "loss": 0.9889, "step": 592 }, { "epoch": 0.23415597235932872, "grad_norm": 0.6760202050209045, "learning_rate": 0.00012684905932989186, "loss": 1.1171, "step": 593 }, { "epoch": 0.23455083909180652, "grad_norm": 0.44945040345191956, "learning_rate": 0.00012663436586950714, "loss": 0.9308, "step": 594 }, { "epoch": 0.2349457058242843, "grad_norm": 0.6141867637634277, "learning_rate": 0.00012641954018712863, "loss": 1.1256, "step": 595 }, { "epoch": 0.2353405725567621, "grad_norm": 0.5368825197219849, "learning_rate": 0.0001262045833492236, "loss": 1.0181, "step": 596 }, { "epoch": 0.23573543928923987, "grad_norm": 0.5402265787124634, "learning_rate": 0.00012598949642291047, "loss": 1.2135, "step": 597 }, { "epoch": 0.23613030602171767, "grad_norm": 0.5296156406402588, "learning_rate": 0.00012577428047595344, "loss": 0.8361, "step": 598 }, { "epoch": 0.23652517275419546, "grad_norm": 0.4741549491882324, "learning_rate": 0.00012555893657675718, "loss": 0.9022, "step": 599 }, { "epoch": 0.23692003948667326, "grad_norm": 0.4711611866950989, "learning_rate": 0.0001253434657943616, "loss": 1.0165, "step": 600 }, { "epoch": 0.23731490621915102, "grad_norm": 0.4527783989906311, "learning_rate": 0.00012512786919843648, "loss": 0.8954, "step": 601 }, { "epoch": 0.23770977295162882, "grad_norm": 0.5278099775314331, "learning_rate": 0.0001249121478592762, "loss": 1.0807, "step": 602 }, { "epoch": 0.2381046396841066, "grad_norm": 0.45427364110946655, "learning_rate": 0.00012469630284779438, "loss": 0.9407, "step": 603 }, { "epoch": 0.2384995064165844, "grad_norm": 0.5772978663444519, "learning_rate": 0.00012448033523551865, "loss": 1.2734, "step": 604 }, { "epoch": 0.2388943731490622, "grad_norm": 0.5643810033798218, "learning_rate": 0.00012426424609458518, "loss": 1.0125, "step": 605 }, { "epoch": 0.23928923988153997, "grad_norm": 0.5393880009651184, "learning_rate": 0.0001240480364977335, "loss": 0.8613, "step": 606 }, { "epoch": 0.23968410661401776, "grad_norm": 0.49924036860466003, "learning_rate": 0.0001238317075183011, "loss": 0.9862, "step": 607 }, { "epoch": 0.24007897334649556, "grad_norm": 0.4382825493812561, "learning_rate": 0.00012361526023021822, "loss": 0.8692, "step": 608 }, { "epoch": 0.24047384007897335, "grad_norm": 0.5655290484428406, "learning_rate": 0.00012339869570800232, "loss": 0.9122, "step": 609 }, { "epoch": 0.24086870681145114, "grad_norm": 0.5245158076286316, "learning_rate": 0.00012318201502675285, "loss": 0.8347, "step": 610 }, { "epoch": 0.2412635735439289, "grad_norm": 0.5116402506828308, "learning_rate": 0.00012296521926214596, "loss": 1.0317, "step": 611 }, { "epoch": 0.2416584402764067, "grad_norm": 0.4878910779953003, "learning_rate": 0.00012274830949042908, "loss": 0.8947, "step": 612 }, { "epoch": 0.2420533070088845, "grad_norm": 0.5890231132507324, "learning_rate": 0.00012253128678841568, "loss": 1.0683, "step": 613 }, { "epoch": 0.2424481737413623, "grad_norm": 0.4757651388645172, "learning_rate": 0.00012231415223347972, "loss": 0.9816, "step": 614 }, { "epoch": 0.2428430404738401, "grad_norm": 0.5415259599685669, "learning_rate": 0.0001220969069035506, "loss": 0.9583, "step": 615 }, { "epoch": 0.24323790720631785, "grad_norm": 0.5389688014984131, "learning_rate": 0.0001218795518771075, "loss": 0.7091, "step": 616 }, { "epoch": 0.24363277393879565, "grad_norm": 0.8125389814376831, "learning_rate": 0.00012166208823317427, "loss": 1.0042, "step": 617 }, { "epoch": 0.24402764067127344, "grad_norm": 0.5463893413543701, "learning_rate": 0.0001214445170513139, "loss": 1.0685, "step": 618 }, { "epoch": 0.24442250740375124, "grad_norm": 0.4813181161880493, "learning_rate": 0.0001212268394116233, "loss": 0.8353, "step": 619 }, { "epoch": 0.24481737413622903, "grad_norm": 0.4448351263999939, "learning_rate": 0.00012100905639472779, "loss": 1.0261, "step": 620 }, { "epoch": 0.2452122408687068, "grad_norm": 0.509598433971405, "learning_rate": 0.00012079116908177593, "loss": 1.0274, "step": 621 }, { "epoch": 0.2456071076011846, "grad_norm": 0.5108718276023865, "learning_rate": 0.00012057317855443395, "loss": 0.8464, "step": 622 }, { "epoch": 0.2460019743336624, "grad_norm": 0.4307985007762909, "learning_rate": 0.00012035508589488053, "loss": 0.9281, "step": 623 }, { "epoch": 0.24639684106614018, "grad_norm": 0.5565152764320374, "learning_rate": 0.00012013689218580132, "loss": 0.9916, "step": 624 }, { "epoch": 0.24679170779861798, "grad_norm": 0.7218130826950073, "learning_rate": 0.0001199185985103836, "loss": 1.0378, "step": 625 }, { "epoch": 0.24718657453109574, "grad_norm": 0.5790725350379944, "learning_rate": 0.00011970020595231101, "loss": 1.0886, "step": 626 }, { "epoch": 0.24758144126357354, "grad_norm": 0.5239992141723633, "learning_rate": 0.000119481715595758, "loss": 1.0763, "step": 627 }, { "epoch": 0.24797630799605133, "grad_norm": 0.5147425532341003, "learning_rate": 0.00011926312852538455, "loss": 1.1421, "step": 628 }, { "epoch": 0.24837117472852913, "grad_norm": 0.6474115252494812, "learning_rate": 0.0001190444458263307, "loss": 0.8813, "step": 629 }, { "epoch": 0.24876604146100692, "grad_norm": 0.46953749656677246, "learning_rate": 0.00011882566858421135, "loss": 0.6636, "step": 630 }, { "epoch": 0.2491609081934847, "grad_norm": 0.6197345852851868, "learning_rate": 0.00011860679788511064, "loss": 0.8935, "step": 631 }, { "epoch": 0.24955577492596248, "grad_norm": 0.5545244812965393, "learning_rate": 0.00011838783481557664, "loss": 0.7358, "step": 632 }, { "epoch": 0.24995064165844028, "grad_norm": 0.5150088667869568, "learning_rate": 0.00011816878046261615, "loss": 0.8935, "step": 633 }, { "epoch": 0.25034550839091807, "grad_norm": 0.5777970552444458, "learning_rate": 0.00011794963591368893, "loss": 0.9984, "step": 634 }, { "epoch": 0.25074037512339586, "grad_norm": 0.6271523237228394, "learning_rate": 0.00011773040225670256, "loss": 1.0425, "step": 635 }, { "epoch": 0.25113524185587366, "grad_norm": 0.5507854223251343, "learning_rate": 0.00011751108058000706, "loss": 1.017, "step": 636 }, { "epoch": 0.25153010858835145, "grad_norm": 0.5012635588645935, "learning_rate": 0.00011729167197238935, "loss": 1.0421, "step": 637 }, { "epoch": 0.25192497532082925, "grad_norm": 0.5927073955535889, "learning_rate": 0.0001170721775230679, "loss": 0.8634, "step": 638 }, { "epoch": 0.252319842053307, "grad_norm": 0.5467817783355713, "learning_rate": 0.0001168525983216873, "loss": 0.828, "step": 639 }, { "epoch": 0.2527147087857848, "grad_norm": 0.5477814078330994, "learning_rate": 0.00011663293545831302, "loss": 0.8878, "step": 640 }, { "epoch": 0.2531095755182626, "grad_norm": 0.5831199884414673, "learning_rate": 0.00011641319002342568, "loss": 0.9146, "step": 641 }, { "epoch": 0.25350444225074037, "grad_norm": 0.4861442446708679, "learning_rate": 0.00011619336310791586, "loss": 0.9527, "step": 642 }, { "epoch": 0.25389930898321816, "grad_norm": 0.4937233030796051, "learning_rate": 0.00011597345580307875, "loss": 0.8631, "step": 643 }, { "epoch": 0.25429417571569596, "grad_norm": 0.5312657952308655, "learning_rate": 0.00011575346920060846, "loss": 1.0702, "step": 644 }, { "epoch": 0.25468904244817375, "grad_norm": 0.5473558306694031, "learning_rate": 0.00011553340439259286, "loss": 0.8867, "step": 645 }, { "epoch": 0.25508390918065155, "grad_norm": 0.5094785094261169, "learning_rate": 0.00011531326247150803, "loss": 0.9356, "step": 646 }, { "epoch": 0.25547877591312934, "grad_norm": 0.4709779620170593, "learning_rate": 0.00011509304453021288, "loss": 0.8547, "step": 647 }, { "epoch": 0.2558736426456071, "grad_norm": 0.5478555560112, "learning_rate": 0.00011487275166194367, "loss": 1.013, "step": 648 }, { "epoch": 0.2562685093780849, "grad_norm": 0.5569909811019897, "learning_rate": 0.00011465238496030868, "loss": 0.9655, "step": 649 }, { "epoch": 0.25666337611056267, "grad_norm": 0.6182945370674133, "learning_rate": 0.00011443194551928266, "loss": 1.0553, "step": 650 }, { "epoch": 0.25705824284304046, "grad_norm": 0.49789944291114807, "learning_rate": 0.00011421143443320155, "loss": 1.0104, "step": 651 }, { "epoch": 0.25745310957551826, "grad_norm": 0.5329926013946533, "learning_rate": 0.00011399085279675687, "loss": 1.1935, "step": 652 }, { "epoch": 0.25784797630799605, "grad_norm": 0.6216697692871094, "learning_rate": 0.0001137702017049904, "loss": 0.9502, "step": 653 }, { "epoch": 0.25824284304047385, "grad_norm": 0.46455976366996765, "learning_rate": 0.00011354948225328877, "loss": 1.0217, "step": 654 }, { "epoch": 0.25863770977295164, "grad_norm": 0.5468802452087402, "learning_rate": 0.0001133286955373779, "loss": 0.9669, "step": 655 }, { "epoch": 0.25903257650542943, "grad_norm": 0.6737267971038818, "learning_rate": 0.00011310784265331769, "loss": 0.828, "step": 656 }, { "epoch": 0.25942744323790723, "grad_norm": 0.5070087909698486, "learning_rate": 0.00011288692469749649, "loss": 0.9463, "step": 657 }, { "epoch": 0.25982230997038497, "grad_norm": 0.5430976152420044, "learning_rate": 0.0001126659427666257, "loss": 0.9986, "step": 658 }, { "epoch": 0.26021717670286276, "grad_norm": 0.4886017143726349, "learning_rate": 0.00011244489795773432, "loss": 1.0702, "step": 659 }, { "epoch": 0.26061204343534056, "grad_norm": 0.4727013111114502, "learning_rate": 0.00011222379136816345, "loss": 0.9179, "step": 660 }, { "epoch": 0.26100691016781835, "grad_norm": 0.48751479387283325, "learning_rate": 0.00011200262409556097, "loss": 0.9176, "step": 661 }, { "epoch": 0.26140177690029615, "grad_norm": 0.511191189289093, "learning_rate": 0.00011178139723787597, "loss": 1.0286, "step": 662 }, { "epoch": 0.26179664363277394, "grad_norm": 0.48295167088508606, "learning_rate": 0.00011156011189335332, "loss": 0.9306, "step": 663 }, { "epoch": 0.26219151036525173, "grad_norm": 0.4702792167663574, "learning_rate": 0.00011133876916052821, "loss": 0.9666, "step": 664 }, { "epoch": 0.26258637709772953, "grad_norm": 0.5391889810562134, "learning_rate": 0.00011111737013822088, "loss": 0.9043, "step": 665 }, { "epoch": 0.2629812438302073, "grad_norm": 0.6235263347625732, "learning_rate": 0.00011089591592553082, "loss": 1.0091, "step": 666 }, { "epoch": 0.2633761105626851, "grad_norm": 0.5476643443107605, "learning_rate": 0.00011067440762183164, "loss": 0.9399, "step": 667 }, { "epoch": 0.26377097729516286, "grad_norm": 0.42856502532958984, "learning_rate": 0.00011045284632676536, "loss": 0.8856, "step": 668 }, { "epoch": 0.26416584402764065, "grad_norm": 0.6507248878479004, "learning_rate": 0.00011023123314023717, "loss": 1.0047, "step": 669 }, { "epoch": 0.26456071076011844, "grad_norm": 0.5612763166427612, "learning_rate": 0.00011000956916240985, "loss": 0.9552, "step": 670 }, { "epoch": 0.26495557749259624, "grad_norm": 0.6156942844390869, "learning_rate": 0.00010978785549369823, "loss": 0.7834, "step": 671 }, { "epoch": 0.26535044422507403, "grad_norm": 0.5807977914810181, "learning_rate": 0.00010956609323476399, "loss": 1.0304, "step": 672 }, { "epoch": 0.26574531095755183, "grad_norm": 0.4886478781700134, "learning_rate": 0.00010934428348650986, "loss": 1.0725, "step": 673 }, { "epoch": 0.2661401776900296, "grad_norm": 0.6015396118164062, "learning_rate": 0.00010912242735007441, "loss": 0.9459, "step": 674 }, { "epoch": 0.2665350444225074, "grad_norm": 0.5361436605453491, "learning_rate": 0.0001089005259268265, "loss": 1.0034, "step": 675 }, { "epoch": 0.2669299111549852, "grad_norm": 0.5702396035194397, "learning_rate": 0.00010867858031835975, "loss": 1.1566, "step": 676 }, { "epoch": 0.267324777887463, "grad_norm": 0.5034629702568054, "learning_rate": 0.00010845659162648723, "loss": 0.9551, "step": 677 }, { "epoch": 0.26771964461994074, "grad_norm": 0.49666622281074524, "learning_rate": 0.00010823456095323579, "loss": 0.9059, "step": 678 }, { "epoch": 0.26811451135241854, "grad_norm": 0.5842538475990295, "learning_rate": 0.00010801248940084074, "loss": 0.8893, "step": 679 }, { "epoch": 0.26850937808489633, "grad_norm": 0.5735609531402588, "learning_rate": 0.00010779037807174033, "loss": 0.983, "step": 680 }, { "epoch": 0.2689042448173741, "grad_norm": 0.5265182256698608, "learning_rate": 0.00010756822806857028, "loss": 0.8681, "step": 681 }, { "epoch": 0.2692991115498519, "grad_norm": 0.5432460308074951, "learning_rate": 0.00010734604049415822, "loss": 0.7613, "step": 682 }, { "epoch": 0.2696939782823297, "grad_norm": 0.5309749245643616, "learning_rate": 0.00010712381645151844, "loss": 1.1094, "step": 683 }, { "epoch": 0.2700888450148075, "grad_norm": 0.588448703289032, "learning_rate": 0.00010690155704384615, "loss": 1.082, "step": 684 }, { "epoch": 0.2704837117472853, "grad_norm": 0.4991750717163086, "learning_rate": 0.00010667926337451217, "loss": 0.8744, "step": 685 }, { "epoch": 0.2708785784797631, "grad_norm": 0.4775620698928833, "learning_rate": 0.0001064569365470574, "loss": 1.025, "step": 686 }, { "epoch": 0.2712734452122409, "grad_norm": 0.5142999291419983, "learning_rate": 0.00010623457766518736, "loss": 0.9704, "step": 687 }, { "epoch": 0.27166831194471863, "grad_norm": 0.48825737833976746, "learning_rate": 0.00010601218783276672, "loss": 1.122, "step": 688 }, { "epoch": 0.2720631786771964, "grad_norm": 0.5517759919166565, "learning_rate": 0.00010578976815381372, "loss": 0.8859, "step": 689 }, { "epoch": 0.2724580454096742, "grad_norm": 0.5661524534225464, "learning_rate": 0.00010556731973249485, "loss": 0.89, "step": 690 }, { "epoch": 0.272852912142152, "grad_norm": 0.5802194476127625, "learning_rate": 0.00010534484367311923, "loss": 0.8048, "step": 691 }, { "epoch": 0.2732477788746298, "grad_norm": 0.5603414177894592, "learning_rate": 0.00010512234108013319, "loss": 1.0869, "step": 692 }, { "epoch": 0.2736426456071076, "grad_norm": 0.5277503728866577, "learning_rate": 0.00010489981305811487, "loss": 0.7287, "step": 693 }, { "epoch": 0.2740375123395854, "grad_norm": 0.6045374870300293, "learning_rate": 0.00010467726071176853, "loss": 0.9417, "step": 694 }, { "epoch": 0.2744323790720632, "grad_norm": 0.6021410822868347, "learning_rate": 0.00010445468514591925, "loss": 1.1399, "step": 695 }, { "epoch": 0.274827245804541, "grad_norm": 0.5544920563697815, "learning_rate": 0.00010423208746550732, "loss": 0.7279, "step": 696 }, { "epoch": 0.2752221125370188, "grad_norm": 0.5612072348594666, "learning_rate": 0.00010400946877558293, "loss": 0.8715, "step": 697 }, { "epoch": 0.2756169792694965, "grad_norm": 0.5820972323417664, "learning_rate": 0.00010378683018130047, "loss": 0.9456, "step": 698 }, { "epoch": 0.2760118460019743, "grad_norm": 0.43969717621803284, "learning_rate": 0.0001035641727879131, "loss": 0.8192, "step": 699 }, { "epoch": 0.2764067127344521, "grad_norm": 0.6528908610343933, "learning_rate": 0.00010334149770076747, "loss": 0.9742, "step": 700 }, { "epoch": 0.2768015794669299, "grad_norm": 0.4668010473251343, "learning_rate": 0.00010311880602529794, "loss": 0.9471, "step": 701 }, { "epoch": 0.2771964461994077, "grad_norm": 0.5474753379821777, "learning_rate": 0.0001028960988670212, "loss": 0.9224, "step": 702 }, { "epoch": 0.2775913129318855, "grad_norm": 0.5672122240066528, "learning_rate": 0.00010267337733153089, "loss": 0.8441, "step": 703 }, { "epoch": 0.2779861796643633, "grad_norm": 0.5570011138916016, "learning_rate": 0.00010245064252449201, "loss": 0.9897, "step": 704 }, { "epoch": 0.2783810463968411, "grad_norm": 0.5113812685012817, "learning_rate": 0.0001022278955516354, "loss": 1.0066, "step": 705 }, { "epoch": 0.2787759131293189, "grad_norm": 0.48175865411758423, "learning_rate": 0.00010200513751875227, "loss": 0.9212, "step": 706 }, { "epoch": 0.27917077986179667, "grad_norm": 0.5643198490142822, "learning_rate": 0.00010178236953168885, "loss": 1.2371, "step": 707 }, { "epoch": 0.2795656465942744, "grad_norm": 0.5115451216697693, "learning_rate": 0.00010155959269634068, "loss": 0.9701, "step": 708 }, { "epoch": 0.2799605133267522, "grad_norm": 0.5253522396087646, "learning_rate": 0.00010133680811864727, "loss": 1.0685, "step": 709 }, { "epoch": 0.28035538005923, "grad_norm": 0.7160750031471252, "learning_rate": 0.00010111401690458654, "loss": 1.0837, "step": 710 }, { "epoch": 0.28035538005923, "eval_loss": 0.9457208514213562, "eval_runtime": 61.5761, "eval_samples_per_second": 17.328, "eval_steps_per_second": 8.672, "step": 710 } ], "logging_steps": 1, "max_steps": 1420, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 355, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.602799288203346e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }