{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.2450288646568315, "eval_steps": 500, "global_step": 3500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006414368184733803, "grad_norm": 100.24421691894531, "learning_rate": 5e-06, "loss": 6.7825, "step": 1 }, { "epoch": 0.0012828736369467607, "grad_norm": 96.55813598632812, "learning_rate": 1e-05, "loss": 6.6818, "step": 2 }, { "epoch": 0.001924310455420141, "grad_norm": 68.83885955810547, "learning_rate": 1.5e-05, "loss": 3.804, "step": 3 }, { "epoch": 0.0025657472738935213, "grad_norm": 38.46703338623047, "learning_rate": 2e-05, "loss": 1.9238, "step": 4 }, { "epoch": 0.003207184092366902, "grad_norm": 15.147366523742676, "learning_rate": 2.5e-05, "loss": 1.3866, "step": 5 }, { "epoch": 0.003848620910840282, "grad_norm": 14.376184463500977, "learning_rate": 3e-05, "loss": 1.5185, "step": 6 }, { "epoch": 0.004490057729313663, "grad_norm": 15.449289321899414, "learning_rate": 3.5e-05, "loss": 1.4292, "step": 7 }, { "epoch": 0.005131494547787043, "grad_norm": 11.545750617980957, "learning_rate": 4e-05, "loss": 1.0364, "step": 8 }, { "epoch": 0.005772931366260423, "grad_norm": 10.854671478271484, "learning_rate": 4.5e-05, "loss": 1.0705, "step": 9 }, { "epoch": 0.006414368184733804, "grad_norm": 11.654020309448242, "learning_rate": 5e-05, "loss": 0.9265, "step": 10 }, { "epoch": 0.007055805003207184, "grad_norm": 10.016984939575195, "learning_rate": 4.998928647953718e-05, "loss": 0.9207, "step": 11 }, { "epoch": 0.007697241821680564, "grad_norm": 8.052754402160645, "learning_rate": 4.997857295907435e-05, "loss": 0.7656, "step": 12 }, { "epoch": 0.008338678640153944, "grad_norm": 5.5707855224609375, "learning_rate": 4.996785943861153e-05, "loss": 0.7391, "step": 13 }, { "epoch": 0.008980115458627326, "grad_norm": 4.301912307739258, "learning_rate": 4.995714591814871e-05, "loss": 0.7591, "step": 14 }, { "epoch": 0.009621552277100705, "grad_norm": 2.8485121726989746, "learning_rate": 4.994643239768588e-05, "loss": 0.5702, "step": 15 }, { "epoch": 0.010262989095574085, "grad_norm": 3.208489418029785, "learning_rate": 4.993571887722306e-05, "loss": 0.7505, "step": 16 }, { "epoch": 0.010904425914047467, "grad_norm": 2.344665050506592, "learning_rate": 4.9925005356760236e-05, "loss": 0.5729, "step": 17 }, { "epoch": 0.011545862732520847, "grad_norm": 2.4727344512939453, "learning_rate": 4.9914291836297414e-05, "loss": 0.6388, "step": 18 }, { "epoch": 0.012187299550994226, "grad_norm": 2.681884288787842, "learning_rate": 4.9903578315834585e-05, "loss": 0.689, "step": 19 }, { "epoch": 0.012828736369467608, "grad_norm": 2.1392059326171875, "learning_rate": 4.9892864795371764e-05, "loss": 0.5993, "step": 20 }, { "epoch": 0.013470173187940988, "grad_norm": 2.282071828842163, "learning_rate": 4.988215127490894e-05, "loss": 0.6194, "step": 21 }, { "epoch": 0.014111610006414367, "grad_norm": 2.0227839946746826, "learning_rate": 4.987143775444611e-05, "loss": 0.5856, "step": 22 }, { "epoch": 0.014753046824887749, "grad_norm": 2.0284602642059326, "learning_rate": 4.986072423398329e-05, "loss": 0.6496, "step": 23 }, { "epoch": 0.015394483643361129, "grad_norm": 1.9359203577041626, "learning_rate": 4.985001071352047e-05, "loss": 0.5761, "step": 24 }, { "epoch": 0.01603592046183451, "grad_norm": 2.2996370792388916, "learning_rate": 4.983929719305764e-05, "loss": 0.6577, "step": 25 }, { "epoch": 0.01667735728030789, "grad_norm": 2.2415122985839844, "learning_rate": 4.982858367259482e-05, "loss": 0.6267, "step": 26 }, { "epoch": 0.01731879409878127, "grad_norm": 1.9243086576461792, "learning_rate": 4.9817870152132e-05, "loss": 0.5432, "step": 27 }, { "epoch": 0.01796023091725465, "grad_norm": 2.2558109760284424, "learning_rate": 4.980715663166917e-05, "loss": 0.7068, "step": 28 }, { "epoch": 0.01860166773572803, "grad_norm": 2.1993391513824463, "learning_rate": 4.979644311120634e-05, "loss": 0.5604, "step": 29 }, { "epoch": 0.01924310455420141, "grad_norm": 2.469998359680176, "learning_rate": 4.978572959074352e-05, "loss": 0.6022, "step": 30 }, { "epoch": 0.019884541372674792, "grad_norm": 2.1126129627227783, "learning_rate": 4.9775016070280696e-05, "loss": 0.6629, "step": 31 }, { "epoch": 0.02052597819114817, "grad_norm": 2.17244291305542, "learning_rate": 4.976430254981787e-05, "loss": 0.6701, "step": 32 }, { "epoch": 0.021167415009621552, "grad_norm": 1.9778406620025635, "learning_rate": 4.9753589029355046e-05, "loss": 0.5202, "step": 33 }, { "epoch": 0.021808851828094934, "grad_norm": 1.877848505973816, "learning_rate": 4.9742875508892224e-05, "loss": 0.5952, "step": 34 }, { "epoch": 0.02245028864656831, "grad_norm": 1.7970020771026611, "learning_rate": 4.97321619884294e-05, "loss": 0.5078, "step": 35 }, { "epoch": 0.023091725465041693, "grad_norm": 1.8294093608856201, "learning_rate": 4.972144846796657e-05, "loss": 0.5333, "step": 36 }, { "epoch": 0.023733162283515075, "grad_norm": 1.8300261497497559, "learning_rate": 4.971073494750375e-05, "loss": 0.479, "step": 37 }, { "epoch": 0.024374599101988453, "grad_norm": 2.0693013668060303, "learning_rate": 4.970002142704093e-05, "loss": 0.514, "step": 38 }, { "epoch": 0.025016035920461834, "grad_norm": 2.3574793338775635, "learning_rate": 4.96893079065781e-05, "loss": 0.7274, "step": 39 }, { "epoch": 0.025657472738935216, "grad_norm": 2.061659097671509, "learning_rate": 4.967859438611528e-05, "loss": 0.6441, "step": 40 }, { "epoch": 0.026298909557408594, "grad_norm": 1.825835943222046, "learning_rate": 4.966788086565246e-05, "loss": 0.588, "step": 41 }, { "epoch": 0.026940346375881975, "grad_norm": 1.854544997215271, "learning_rate": 4.965716734518963e-05, "loss": 0.5601, "step": 42 }, { "epoch": 0.027581783194355357, "grad_norm": 1.9829517602920532, "learning_rate": 4.964645382472681e-05, "loss": 0.5665, "step": 43 }, { "epoch": 0.028223220012828735, "grad_norm": 1.902833342552185, "learning_rate": 4.9635740304263985e-05, "loss": 0.4878, "step": 44 }, { "epoch": 0.028864656831302116, "grad_norm": 2.017401695251465, "learning_rate": 4.9625026783801156e-05, "loss": 0.5003, "step": 45 }, { "epoch": 0.029506093649775498, "grad_norm": 2.052668809890747, "learning_rate": 4.9614313263338334e-05, "loss": 0.5292, "step": 46 }, { "epoch": 0.030147530468248876, "grad_norm": 1.8866446018218994, "learning_rate": 4.960359974287551e-05, "loss": 0.5227, "step": 47 }, { "epoch": 0.030788967286722257, "grad_norm": 1.8989174365997314, "learning_rate": 4.959288622241269e-05, "loss": 0.6582, "step": 48 }, { "epoch": 0.03143040410519564, "grad_norm": 1.8427460193634033, "learning_rate": 4.958217270194986e-05, "loss": 0.6119, "step": 49 }, { "epoch": 0.03207184092366902, "grad_norm": 2.160384178161621, "learning_rate": 4.957145918148704e-05, "loss": 0.6403, "step": 50 }, { "epoch": 0.0327132777421424, "grad_norm": 1.9759780168533325, "learning_rate": 4.956074566102422e-05, "loss": 0.6121, "step": 51 }, { "epoch": 0.03335471456061578, "grad_norm": 1.8529947996139526, "learning_rate": 4.955003214056139e-05, "loss": 0.5713, "step": 52 }, { "epoch": 0.03399615137908916, "grad_norm": 1.8441227674484253, "learning_rate": 4.953931862009857e-05, "loss": 0.5685, "step": 53 }, { "epoch": 0.03463758819756254, "grad_norm": 2.002082347869873, "learning_rate": 4.9528605099635746e-05, "loss": 0.5612, "step": 54 }, { "epoch": 0.03527902501603592, "grad_norm": 1.7718000411987305, "learning_rate": 4.951789157917292e-05, "loss": 0.5534, "step": 55 }, { "epoch": 0.0359204618345093, "grad_norm": 1.929713249206543, "learning_rate": 4.9507178058710096e-05, "loss": 0.6793, "step": 56 }, { "epoch": 0.036561898652982684, "grad_norm": 2.078444242477417, "learning_rate": 4.9496464538247274e-05, "loss": 0.6606, "step": 57 }, { "epoch": 0.03720333547145606, "grad_norm": 1.7518585920333862, "learning_rate": 4.948575101778445e-05, "loss": 0.5421, "step": 58 }, { "epoch": 0.03784477228992944, "grad_norm": 2.0391950607299805, "learning_rate": 4.947503749732162e-05, "loss": 0.711, "step": 59 }, { "epoch": 0.03848620910840282, "grad_norm": 1.849752426147461, "learning_rate": 4.94643239768588e-05, "loss": 0.6402, "step": 60 }, { "epoch": 0.0391276459268762, "grad_norm": 1.728917121887207, "learning_rate": 4.945361045639598e-05, "loss": 0.5278, "step": 61 }, { "epoch": 0.039769082745349585, "grad_norm": 1.674017071723938, "learning_rate": 4.9442896935933144e-05, "loss": 0.4939, "step": 62 }, { "epoch": 0.040410519563822966, "grad_norm": 2.1079559326171875, "learning_rate": 4.943218341547032e-05, "loss": 0.7008, "step": 63 }, { "epoch": 0.04105195638229634, "grad_norm": 1.6876856088638306, "learning_rate": 4.94214698950075e-05, "loss": 0.4802, "step": 64 }, { "epoch": 0.04169339320076972, "grad_norm": 1.77272367477417, "learning_rate": 4.941075637454468e-05, "loss": 0.5, "step": 65 }, { "epoch": 0.042334830019243104, "grad_norm": 1.8971494436264038, "learning_rate": 4.940004285408185e-05, "loss": 0.5228, "step": 66 }, { "epoch": 0.042976266837716486, "grad_norm": 2.0170295238494873, "learning_rate": 4.938932933361903e-05, "loss": 0.5638, "step": 67 }, { "epoch": 0.04361770365618987, "grad_norm": 1.8261733055114746, "learning_rate": 4.9378615813156206e-05, "loss": 0.5034, "step": 68 }, { "epoch": 0.04425914047466325, "grad_norm": 1.812777042388916, "learning_rate": 4.936790229269338e-05, "loss": 0.5528, "step": 69 }, { "epoch": 0.04490057729313662, "grad_norm": 1.7822304964065552, "learning_rate": 4.9357188772230556e-05, "loss": 0.5051, "step": 70 }, { "epoch": 0.045542014111610005, "grad_norm": 1.8895049095153809, "learning_rate": 4.9346475251767734e-05, "loss": 0.5525, "step": 71 }, { "epoch": 0.046183450930083386, "grad_norm": 1.8160299062728882, "learning_rate": 4.9335761731304905e-05, "loss": 0.4912, "step": 72 }, { "epoch": 0.04682488774855677, "grad_norm": 1.7559746503829956, "learning_rate": 4.9325048210842083e-05, "loss": 0.5541, "step": 73 }, { "epoch": 0.04746632456703015, "grad_norm": 2.0638630390167236, "learning_rate": 4.931433469037926e-05, "loss": 0.5941, "step": 74 }, { "epoch": 0.04810776138550353, "grad_norm": 1.5790079832077026, "learning_rate": 4.930362116991643e-05, "loss": 0.4544, "step": 75 }, { "epoch": 0.048749198203976905, "grad_norm": 1.7963168621063232, "learning_rate": 4.929290764945361e-05, "loss": 0.5108, "step": 76 }, { "epoch": 0.04939063502245029, "grad_norm": 1.6238653659820557, "learning_rate": 4.928219412899079e-05, "loss": 0.4633, "step": 77 }, { "epoch": 0.05003207184092367, "grad_norm": 1.7622190713882446, "learning_rate": 4.927148060852797e-05, "loss": 0.4804, "step": 78 }, { "epoch": 0.05067350865939705, "grad_norm": 1.9001740217208862, "learning_rate": 4.926076708806514e-05, "loss": 0.6196, "step": 79 }, { "epoch": 0.05131494547787043, "grad_norm": 1.954129695892334, "learning_rate": 4.925005356760232e-05, "loss": 0.6043, "step": 80 }, { "epoch": 0.05195638229634381, "grad_norm": 2.06662654876709, "learning_rate": 4.9239340047139495e-05, "loss": 0.703, "step": 81 }, { "epoch": 0.05259781911481719, "grad_norm": 1.7040766477584839, "learning_rate": 4.9228626526676666e-05, "loss": 0.5438, "step": 82 }, { "epoch": 0.05323925593329057, "grad_norm": 1.6499028205871582, "learning_rate": 4.9217913006213845e-05, "loss": 0.5822, "step": 83 }, { "epoch": 0.05388069275176395, "grad_norm": 1.6216635704040527, "learning_rate": 4.920719948575102e-05, "loss": 0.556, "step": 84 }, { "epoch": 0.05452212957023733, "grad_norm": 1.5636239051818848, "learning_rate": 4.9196485965288194e-05, "loss": 0.5218, "step": 85 }, { "epoch": 0.055163566388710714, "grad_norm": 1.6300181150436401, "learning_rate": 4.918577244482537e-05, "loss": 0.4711, "step": 86 }, { "epoch": 0.055805003207184095, "grad_norm": 1.9622509479522705, "learning_rate": 4.917505892436255e-05, "loss": 0.5746, "step": 87 }, { "epoch": 0.05644644002565747, "grad_norm": 2.0733256340026855, "learning_rate": 4.916434540389973e-05, "loss": 0.7158, "step": 88 }, { "epoch": 0.05708787684413085, "grad_norm": 1.93630051612854, "learning_rate": 4.91536318834369e-05, "loss": 0.5604, "step": 89 }, { "epoch": 0.05772931366260423, "grad_norm": 1.981084942817688, "learning_rate": 4.914291836297408e-05, "loss": 0.702, "step": 90 }, { "epoch": 0.058370750481077614, "grad_norm": 1.6981867551803589, "learning_rate": 4.9132204842511256e-05, "loss": 0.5162, "step": 91 }, { "epoch": 0.059012187299550996, "grad_norm": 1.8542778491973877, "learning_rate": 4.912149132204843e-05, "loss": 0.6608, "step": 92 }, { "epoch": 0.05965362411802438, "grad_norm": 1.5656605958938599, "learning_rate": 4.9110777801585606e-05, "loss": 0.5578, "step": 93 }, { "epoch": 0.06029506093649775, "grad_norm": 1.6726746559143066, "learning_rate": 4.9100064281122784e-05, "loss": 0.5208, "step": 94 }, { "epoch": 0.06093649775497113, "grad_norm": 1.5304489135742188, "learning_rate": 4.9089350760659955e-05, "loss": 0.5135, "step": 95 }, { "epoch": 0.061577934573444515, "grad_norm": 1.6554018259048462, "learning_rate": 4.907863724019713e-05, "loss": 0.4837, "step": 96 }, { "epoch": 0.062219371391917896, "grad_norm": 1.7143610715866089, "learning_rate": 4.9067923719734305e-05, "loss": 0.5544, "step": 97 }, { "epoch": 0.06286080821039128, "grad_norm": 1.6658152341842651, "learning_rate": 4.905721019927148e-05, "loss": 0.4896, "step": 98 }, { "epoch": 0.06350224502886466, "grad_norm": 1.8784583806991577, "learning_rate": 4.9046496678808654e-05, "loss": 0.5899, "step": 99 }, { "epoch": 0.06414368184733804, "grad_norm": 1.751034140586853, "learning_rate": 4.903578315834583e-05, "loss": 0.5696, "step": 100 }, { "epoch": 0.06478511866581142, "grad_norm": 1.7741023302078247, "learning_rate": 4.902506963788301e-05, "loss": 0.5333, "step": 101 }, { "epoch": 0.0654265554842848, "grad_norm": 1.7069830894470215, "learning_rate": 4.901435611742018e-05, "loss": 0.4964, "step": 102 }, { "epoch": 0.06606799230275817, "grad_norm": 1.9073753356933594, "learning_rate": 4.900364259695736e-05, "loss": 0.7719, "step": 103 }, { "epoch": 0.06670942912123155, "grad_norm": 1.4832751750946045, "learning_rate": 4.899292907649454e-05, "loss": 0.4515, "step": 104 }, { "epoch": 0.06735086593970493, "grad_norm": 1.600658655166626, "learning_rate": 4.8982215556031716e-05, "loss": 0.4652, "step": 105 }, { "epoch": 0.06799230275817832, "grad_norm": 1.5580891370773315, "learning_rate": 4.897150203556889e-05, "loss": 0.4924, "step": 106 }, { "epoch": 0.0686337395766517, "grad_norm": 1.8145841360092163, "learning_rate": 4.8960788515106066e-05, "loss": 0.5887, "step": 107 }, { "epoch": 0.06927517639512508, "grad_norm": 1.5942411422729492, "learning_rate": 4.8950074994643244e-05, "loss": 0.4504, "step": 108 }, { "epoch": 0.06991661321359846, "grad_norm": 1.65703547000885, "learning_rate": 4.8939361474180415e-05, "loss": 0.5335, "step": 109 }, { "epoch": 0.07055805003207184, "grad_norm": 1.7898274660110474, "learning_rate": 4.8928647953717594e-05, "loss": 0.4998, "step": 110 }, { "epoch": 0.07119948685054522, "grad_norm": 1.6554588079452515, "learning_rate": 4.891793443325477e-05, "loss": 0.4639, "step": 111 }, { "epoch": 0.0718409236690186, "grad_norm": 1.7719485759735107, "learning_rate": 4.890722091279194e-05, "loss": 0.5203, "step": 112 }, { "epoch": 0.07248236048749199, "grad_norm": 1.725519061088562, "learning_rate": 4.889650739232912e-05, "loss": 0.5078, "step": 113 }, { "epoch": 0.07312379730596537, "grad_norm": 1.5903488397598267, "learning_rate": 4.88857938718663e-05, "loss": 0.4762, "step": 114 }, { "epoch": 0.07376523412443874, "grad_norm": 1.9805841445922852, "learning_rate": 4.887508035140347e-05, "loss": 0.686, "step": 115 }, { "epoch": 0.07440667094291212, "grad_norm": 1.9111052751541138, "learning_rate": 4.886436683094065e-05, "loss": 0.6207, "step": 116 }, { "epoch": 0.0750481077613855, "grad_norm": 1.7614665031433105, "learning_rate": 4.885365331047783e-05, "loss": 0.593, "step": 117 }, { "epoch": 0.07568954457985888, "grad_norm": 1.7120221853256226, "learning_rate": 4.8842939790015005e-05, "loss": 0.5144, "step": 118 }, { "epoch": 0.07633098139833226, "grad_norm": 1.5200276374816895, "learning_rate": 4.883222626955218e-05, "loss": 0.4817, "step": 119 }, { "epoch": 0.07697241821680564, "grad_norm": 1.6384608745574951, "learning_rate": 4.8821512749089355e-05, "loss": 0.4963, "step": 120 }, { "epoch": 0.07761385503527903, "grad_norm": 1.6416105031967163, "learning_rate": 4.881079922862653e-05, "loss": 0.5578, "step": 121 }, { "epoch": 0.0782552918537524, "grad_norm": 1.7692714929580688, "learning_rate": 4.8800085708163704e-05, "loss": 0.5559, "step": 122 }, { "epoch": 0.07889672867222579, "grad_norm": 1.5257912874221802, "learning_rate": 4.878937218770088e-05, "loss": 0.5137, "step": 123 }, { "epoch": 0.07953816549069917, "grad_norm": 1.908044695854187, "learning_rate": 4.877865866723806e-05, "loss": 0.593, "step": 124 }, { "epoch": 0.08017960230917255, "grad_norm": 1.515125036239624, "learning_rate": 4.876794514677523e-05, "loss": 0.4453, "step": 125 }, { "epoch": 0.08082103912764593, "grad_norm": 1.7736637592315674, "learning_rate": 4.875723162631241e-05, "loss": 0.4846, "step": 126 }, { "epoch": 0.0814624759461193, "grad_norm": 1.615708589553833, "learning_rate": 4.874651810584959e-05, "loss": 0.5017, "step": 127 }, { "epoch": 0.08210391276459268, "grad_norm": 2.014517068862915, "learning_rate": 4.873580458538676e-05, "loss": 0.6698, "step": 128 }, { "epoch": 0.08274534958306606, "grad_norm": 1.795600414276123, "learning_rate": 4.872509106492394e-05, "loss": 0.6084, "step": 129 }, { "epoch": 0.08338678640153944, "grad_norm": 1.710478663444519, "learning_rate": 4.8714377544461116e-05, "loss": 0.5265, "step": 130 }, { "epoch": 0.08402822322001283, "grad_norm": 1.738063097000122, "learning_rate": 4.870366402399829e-05, "loss": 0.6281, "step": 131 }, { "epoch": 0.08466966003848621, "grad_norm": 1.8740025758743286, "learning_rate": 4.869295050353546e-05, "loss": 0.5187, "step": 132 }, { "epoch": 0.08531109685695959, "grad_norm": 1.6442103385925293, "learning_rate": 4.868223698307264e-05, "loss": 0.5803, "step": 133 }, { "epoch": 0.08595253367543297, "grad_norm": 1.354164481163025, "learning_rate": 4.8671523462609815e-05, "loss": 0.4205, "step": 134 }, { "epoch": 0.08659397049390635, "grad_norm": 1.6383416652679443, "learning_rate": 4.866080994214699e-05, "loss": 0.6328, "step": 135 }, { "epoch": 0.08723540731237973, "grad_norm": 1.4804211854934692, "learning_rate": 4.8650096421684165e-05, "loss": 0.4894, "step": 136 }, { "epoch": 0.08787684413085312, "grad_norm": 1.3127148151397705, "learning_rate": 4.863938290122134e-05, "loss": 0.4483, "step": 137 }, { "epoch": 0.0885182809493265, "grad_norm": 1.3717304468154907, "learning_rate": 4.862866938075852e-05, "loss": 0.4546, "step": 138 }, { "epoch": 0.08915971776779986, "grad_norm": 1.5721876621246338, "learning_rate": 4.861795586029569e-05, "loss": 0.5355, "step": 139 }, { "epoch": 0.08980115458627325, "grad_norm": 1.7701294422149658, "learning_rate": 4.860724233983287e-05, "loss": 0.5459, "step": 140 }, { "epoch": 0.09044259140474663, "grad_norm": 1.5756542682647705, "learning_rate": 4.859652881937005e-05, "loss": 0.5362, "step": 141 }, { "epoch": 0.09108402822322001, "grad_norm": 1.7308592796325684, "learning_rate": 4.858581529890722e-05, "loss": 0.5462, "step": 142 }, { "epoch": 0.09172546504169339, "grad_norm": 1.7323704957962036, "learning_rate": 4.85751017784444e-05, "loss": 0.5971, "step": 143 }, { "epoch": 0.09236690186016677, "grad_norm": 1.7644233703613281, "learning_rate": 4.8564388257981576e-05, "loss": 0.6561, "step": 144 }, { "epoch": 0.09300833867864015, "grad_norm": 1.8449784517288208, "learning_rate": 4.855367473751875e-05, "loss": 0.6637, "step": 145 }, { "epoch": 0.09364977549711354, "grad_norm": 2.1125733852386475, "learning_rate": 4.8542961217055926e-05, "loss": 0.7949, "step": 146 }, { "epoch": 0.09429121231558692, "grad_norm": 1.59554922580719, "learning_rate": 4.8532247696593104e-05, "loss": 0.608, "step": 147 }, { "epoch": 0.0949326491340603, "grad_norm": 1.5676103830337524, "learning_rate": 4.852153417613028e-05, "loss": 0.578, "step": 148 }, { "epoch": 0.09557408595253368, "grad_norm": 2.0604071617126465, "learning_rate": 4.851082065566745e-05, "loss": 0.6887, "step": 149 }, { "epoch": 0.09621552277100706, "grad_norm": 1.6824266910552979, "learning_rate": 4.850010713520463e-05, "loss": 0.6655, "step": 150 }, { "epoch": 0.09685695958948044, "grad_norm": 1.5957653522491455, "learning_rate": 4.848939361474181e-05, "loss": 0.5618, "step": 151 }, { "epoch": 0.09749839640795381, "grad_norm": 1.7735342979431152, "learning_rate": 4.847868009427898e-05, "loss": 0.5676, "step": 152 }, { "epoch": 0.09813983322642719, "grad_norm": 1.583139419555664, "learning_rate": 4.846796657381616e-05, "loss": 0.5539, "step": 153 }, { "epoch": 0.09878127004490057, "grad_norm": 1.7483869791030884, "learning_rate": 4.845725305335334e-05, "loss": 0.5336, "step": 154 }, { "epoch": 0.09942270686337396, "grad_norm": 1.647527813911438, "learning_rate": 4.844653953289051e-05, "loss": 0.5122, "step": 155 }, { "epoch": 0.10006414368184734, "grad_norm": 1.658677339553833, "learning_rate": 4.843582601242769e-05, "loss": 0.5881, "step": 156 }, { "epoch": 0.10070558050032072, "grad_norm": 1.8139889240264893, "learning_rate": 4.8425112491964865e-05, "loss": 0.5772, "step": 157 }, { "epoch": 0.1013470173187941, "grad_norm": 1.5739526748657227, "learning_rate": 4.8414398971502036e-05, "loss": 0.5019, "step": 158 }, { "epoch": 0.10198845413726748, "grad_norm": 1.8966543674468994, "learning_rate": 4.8403685451039214e-05, "loss": 0.7111, "step": 159 }, { "epoch": 0.10262989095574086, "grad_norm": 1.7432408332824707, "learning_rate": 4.839297193057639e-05, "loss": 0.4942, "step": 160 }, { "epoch": 0.10327132777421424, "grad_norm": 1.5300322771072388, "learning_rate": 4.838225841011357e-05, "loss": 0.5138, "step": 161 }, { "epoch": 0.10391276459268763, "grad_norm": 1.7612435817718506, "learning_rate": 4.837154488965074e-05, "loss": 0.4716, "step": 162 }, { "epoch": 0.10455420141116101, "grad_norm": 1.6488189697265625, "learning_rate": 4.836083136918792e-05, "loss": 0.61, "step": 163 }, { "epoch": 0.10519563822963438, "grad_norm": 1.4644861221313477, "learning_rate": 4.83501178487251e-05, "loss": 0.4923, "step": 164 }, { "epoch": 0.10583707504810776, "grad_norm": 1.6652438640594482, "learning_rate": 4.833940432826227e-05, "loss": 0.5812, "step": 165 }, { "epoch": 0.10647851186658114, "grad_norm": 1.6392529010772705, "learning_rate": 4.832869080779944e-05, "loss": 0.5971, "step": 166 }, { "epoch": 0.10711994868505452, "grad_norm": 1.5112555027008057, "learning_rate": 4.831797728733662e-05, "loss": 0.4704, "step": 167 }, { "epoch": 0.1077613855035279, "grad_norm": 1.689149260520935, "learning_rate": 4.83072637668738e-05, "loss": 0.6043, "step": 168 }, { "epoch": 0.10840282232200128, "grad_norm": 1.8384333848953247, "learning_rate": 4.829655024641097e-05, "loss": 0.6324, "step": 169 }, { "epoch": 0.10904425914047466, "grad_norm": 1.82729172706604, "learning_rate": 4.828583672594815e-05, "loss": 0.4672, "step": 170 }, { "epoch": 0.10968569595894805, "grad_norm": 1.9861725568771362, "learning_rate": 4.8275123205485325e-05, "loss": 0.63, "step": 171 }, { "epoch": 0.11032713277742143, "grad_norm": 1.7452824115753174, "learning_rate": 4.8264409685022497e-05, "loss": 0.5346, "step": 172 }, { "epoch": 0.11096856959589481, "grad_norm": 1.6078959703445435, "learning_rate": 4.8253696164559675e-05, "loss": 0.5335, "step": 173 }, { "epoch": 0.11161000641436819, "grad_norm": 1.5774822235107422, "learning_rate": 4.824298264409685e-05, "loss": 0.4992, "step": 174 }, { "epoch": 0.11225144323284157, "grad_norm": 1.4560508728027344, "learning_rate": 4.8232269123634024e-05, "loss": 0.4979, "step": 175 }, { "epoch": 0.11289288005131494, "grad_norm": 1.359175682067871, "learning_rate": 4.82215556031712e-05, "loss": 0.4501, "step": 176 }, { "epoch": 0.11353431686978832, "grad_norm": 1.5068504810333252, "learning_rate": 4.821084208270838e-05, "loss": 0.5067, "step": 177 }, { "epoch": 0.1141757536882617, "grad_norm": 1.4865458011627197, "learning_rate": 4.820012856224556e-05, "loss": 0.5185, "step": 178 }, { "epoch": 0.11481719050673508, "grad_norm": 1.4227818250656128, "learning_rate": 4.818941504178273e-05, "loss": 0.4615, "step": 179 }, { "epoch": 0.11545862732520847, "grad_norm": 1.6488735675811768, "learning_rate": 4.817870152131991e-05, "loss": 0.5709, "step": 180 }, { "epoch": 0.11610006414368185, "grad_norm": 1.6535853147506714, "learning_rate": 4.8167988000857086e-05, "loss": 0.507, "step": 181 }, { "epoch": 0.11674150096215523, "grad_norm": 1.476467251777649, "learning_rate": 4.815727448039426e-05, "loss": 0.482, "step": 182 }, { "epoch": 0.11738293778062861, "grad_norm": 1.6573824882507324, "learning_rate": 4.8146560959931436e-05, "loss": 0.5248, "step": 183 }, { "epoch": 0.11802437459910199, "grad_norm": 1.5171890258789062, "learning_rate": 4.8135847439468614e-05, "loss": 0.4931, "step": 184 }, { "epoch": 0.11866581141757537, "grad_norm": 1.7187572717666626, "learning_rate": 4.8125133919005785e-05, "loss": 0.6074, "step": 185 }, { "epoch": 0.11930724823604875, "grad_norm": 1.5924901962280273, "learning_rate": 4.8114420398542964e-05, "loss": 0.5417, "step": 186 }, { "epoch": 0.11994868505452214, "grad_norm": 1.626067042350769, "learning_rate": 4.810370687808014e-05, "loss": 0.6544, "step": 187 }, { "epoch": 0.1205901218729955, "grad_norm": 1.5036357641220093, "learning_rate": 4.809299335761731e-05, "loss": 0.5256, "step": 188 }, { "epoch": 0.12123155869146889, "grad_norm": 1.5415763854980469, "learning_rate": 4.808227983715449e-05, "loss": 0.4656, "step": 189 }, { "epoch": 0.12187299550994227, "grad_norm": 1.558143138885498, "learning_rate": 4.807156631669167e-05, "loss": 0.5377, "step": 190 }, { "epoch": 0.12251443232841565, "grad_norm": 1.6941472291946411, "learning_rate": 4.806085279622885e-05, "loss": 0.5004, "step": 191 }, { "epoch": 0.12315586914688903, "grad_norm": 1.5763695240020752, "learning_rate": 4.805013927576602e-05, "loss": 0.5145, "step": 192 }, { "epoch": 0.12379730596536241, "grad_norm": 1.7750403881072998, "learning_rate": 4.80394257553032e-05, "loss": 0.5104, "step": 193 }, { "epoch": 0.12443874278383579, "grad_norm": 1.478691577911377, "learning_rate": 4.8028712234840375e-05, "loss": 0.4572, "step": 194 }, { "epoch": 0.12508017960230916, "grad_norm": 1.8954867124557495, "learning_rate": 4.8017998714377547e-05, "loss": 0.6736, "step": 195 }, { "epoch": 0.12572161642078256, "grad_norm": 1.4616411924362183, "learning_rate": 4.8007285193914725e-05, "loss": 0.4848, "step": 196 }, { "epoch": 0.12636305323925592, "grad_norm": 1.4471367597579956, "learning_rate": 4.79965716734519e-05, "loss": 0.3714, "step": 197 }, { "epoch": 0.12700449005772932, "grad_norm": 1.6354637145996094, "learning_rate": 4.7985858152989074e-05, "loss": 0.5376, "step": 198 }, { "epoch": 0.1276459268762027, "grad_norm": 1.5469319820404053, "learning_rate": 4.7975144632526246e-05, "loss": 0.4332, "step": 199 }, { "epoch": 0.12828736369467608, "grad_norm": 1.4849071502685547, "learning_rate": 4.7964431112063424e-05, "loss": 0.4074, "step": 200 }, { "epoch": 0.12892880051314945, "grad_norm": 1.5249581336975098, "learning_rate": 4.79537175916006e-05, "loss": 0.4749, "step": 201 }, { "epoch": 0.12957023733162285, "grad_norm": 1.6055619716644287, "learning_rate": 4.794300407113777e-05, "loss": 0.4635, "step": 202 }, { "epoch": 0.1302116741500962, "grad_norm": 1.9342025518417358, "learning_rate": 4.793229055067495e-05, "loss": 0.7519, "step": 203 }, { "epoch": 0.1308531109685696, "grad_norm": 1.5930043458938599, "learning_rate": 4.792157703021213e-05, "loss": 0.5525, "step": 204 }, { "epoch": 0.13149454778704298, "grad_norm": 1.6431877613067627, "learning_rate": 4.79108635097493e-05, "loss": 0.49, "step": 205 }, { "epoch": 0.13213598460551634, "grad_norm": 1.547714114189148, "learning_rate": 4.790014998928648e-05, "loss": 0.4543, "step": 206 }, { "epoch": 0.13277742142398974, "grad_norm": 1.708211898803711, "learning_rate": 4.788943646882366e-05, "loss": 0.5831, "step": 207 }, { "epoch": 0.1334188582424631, "grad_norm": 1.9091277122497559, "learning_rate": 4.7878722948360835e-05, "loss": 0.7214, "step": 208 }, { "epoch": 0.1340602950609365, "grad_norm": 1.50388765335083, "learning_rate": 4.786800942789801e-05, "loss": 0.4154, "step": 209 }, { "epoch": 0.13470173187940987, "grad_norm": 1.813082218170166, "learning_rate": 4.7857295907435185e-05, "loss": 0.5793, "step": 210 }, { "epoch": 0.13534316869788326, "grad_norm": 1.3508574962615967, "learning_rate": 4.784658238697236e-05, "loss": 0.4632, "step": 211 }, { "epoch": 0.13598460551635663, "grad_norm": 1.6064634323120117, "learning_rate": 4.7835868866509534e-05, "loss": 0.6065, "step": 212 }, { "epoch": 0.13662604233483003, "grad_norm": 1.5153980255126953, "learning_rate": 4.782515534604671e-05, "loss": 0.593, "step": 213 }, { "epoch": 0.1372674791533034, "grad_norm": 1.5047849416732788, "learning_rate": 4.781444182558389e-05, "loss": 0.4729, "step": 214 }, { "epoch": 0.1379089159717768, "grad_norm": 1.4270846843719482, "learning_rate": 4.780372830512106e-05, "loss": 0.4759, "step": 215 }, { "epoch": 0.13855035279025016, "grad_norm": 1.6271865367889404, "learning_rate": 4.779301478465824e-05, "loss": 0.4772, "step": 216 }, { "epoch": 0.13919178960872355, "grad_norm": 1.555928111076355, "learning_rate": 4.778230126419542e-05, "loss": 0.5221, "step": 217 }, { "epoch": 0.13983322642719692, "grad_norm": 1.5493953227996826, "learning_rate": 4.7771587743732597e-05, "loss": 0.4841, "step": 218 }, { "epoch": 0.1404746632456703, "grad_norm": 1.4955663681030273, "learning_rate": 4.776087422326977e-05, "loss": 0.4319, "step": 219 }, { "epoch": 0.14111610006414368, "grad_norm": 1.8192116022109985, "learning_rate": 4.7750160702806946e-05, "loss": 0.5862, "step": 220 }, { "epoch": 0.14175753688261705, "grad_norm": 1.4944863319396973, "learning_rate": 4.7739447182344124e-05, "loss": 0.4772, "step": 221 }, { "epoch": 0.14239897370109045, "grad_norm": 1.5729892253875732, "learning_rate": 4.7728733661881296e-05, "loss": 0.4819, "step": 222 }, { "epoch": 0.14304041051956382, "grad_norm": 1.569934368133545, "learning_rate": 4.7718020141418474e-05, "loss": 0.4912, "step": 223 }, { "epoch": 0.1436818473380372, "grad_norm": 1.5215916633605957, "learning_rate": 4.770730662095565e-05, "loss": 0.4569, "step": 224 }, { "epoch": 0.14432328415651058, "grad_norm": 1.6247138977050781, "learning_rate": 4.769659310049282e-05, "loss": 0.5046, "step": 225 }, { "epoch": 0.14496472097498397, "grad_norm": 1.3567224740982056, "learning_rate": 4.768587958003e-05, "loss": 0.4812, "step": 226 }, { "epoch": 0.14560615779345734, "grad_norm": 1.3151541948318481, "learning_rate": 4.767516605956718e-05, "loss": 0.4605, "step": 227 }, { "epoch": 0.14624759461193074, "grad_norm": 1.5889521837234497, "learning_rate": 4.766445253910435e-05, "loss": 0.5891, "step": 228 }, { "epoch": 0.1468890314304041, "grad_norm": 1.6343984603881836, "learning_rate": 4.765373901864153e-05, "loss": 0.5235, "step": 229 }, { "epoch": 0.14753046824887747, "grad_norm": 1.8016682863235474, "learning_rate": 4.764302549817871e-05, "loss": 0.614, "step": 230 }, { "epoch": 0.14817190506735087, "grad_norm": 1.6019026041030884, "learning_rate": 4.7632311977715885e-05, "loss": 0.6102, "step": 231 }, { "epoch": 0.14881334188582424, "grad_norm": 1.2824311256408691, "learning_rate": 4.762159845725306e-05, "loss": 0.4532, "step": 232 }, { "epoch": 0.14945477870429763, "grad_norm": 1.407341480255127, "learning_rate": 4.7610884936790235e-05, "loss": 0.4788, "step": 233 }, { "epoch": 0.150096215522771, "grad_norm": 1.5635006427764893, "learning_rate": 4.7600171416327406e-05, "loss": 0.5387, "step": 234 }, { "epoch": 0.1507376523412444, "grad_norm": 1.475744605064392, "learning_rate": 4.758945789586458e-05, "loss": 0.5062, "step": 235 }, { "epoch": 0.15137908915971776, "grad_norm": 1.7191523313522339, "learning_rate": 4.7578744375401756e-05, "loss": 0.542, "step": 236 }, { "epoch": 0.15202052597819116, "grad_norm": 1.6239638328552246, "learning_rate": 4.7568030854938934e-05, "loss": 0.5773, "step": 237 }, { "epoch": 0.15266196279666452, "grad_norm": 1.6453572511672974, "learning_rate": 4.755731733447611e-05, "loss": 0.5528, "step": 238 }, { "epoch": 0.15330339961513792, "grad_norm": 1.5776801109313965, "learning_rate": 4.7546603814013283e-05, "loss": 0.5009, "step": 239 }, { "epoch": 0.1539448364336113, "grad_norm": 1.4142982959747314, "learning_rate": 4.753589029355046e-05, "loss": 0.5204, "step": 240 }, { "epoch": 0.15458627325208468, "grad_norm": 1.517731785774231, "learning_rate": 4.752517677308764e-05, "loss": 0.4615, "step": 241 }, { "epoch": 0.15522771007055805, "grad_norm": 1.6813066005706787, "learning_rate": 4.751446325262481e-05, "loss": 0.5141, "step": 242 }, { "epoch": 0.15586914688903142, "grad_norm": 1.346747875213623, "learning_rate": 4.750374973216199e-05, "loss": 0.4118, "step": 243 }, { "epoch": 0.1565105837075048, "grad_norm": 1.7371439933776855, "learning_rate": 4.749303621169917e-05, "loss": 0.6157, "step": 244 }, { "epoch": 0.15715202052597818, "grad_norm": 1.5112606287002563, "learning_rate": 4.748232269123634e-05, "loss": 0.5185, "step": 245 }, { "epoch": 0.15779345734445158, "grad_norm": 1.892873764038086, "learning_rate": 4.747160917077352e-05, "loss": 0.5661, "step": 246 }, { "epoch": 0.15843489416292494, "grad_norm": 1.711940050125122, "learning_rate": 4.7460895650310695e-05, "loss": 0.5514, "step": 247 }, { "epoch": 0.15907633098139834, "grad_norm": 1.6900933980941772, "learning_rate": 4.745018212984787e-05, "loss": 0.5047, "step": 248 }, { "epoch": 0.1597177677998717, "grad_norm": 1.5495864152908325, "learning_rate": 4.7439468609385045e-05, "loss": 0.5624, "step": 249 }, { "epoch": 0.1603592046183451, "grad_norm": 1.458254098892212, "learning_rate": 4.742875508892222e-05, "loss": 0.4973, "step": 250 }, { "epoch": 0.16100064143681847, "grad_norm": 1.6938390731811523, "learning_rate": 4.74180415684594e-05, "loss": 0.5221, "step": 251 }, { "epoch": 0.16164207825529187, "grad_norm": 1.4692200422286987, "learning_rate": 4.740732804799657e-05, "loss": 0.5531, "step": 252 }, { "epoch": 0.16228351507376523, "grad_norm": 1.8710209131240845, "learning_rate": 4.739661452753375e-05, "loss": 0.5527, "step": 253 }, { "epoch": 0.1629249518922386, "grad_norm": 1.467434048652649, "learning_rate": 4.738590100707093e-05, "loss": 0.4801, "step": 254 }, { "epoch": 0.163566388710712, "grad_norm": 1.6584354639053345, "learning_rate": 4.73751874866081e-05, "loss": 0.4924, "step": 255 }, { "epoch": 0.16420782552918536, "grad_norm": 1.5574162006378174, "learning_rate": 4.736447396614528e-05, "loss": 0.5502, "step": 256 }, { "epoch": 0.16484926234765876, "grad_norm": 1.4646837711334229, "learning_rate": 4.7353760445682456e-05, "loss": 0.4526, "step": 257 }, { "epoch": 0.16549069916613213, "grad_norm": 1.3951880931854248, "learning_rate": 4.734304692521963e-05, "loss": 0.5006, "step": 258 }, { "epoch": 0.16613213598460552, "grad_norm": 1.632028341293335, "learning_rate": 4.7332333404756806e-05, "loss": 0.559, "step": 259 }, { "epoch": 0.1667735728030789, "grad_norm": 1.3829830884933472, "learning_rate": 4.7321619884293984e-05, "loss": 0.4356, "step": 260 }, { "epoch": 0.16741500962155229, "grad_norm": 1.563115119934082, "learning_rate": 4.731090636383116e-05, "loss": 0.4778, "step": 261 }, { "epoch": 0.16805644644002565, "grad_norm": 1.5808372497558594, "learning_rate": 4.7300192843368333e-05, "loss": 0.4958, "step": 262 }, { "epoch": 0.16869788325849905, "grad_norm": 1.7043310403823853, "learning_rate": 4.728947932290551e-05, "loss": 0.5036, "step": 263 }, { "epoch": 0.16933932007697242, "grad_norm": 1.699259877204895, "learning_rate": 4.727876580244269e-05, "loss": 0.6092, "step": 264 }, { "epoch": 0.1699807568954458, "grad_norm": 1.723818063735962, "learning_rate": 4.726805228197986e-05, "loss": 0.6332, "step": 265 }, { "epoch": 0.17062219371391918, "grad_norm": 1.5261138677597046, "learning_rate": 4.725733876151704e-05, "loss": 0.4999, "step": 266 }, { "epoch": 0.17126363053239255, "grad_norm": 1.6044187545776367, "learning_rate": 4.724662524105422e-05, "loss": 0.5611, "step": 267 }, { "epoch": 0.17190506735086594, "grad_norm": 1.5518348217010498, "learning_rate": 4.723591172059139e-05, "loss": 0.567, "step": 268 }, { "epoch": 0.1725465041693393, "grad_norm": 1.5821868181228638, "learning_rate": 4.722519820012856e-05, "loss": 0.5228, "step": 269 }, { "epoch": 0.1731879409878127, "grad_norm": 1.4579048156738281, "learning_rate": 4.721448467966574e-05, "loss": 0.4526, "step": 270 }, { "epoch": 0.17382937780628607, "grad_norm": 1.369643211364746, "learning_rate": 4.7203771159202916e-05, "loss": 0.4661, "step": 271 }, { "epoch": 0.17447081462475947, "grad_norm": 1.5554405450820923, "learning_rate": 4.719305763874009e-05, "loss": 0.5052, "step": 272 }, { "epoch": 0.17511225144323284, "grad_norm": 1.3762422800064087, "learning_rate": 4.7182344118277266e-05, "loss": 0.4901, "step": 273 }, { "epoch": 0.17575368826170623, "grad_norm": 1.605282187461853, "learning_rate": 4.7171630597814444e-05, "loss": 0.4781, "step": 274 }, { "epoch": 0.1763951250801796, "grad_norm": 1.6735568046569824, "learning_rate": 4.7160917077351615e-05, "loss": 0.6231, "step": 275 }, { "epoch": 0.177036561898653, "grad_norm": 1.3934433460235596, "learning_rate": 4.7150203556888794e-05, "loss": 0.4993, "step": 276 }, { "epoch": 0.17767799871712636, "grad_norm": 1.4424147605895996, "learning_rate": 4.713949003642597e-05, "loss": 0.4909, "step": 277 }, { "epoch": 0.17831943553559973, "grad_norm": 1.7380441427230835, "learning_rate": 4.712877651596315e-05, "loss": 0.6072, "step": 278 }, { "epoch": 0.17896087235407312, "grad_norm": 1.5534104108810425, "learning_rate": 4.711806299550032e-05, "loss": 0.5808, "step": 279 }, { "epoch": 0.1796023091725465, "grad_norm": 1.4766466617584229, "learning_rate": 4.71073494750375e-05, "loss": 0.4876, "step": 280 }, { "epoch": 0.1802437459910199, "grad_norm": 1.4162263870239258, "learning_rate": 4.709663595457468e-05, "loss": 0.4514, "step": 281 }, { "epoch": 0.18088518280949326, "grad_norm": 1.6947705745697021, "learning_rate": 4.708592243411185e-05, "loss": 0.4628, "step": 282 }, { "epoch": 0.18152661962796665, "grad_norm": 1.5543221235275269, "learning_rate": 4.707520891364903e-05, "loss": 0.524, "step": 283 }, { "epoch": 0.18216805644644002, "grad_norm": 1.280588984489441, "learning_rate": 4.7064495393186205e-05, "loss": 0.4366, "step": 284 }, { "epoch": 0.18280949326491341, "grad_norm": 1.6857483386993408, "learning_rate": 4.705378187272338e-05, "loss": 0.6039, "step": 285 }, { "epoch": 0.18345093008338678, "grad_norm": 1.5844470262527466, "learning_rate": 4.7043068352260555e-05, "loss": 0.5223, "step": 286 }, { "epoch": 0.18409236690186018, "grad_norm": 1.6337124109268188, "learning_rate": 4.703235483179773e-05, "loss": 0.617, "step": 287 }, { "epoch": 0.18473380372033354, "grad_norm": 1.3356449604034424, "learning_rate": 4.7021641311334904e-05, "loss": 0.4455, "step": 288 }, { "epoch": 0.18537524053880694, "grad_norm": 1.3304201364517212, "learning_rate": 4.701092779087208e-05, "loss": 0.4608, "step": 289 }, { "epoch": 0.1860166773572803, "grad_norm": 1.6193445920944214, "learning_rate": 4.700021427040926e-05, "loss": 0.4551, "step": 290 }, { "epoch": 0.18665811417575368, "grad_norm": 1.5284535884857178, "learning_rate": 4.698950074994644e-05, "loss": 0.4975, "step": 291 }, { "epoch": 0.18729955099422707, "grad_norm": 2.047988176345825, "learning_rate": 4.697878722948361e-05, "loss": 0.5552, "step": 292 }, { "epoch": 0.18794098781270044, "grad_norm": 1.4307637214660645, "learning_rate": 4.696807370902079e-05, "loss": 0.4185, "step": 293 }, { "epoch": 0.18858242463117383, "grad_norm": 1.4615066051483154, "learning_rate": 4.6957360188557966e-05, "loss": 0.5334, "step": 294 }, { "epoch": 0.1892238614496472, "grad_norm": 1.5693747997283936, "learning_rate": 4.694664666809514e-05, "loss": 0.4851, "step": 295 }, { "epoch": 0.1898652982681206, "grad_norm": 1.589124083518982, "learning_rate": 4.6935933147632316e-05, "loss": 0.5386, "step": 296 }, { "epoch": 0.19050673508659396, "grad_norm": 1.5410070419311523, "learning_rate": 4.6925219627169494e-05, "loss": 0.5619, "step": 297 }, { "epoch": 0.19114817190506736, "grad_norm": 1.3358327150344849, "learning_rate": 4.6914506106706665e-05, "loss": 0.4264, "step": 298 }, { "epoch": 0.19178960872354073, "grad_norm": 1.7839651107788086, "learning_rate": 4.6903792586243844e-05, "loss": 0.5994, "step": 299 }, { "epoch": 0.19243104554201412, "grad_norm": 1.7678946256637573, "learning_rate": 4.689307906578102e-05, "loss": 0.6329, "step": 300 }, { "epoch": 0.1930724823604875, "grad_norm": 1.7360612154006958, "learning_rate": 4.688236554531819e-05, "loss": 0.5445, "step": 301 }, { "epoch": 0.19371391917896089, "grad_norm": 1.2778462171554565, "learning_rate": 4.6871652024855364e-05, "loss": 0.4371, "step": 302 }, { "epoch": 0.19435535599743425, "grad_norm": 1.3607347011566162, "learning_rate": 4.686093850439254e-05, "loss": 0.4578, "step": 303 }, { "epoch": 0.19499679281590762, "grad_norm": 1.6441987752914429, "learning_rate": 4.685022498392972e-05, "loss": 0.6042, "step": 304 }, { "epoch": 0.19563822963438102, "grad_norm": 1.6056040525436401, "learning_rate": 4.683951146346689e-05, "loss": 0.5253, "step": 305 }, { "epoch": 0.19627966645285438, "grad_norm": 1.3334013223648071, "learning_rate": 4.682879794300407e-05, "loss": 0.4458, "step": 306 }, { "epoch": 0.19692110327132778, "grad_norm": 1.4572285413742065, "learning_rate": 4.681808442254125e-05, "loss": 0.4713, "step": 307 }, { "epoch": 0.19756254008980115, "grad_norm": 2.2872133255004883, "learning_rate": 4.6807370902078427e-05, "loss": 0.3982, "step": 308 }, { "epoch": 0.19820397690827454, "grad_norm": 1.833408236503601, "learning_rate": 4.67966573816156e-05, "loss": 0.6075, "step": 309 }, { "epoch": 0.1988454137267479, "grad_norm": 1.6712530851364136, "learning_rate": 4.6785943861152776e-05, "loss": 0.5479, "step": 310 }, { "epoch": 0.1994868505452213, "grad_norm": 1.8464308977127075, "learning_rate": 4.6775230340689954e-05, "loss": 0.5244, "step": 311 }, { "epoch": 0.20012828736369467, "grad_norm": 1.7027182579040527, "learning_rate": 4.6764516820227126e-05, "loss": 0.6082, "step": 312 }, { "epoch": 0.20076972418216807, "grad_norm": 1.530312180519104, "learning_rate": 4.6753803299764304e-05, "loss": 0.4792, "step": 313 }, { "epoch": 0.20141116100064144, "grad_norm": 1.6097338199615479, "learning_rate": 4.674308977930148e-05, "loss": 0.5683, "step": 314 }, { "epoch": 0.2020525978191148, "grad_norm": 1.6826748847961426, "learning_rate": 4.673237625883865e-05, "loss": 0.6352, "step": 315 }, { "epoch": 0.2026940346375882, "grad_norm": 1.5383225679397583, "learning_rate": 4.672166273837583e-05, "loss": 0.5537, "step": 316 }, { "epoch": 0.20333547145606157, "grad_norm": 1.3590924739837646, "learning_rate": 4.671094921791301e-05, "loss": 0.5306, "step": 317 }, { "epoch": 0.20397690827453496, "grad_norm": 1.4048683643341064, "learning_rate": 4.670023569745018e-05, "loss": 0.4771, "step": 318 }, { "epoch": 0.20461834509300833, "grad_norm": 1.6621766090393066, "learning_rate": 4.668952217698736e-05, "loss": 0.6294, "step": 319 }, { "epoch": 0.20525978191148173, "grad_norm": 1.3039182424545288, "learning_rate": 4.667880865652454e-05, "loss": 0.4563, "step": 320 }, { "epoch": 0.2059012187299551, "grad_norm": 1.4288063049316406, "learning_rate": 4.6668095136061715e-05, "loss": 0.476, "step": 321 }, { "epoch": 0.2065426555484285, "grad_norm": 1.446771502494812, "learning_rate": 4.665738161559889e-05, "loss": 0.5053, "step": 322 }, { "epoch": 0.20718409236690186, "grad_norm": 1.430417776107788, "learning_rate": 4.6646668095136065e-05, "loss": 0.4584, "step": 323 }, { "epoch": 0.20782552918537525, "grad_norm": 1.556067943572998, "learning_rate": 4.663595457467324e-05, "loss": 0.4939, "step": 324 }, { "epoch": 0.20846696600384862, "grad_norm": 1.6403834819793701, "learning_rate": 4.6625241054210414e-05, "loss": 0.5771, "step": 325 }, { "epoch": 0.20910840282232201, "grad_norm": 1.4305994510650635, "learning_rate": 4.661452753374759e-05, "loss": 0.5211, "step": 326 }, { "epoch": 0.20974983964079538, "grad_norm": 1.4296616315841675, "learning_rate": 4.660381401328477e-05, "loss": 0.504, "step": 327 }, { "epoch": 0.21039127645926875, "grad_norm": 1.31740140914917, "learning_rate": 4.659310049282194e-05, "loss": 0.4205, "step": 328 }, { "epoch": 0.21103271327774215, "grad_norm": 1.5254106521606445, "learning_rate": 4.658238697235912e-05, "loss": 0.4944, "step": 329 }, { "epoch": 0.2116741500962155, "grad_norm": 1.649407982826233, "learning_rate": 4.65716734518963e-05, "loss": 0.5223, "step": 330 }, { "epoch": 0.2123155869146889, "grad_norm": 1.6064136028289795, "learning_rate": 4.6560959931433477e-05, "loss": 0.5601, "step": 331 }, { "epoch": 0.21295702373316228, "grad_norm": 1.7800652980804443, "learning_rate": 4.655024641097065e-05, "loss": 0.657, "step": 332 }, { "epoch": 0.21359846055163567, "grad_norm": 1.4999407529830933, "learning_rate": 4.6539532890507826e-05, "loss": 0.5514, "step": 333 }, { "epoch": 0.21423989737010904, "grad_norm": 1.4923063516616821, "learning_rate": 4.6528819370045004e-05, "loss": 0.5366, "step": 334 }, { "epoch": 0.21488133418858243, "grad_norm": 1.6542556285858154, "learning_rate": 4.6518105849582176e-05, "loss": 0.7167, "step": 335 }, { "epoch": 0.2155227710070558, "grad_norm": 1.277533769607544, "learning_rate": 4.6507392329119354e-05, "loss": 0.4443, "step": 336 }, { "epoch": 0.2161642078255292, "grad_norm": 1.3998193740844727, "learning_rate": 4.6496678808656525e-05, "loss": 0.511, "step": 337 }, { "epoch": 0.21680564464400257, "grad_norm": 1.3951646089553833, "learning_rate": 4.64859652881937e-05, "loss": 0.5088, "step": 338 }, { "epoch": 0.21744708146247593, "grad_norm": 1.8245488405227661, "learning_rate": 4.6475251767730875e-05, "loss": 0.7404, "step": 339 }, { "epoch": 0.21808851828094933, "grad_norm": 1.450464129447937, "learning_rate": 4.646453824726805e-05, "loss": 0.4664, "step": 340 }, { "epoch": 0.2187299550994227, "grad_norm": 1.6238346099853516, "learning_rate": 4.645382472680523e-05, "loss": 0.5744, "step": 341 }, { "epoch": 0.2193713919178961, "grad_norm": 1.4141359329223633, "learning_rate": 4.64431112063424e-05, "loss": 0.4928, "step": 342 }, { "epoch": 0.22001282873636946, "grad_norm": 1.4492441415786743, "learning_rate": 4.643239768587958e-05, "loss": 0.4857, "step": 343 }, { "epoch": 0.22065426555484285, "grad_norm": 1.4806609153747559, "learning_rate": 4.642168416541676e-05, "loss": 0.4732, "step": 344 }, { "epoch": 0.22129570237331622, "grad_norm": 1.553146481513977, "learning_rate": 4.641097064495393e-05, "loss": 0.5582, "step": 345 }, { "epoch": 0.22193713919178962, "grad_norm": 1.3742289543151855, "learning_rate": 4.640025712449111e-05, "loss": 0.4387, "step": 346 }, { "epoch": 0.22257857601026299, "grad_norm": 1.728601098060608, "learning_rate": 4.6389543604028286e-05, "loss": 0.7135, "step": 347 }, { "epoch": 0.22322001282873638, "grad_norm": 1.4172694683074951, "learning_rate": 4.637883008356546e-05, "loss": 0.4649, "step": 348 }, { "epoch": 0.22386144964720975, "grad_norm": 1.3263812065124512, "learning_rate": 4.6368116563102636e-05, "loss": 0.379, "step": 349 }, { "epoch": 0.22450288646568314, "grad_norm": 1.5398468971252441, "learning_rate": 4.6357403042639814e-05, "loss": 0.5081, "step": 350 }, { "epoch": 0.2251443232841565, "grad_norm": 1.2656311988830566, "learning_rate": 4.634668952217699e-05, "loss": 0.4288, "step": 351 }, { "epoch": 0.22578576010262988, "grad_norm": 1.5645676851272583, "learning_rate": 4.6335976001714163e-05, "loss": 0.5997, "step": 352 }, { "epoch": 0.22642719692110327, "grad_norm": 1.6120851039886475, "learning_rate": 4.632526248125134e-05, "loss": 0.6301, "step": 353 }, { "epoch": 0.22706863373957664, "grad_norm": 1.8576008081436157, "learning_rate": 4.631454896078852e-05, "loss": 0.4732, "step": 354 }, { "epoch": 0.22771007055805004, "grad_norm": 1.5975759029388428, "learning_rate": 4.630383544032569e-05, "loss": 0.5598, "step": 355 }, { "epoch": 0.2283515073765234, "grad_norm": 1.3927828073501587, "learning_rate": 4.629312191986287e-05, "loss": 0.5005, "step": 356 }, { "epoch": 0.2289929441949968, "grad_norm": 1.4944462776184082, "learning_rate": 4.628240839940005e-05, "loss": 0.4604, "step": 357 }, { "epoch": 0.22963438101347017, "grad_norm": 1.4728032350540161, "learning_rate": 4.627169487893722e-05, "loss": 0.5027, "step": 358 }, { "epoch": 0.23027581783194356, "grad_norm": 1.4038598537445068, "learning_rate": 4.62609813584744e-05, "loss": 0.4297, "step": 359 }, { "epoch": 0.23091725465041693, "grad_norm": 1.3563872575759888, "learning_rate": 4.6250267838011575e-05, "loss": 0.4542, "step": 360 }, { "epoch": 0.23155869146889033, "grad_norm": 1.7057814598083496, "learning_rate": 4.623955431754875e-05, "loss": 0.6181, "step": 361 }, { "epoch": 0.2322001282873637, "grad_norm": 1.8444342613220215, "learning_rate": 4.6228840797085925e-05, "loss": 0.6941, "step": 362 }, { "epoch": 0.23284156510583706, "grad_norm": 1.5131815671920776, "learning_rate": 4.62181272766231e-05, "loss": 0.558, "step": 363 }, { "epoch": 0.23348300192431046, "grad_norm": 1.5922291278839111, "learning_rate": 4.620741375616028e-05, "loss": 0.5548, "step": 364 }, { "epoch": 0.23412443874278382, "grad_norm": 1.4044103622436523, "learning_rate": 4.619670023569745e-05, "loss": 0.5012, "step": 365 }, { "epoch": 0.23476587556125722, "grad_norm": 1.2750734090805054, "learning_rate": 4.618598671523463e-05, "loss": 0.4439, "step": 366 }, { "epoch": 0.2354073123797306, "grad_norm": 1.5231209993362427, "learning_rate": 4.617527319477181e-05, "loss": 0.5378, "step": 367 }, { "epoch": 0.23604874919820398, "grad_norm": 1.427763819694519, "learning_rate": 4.616455967430898e-05, "loss": 0.4457, "step": 368 }, { "epoch": 0.23669018601667735, "grad_norm": 1.1999701261520386, "learning_rate": 4.615384615384616e-05, "loss": 0.4141, "step": 369 }, { "epoch": 0.23733162283515075, "grad_norm": 1.413572907447815, "learning_rate": 4.6143132633383336e-05, "loss": 0.4762, "step": 370 }, { "epoch": 0.2379730596536241, "grad_norm": 1.4929783344268799, "learning_rate": 4.613241911292051e-05, "loss": 0.5295, "step": 371 }, { "epoch": 0.2386144964720975, "grad_norm": 1.7919728755950928, "learning_rate": 4.612170559245768e-05, "loss": 0.4706, "step": 372 }, { "epoch": 0.23925593329057088, "grad_norm": 1.5677385330200195, "learning_rate": 4.611099207199486e-05, "loss": 0.497, "step": 373 }, { "epoch": 0.23989737010904427, "grad_norm": 1.5595201253890991, "learning_rate": 4.6100278551532035e-05, "loss": 0.5433, "step": 374 }, { "epoch": 0.24053880692751764, "grad_norm": 1.5932742357254028, "learning_rate": 4.608956503106921e-05, "loss": 0.5307, "step": 375 }, { "epoch": 0.241180243745991, "grad_norm": 1.4591612815856934, "learning_rate": 4.6078851510606385e-05, "loss": 0.5001, "step": 376 }, { "epoch": 0.2418216805644644, "grad_norm": 1.4209476709365845, "learning_rate": 4.606813799014356e-05, "loss": 0.5061, "step": 377 }, { "epoch": 0.24246311738293777, "grad_norm": 1.2774978876113892, "learning_rate": 4.605742446968074e-05, "loss": 0.4219, "step": 378 }, { "epoch": 0.24310455420141117, "grad_norm": 1.155782699584961, "learning_rate": 4.604671094921791e-05, "loss": 0.3975, "step": 379 }, { "epoch": 0.24374599101988453, "grad_norm": 1.376373052597046, "learning_rate": 4.603599742875509e-05, "loss": 0.4741, "step": 380 }, { "epoch": 0.24438742783835793, "grad_norm": 1.3529887199401855, "learning_rate": 4.602528390829227e-05, "loss": 0.5039, "step": 381 }, { "epoch": 0.2450288646568313, "grad_norm": 1.6168662309646606, "learning_rate": 4.601457038782944e-05, "loss": 0.5447, "step": 382 }, { "epoch": 0.2456703014753047, "grad_norm": 1.3796201944351196, "learning_rate": 4.600385686736662e-05, "loss": 0.435, "step": 383 }, { "epoch": 0.24631173829377806, "grad_norm": 1.6600466966629028, "learning_rate": 4.5993143346903796e-05, "loss": 0.4735, "step": 384 }, { "epoch": 0.24695317511225146, "grad_norm": 1.364157795906067, "learning_rate": 4.598242982644097e-05, "loss": 0.4476, "step": 385 }, { "epoch": 0.24759461193072482, "grad_norm": 1.2961732149124146, "learning_rate": 4.5971716305978146e-05, "loss": 0.4301, "step": 386 }, { "epoch": 0.2482360487491982, "grad_norm": 1.7346696853637695, "learning_rate": 4.5961002785515324e-05, "loss": 0.6086, "step": 387 }, { "epoch": 0.24887748556767159, "grad_norm": 1.4478318691253662, "learning_rate": 4.5950289265052496e-05, "loss": 0.4704, "step": 388 }, { "epoch": 0.24951892238614495, "grad_norm": 1.302009105682373, "learning_rate": 4.5939575744589674e-05, "loss": 0.416, "step": 389 }, { "epoch": 0.2501603592046183, "grad_norm": 1.4174201488494873, "learning_rate": 4.592886222412685e-05, "loss": 0.4666, "step": 390 }, { "epoch": 0.2508017960230917, "grad_norm": 1.7057377099990845, "learning_rate": 4.591814870366403e-05, "loss": 0.4672, "step": 391 }, { "epoch": 0.2514432328415651, "grad_norm": 1.3881409168243408, "learning_rate": 4.59074351832012e-05, "loss": 0.4739, "step": 392 }, { "epoch": 0.2520846696600385, "grad_norm": 1.2296890020370483, "learning_rate": 4.589672166273838e-05, "loss": 0.4311, "step": 393 }, { "epoch": 0.25272610647851185, "grad_norm": 1.3220642805099487, "learning_rate": 4.588600814227556e-05, "loss": 0.4736, "step": 394 }, { "epoch": 0.25336754329698524, "grad_norm": 1.5447416305541992, "learning_rate": 4.587529462181273e-05, "loss": 0.4831, "step": 395 }, { "epoch": 0.25400898011545864, "grad_norm": 1.469234824180603, "learning_rate": 4.586458110134991e-05, "loss": 0.5077, "step": 396 }, { "epoch": 0.25465041693393203, "grad_norm": 1.777005672454834, "learning_rate": 4.5853867580887085e-05, "loss": 0.6555, "step": 397 }, { "epoch": 0.2552918537524054, "grad_norm": 1.239014983177185, "learning_rate": 4.584315406042426e-05, "loss": 0.3964, "step": 398 }, { "epoch": 0.25593329057087877, "grad_norm": 1.7449710369110107, "learning_rate": 4.5832440539961435e-05, "loss": 0.6226, "step": 399 }, { "epoch": 0.25657472738935216, "grad_norm": 1.1974855661392212, "learning_rate": 4.582172701949861e-05, "loss": 0.4148, "step": 400 }, { "epoch": 0.2572161642078255, "grad_norm": 1.3654508590698242, "learning_rate": 4.5811013499035784e-05, "loss": 0.4831, "step": 401 }, { "epoch": 0.2578576010262989, "grad_norm": 1.629918098449707, "learning_rate": 4.580029997857296e-05, "loss": 0.6105, "step": 402 }, { "epoch": 0.2584990378447723, "grad_norm": 1.4976164102554321, "learning_rate": 4.578958645811014e-05, "loss": 0.5453, "step": 403 }, { "epoch": 0.2591404746632457, "grad_norm": 1.5742278099060059, "learning_rate": 4.577887293764732e-05, "loss": 0.592, "step": 404 }, { "epoch": 0.25978191148171903, "grad_norm": 1.5738730430603027, "learning_rate": 4.5768159417184483e-05, "loss": 0.5795, "step": 405 }, { "epoch": 0.2604233483001924, "grad_norm": 1.369555115699768, "learning_rate": 4.575744589672166e-05, "loss": 0.4641, "step": 406 }, { "epoch": 0.2610647851186658, "grad_norm": 1.416852593421936, "learning_rate": 4.574673237625884e-05, "loss": 0.4848, "step": 407 }, { "epoch": 0.2617062219371392, "grad_norm": 1.4698175191879272, "learning_rate": 4.573601885579602e-05, "loss": 0.5124, "step": 408 }, { "epoch": 0.26234765875561256, "grad_norm": 1.2608624696731567, "learning_rate": 4.572530533533319e-05, "loss": 0.4469, "step": 409 }, { "epoch": 0.26298909557408595, "grad_norm": 1.4622141122817993, "learning_rate": 4.571459181487037e-05, "loss": 0.4707, "step": 410 }, { "epoch": 0.26363053239255935, "grad_norm": 1.5310230255126953, "learning_rate": 4.5703878294407546e-05, "loss": 0.5611, "step": 411 }, { "epoch": 0.2642719692110327, "grad_norm": 1.3838398456573486, "learning_rate": 4.569316477394472e-05, "loss": 0.4829, "step": 412 }, { "epoch": 0.2649134060295061, "grad_norm": 1.4291026592254639, "learning_rate": 4.5682451253481895e-05, "loss": 0.55, "step": 413 }, { "epoch": 0.2655548428479795, "grad_norm": 1.455706000328064, "learning_rate": 4.567173773301907e-05, "loss": 0.4869, "step": 414 }, { "epoch": 0.2661962796664529, "grad_norm": 1.3665153980255127, "learning_rate": 4.5661024212556245e-05, "loss": 0.4447, "step": 415 }, { "epoch": 0.2668377164849262, "grad_norm": 1.396051287651062, "learning_rate": 4.565031069209342e-05, "loss": 0.5086, "step": 416 }, { "epoch": 0.2674791533033996, "grad_norm": 1.5710020065307617, "learning_rate": 4.56395971716306e-05, "loss": 0.5098, "step": 417 }, { "epoch": 0.268120590121873, "grad_norm": 1.6508487462997437, "learning_rate": 4.562888365116777e-05, "loss": 0.5492, "step": 418 }, { "epoch": 0.2687620269403464, "grad_norm": 1.4381122589111328, "learning_rate": 4.561817013070495e-05, "loss": 0.485, "step": 419 }, { "epoch": 0.26940346375881974, "grad_norm": 1.2869155406951904, "learning_rate": 4.560745661024213e-05, "loss": 0.4546, "step": 420 }, { "epoch": 0.27004490057729313, "grad_norm": 1.3426905870437622, "learning_rate": 4.559674308977931e-05, "loss": 0.488, "step": 421 }, { "epoch": 0.27068633739576653, "grad_norm": 1.420833706855774, "learning_rate": 4.558602956931648e-05, "loss": 0.4998, "step": 422 }, { "epoch": 0.27132777421423987, "grad_norm": 1.7550712823867798, "learning_rate": 4.5575316048853656e-05, "loss": 0.4905, "step": 423 }, { "epoch": 0.27196921103271327, "grad_norm": 1.4633597135543823, "learning_rate": 4.5564602528390834e-05, "loss": 0.5287, "step": 424 }, { "epoch": 0.27261064785118666, "grad_norm": 1.2200931310653687, "learning_rate": 4.5553889007928006e-05, "loss": 0.4471, "step": 425 }, { "epoch": 0.27325208466966006, "grad_norm": 1.3163163661956787, "learning_rate": 4.5543175487465184e-05, "loss": 0.4298, "step": 426 }, { "epoch": 0.2738935214881334, "grad_norm": 1.4415762424468994, "learning_rate": 4.553246196700236e-05, "loss": 0.5223, "step": 427 }, { "epoch": 0.2745349583066068, "grad_norm": 1.6550372838974, "learning_rate": 4.552174844653953e-05, "loss": 0.4987, "step": 428 }, { "epoch": 0.2751763951250802, "grad_norm": 1.490904450416565, "learning_rate": 4.551103492607671e-05, "loss": 0.5402, "step": 429 }, { "epoch": 0.2758178319435536, "grad_norm": 1.5963994264602661, "learning_rate": 4.550032140561389e-05, "loss": 0.5571, "step": 430 }, { "epoch": 0.2764592687620269, "grad_norm": 1.4573734998703003, "learning_rate": 4.548960788515106e-05, "loss": 0.4919, "step": 431 }, { "epoch": 0.2771007055805003, "grad_norm": 1.367922067642212, "learning_rate": 4.547889436468824e-05, "loss": 0.4658, "step": 432 }, { "epoch": 0.2777421423989737, "grad_norm": 1.4025838375091553, "learning_rate": 4.546818084422542e-05, "loss": 0.5164, "step": 433 }, { "epoch": 0.2783835792174471, "grad_norm": 1.510438084602356, "learning_rate": 4.5457467323762596e-05, "loss": 0.5293, "step": 434 }, { "epoch": 0.27902501603592045, "grad_norm": 1.4438951015472412, "learning_rate": 4.544675380329977e-05, "loss": 0.4545, "step": 435 }, { "epoch": 0.27966645285439384, "grad_norm": 1.2862393856048584, "learning_rate": 4.5436040282836945e-05, "loss": 0.4267, "step": 436 }, { "epoch": 0.28030788967286724, "grad_norm": 1.5621165037155151, "learning_rate": 4.542532676237412e-05, "loss": 0.4952, "step": 437 }, { "epoch": 0.2809493264913406, "grad_norm": 1.3061023950576782, "learning_rate": 4.5414613241911295e-05, "loss": 0.4628, "step": 438 }, { "epoch": 0.281590763309814, "grad_norm": 1.4725111722946167, "learning_rate": 4.5403899721448466e-05, "loss": 0.5519, "step": 439 }, { "epoch": 0.28223220012828737, "grad_norm": 1.30818772315979, "learning_rate": 4.5393186200985644e-05, "loss": 0.4238, "step": 440 }, { "epoch": 0.28287363694676076, "grad_norm": 1.5705479383468628, "learning_rate": 4.538247268052282e-05, "loss": 0.5154, "step": 441 }, { "epoch": 0.2835150737652341, "grad_norm": 1.611924648284912, "learning_rate": 4.5371759160059994e-05, "loss": 0.6043, "step": 442 }, { "epoch": 0.2841565105837075, "grad_norm": 1.1975364685058594, "learning_rate": 4.536104563959717e-05, "loss": 0.3949, "step": 443 }, { "epoch": 0.2847979474021809, "grad_norm": 1.3375297784805298, "learning_rate": 4.535033211913435e-05, "loss": 0.4438, "step": 444 }, { "epoch": 0.2854393842206543, "grad_norm": 1.4540621042251587, "learning_rate": 4.533961859867152e-05, "loss": 0.5737, "step": 445 }, { "epoch": 0.28608082103912763, "grad_norm": 1.4586882591247559, "learning_rate": 4.53289050782087e-05, "loss": 0.4748, "step": 446 }, { "epoch": 0.286722257857601, "grad_norm": 1.3715091943740845, "learning_rate": 4.531819155774588e-05, "loss": 0.4831, "step": 447 }, { "epoch": 0.2873636946760744, "grad_norm": 1.468746542930603, "learning_rate": 4.530747803728305e-05, "loss": 0.4132, "step": 448 }, { "epoch": 0.28800513149454776, "grad_norm": 1.2549887895584106, "learning_rate": 4.529676451682023e-05, "loss": 0.4053, "step": 449 }, { "epoch": 0.28864656831302116, "grad_norm": 1.5793724060058594, "learning_rate": 4.5286050996357405e-05, "loss": 0.5601, "step": 450 }, { "epoch": 0.28928800513149455, "grad_norm": 1.4544566869735718, "learning_rate": 4.527533747589458e-05, "loss": 0.4847, "step": 451 }, { "epoch": 0.28992944194996795, "grad_norm": 1.3019859790802002, "learning_rate": 4.5264623955431755e-05, "loss": 0.4396, "step": 452 }, { "epoch": 0.2905708787684413, "grad_norm": 1.286805510520935, "learning_rate": 4.525391043496893e-05, "loss": 0.4467, "step": 453 }, { "epoch": 0.2912123155869147, "grad_norm": 1.455458402633667, "learning_rate": 4.524319691450611e-05, "loss": 0.4827, "step": 454 }, { "epoch": 0.2918537524053881, "grad_norm": 1.4614661931991577, "learning_rate": 4.523248339404328e-05, "loss": 0.5043, "step": 455 }, { "epoch": 0.2924951892238615, "grad_norm": 1.3894802331924438, "learning_rate": 4.522176987358046e-05, "loss": 0.4562, "step": 456 }, { "epoch": 0.2931366260423348, "grad_norm": 1.623382806777954, "learning_rate": 4.521105635311764e-05, "loss": 0.4547, "step": 457 }, { "epoch": 0.2937780628608082, "grad_norm": 1.4566304683685303, "learning_rate": 4.520034283265481e-05, "loss": 0.4033, "step": 458 }, { "epoch": 0.2944194996792816, "grad_norm": 1.256400227546692, "learning_rate": 4.518962931219199e-05, "loss": 0.4371, "step": 459 }, { "epoch": 0.29506093649775494, "grad_norm": 1.4798544645309448, "learning_rate": 4.5178915791729166e-05, "loss": 0.5134, "step": 460 }, { "epoch": 0.29570237331622834, "grad_norm": 1.260454773902893, "learning_rate": 4.516820227126634e-05, "loss": 0.4141, "step": 461 }, { "epoch": 0.29634381013470174, "grad_norm": 1.2998229265213013, "learning_rate": 4.5157488750803516e-05, "loss": 0.4496, "step": 462 }, { "epoch": 0.29698524695317513, "grad_norm": 1.3116885423660278, "learning_rate": 4.5146775230340694e-05, "loss": 0.4648, "step": 463 }, { "epoch": 0.29762668377164847, "grad_norm": 1.3824630975723267, "learning_rate": 4.513606170987787e-05, "loss": 0.5302, "step": 464 }, { "epoch": 0.29826812059012187, "grad_norm": 1.420333981513977, "learning_rate": 4.5125348189415044e-05, "loss": 0.5359, "step": 465 }, { "epoch": 0.29890955740859526, "grad_norm": 1.4115830659866333, "learning_rate": 4.511463466895222e-05, "loss": 0.428, "step": 466 }, { "epoch": 0.29955099422706866, "grad_norm": 1.390466570854187, "learning_rate": 4.51039211484894e-05, "loss": 0.5208, "step": 467 }, { "epoch": 0.300192431045542, "grad_norm": 1.3951661586761475, "learning_rate": 4.509320762802657e-05, "loss": 0.4819, "step": 468 }, { "epoch": 0.3008338678640154, "grad_norm": 1.4040907621383667, "learning_rate": 4.508249410756375e-05, "loss": 0.5027, "step": 469 }, { "epoch": 0.3014753046824888, "grad_norm": 1.3882837295532227, "learning_rate": 4.507178058710093e-05, "loss": 0.4945, "step": 470 }, { "epoch": 0.3021167415009622, "grad_norm": 1.2979682683944702, "learning_rate": 4.50610670666381e-05, "loss": 0.471, "step": 471 }, { "epoch": 0.3027581783194355, "grad_norm": 1.6236354112625122, "learning_rate": 4.505035354617528e-05, "loss": 0.6155, "step": 472 }, { "epoch": 0.3033996151379089, "grad_norm": 1.2914626598358154, "learning_rate": 4.5039640025712455e-05, "loss": 0.4693, "step": 473 }, { "epoch": 0.3040410519563823, "grad_norm": 1.3439966440200806, "learning_rate": 4.5028926505249627e-05, "loss": 0.451, "step": 474 }, { "epoch": 0.30468248877485565, "grad_norm": 1.5358506441116333, "learning_rate": 4.50182129847868e-05, "loss": 0.6135, "step": 475 }, { "epoch": 0.30532392559332905, "grad_norm": 1.4096317291259766, "learning_rate": 4.5007499464323976e-05, "loss": 0.5041, "step": 476 }, { "epoch": 0.30596536241180244, "grad_norm": 1.5538190603256226, "learning_rate": 4.4996785943861154e-05, "loss": 0.6074, "step": 477 }, { "epoch": 0.30660679923027584, "grad_norm": 1.3752061128616333, "learning_rate": 4.4986072423398326e-05, "loss": 0.4725, "step": 478 }, { "epoch": 0.3072482360487492, "grad_norm": 1.4321306943893433, "learning_rate": 4.4975358902935504e-05, "loss": 0.4485, "step": 479 }, { "epoch": 0.3078896728672226, "grad_norm": 1.4220929145812988, "learning_rate": 4.496464538247268e-05, "loss": 0.4516, "step": 480 }, { "epoch": 0.30853110968569597, "grad_norm": 1.211046814918518, "learning_rate": 4.495393186200986e-05, "loss": 0.3904, "step": 481 }, { "epoch": 0.30917254650416937, "grad_norm": 1.3046022653579712, "learning_rate": 4.494321834154703e-05, "loss": 0.4566, "step": 482 }, { "epoch": 0.3098139833226427, "grad_norm": 1.6621602773666382, "learning_rate": 4.493250482108421e-05, "loss": 0.522, "step": 483 }, { "epoch": 0.3104554201411161, "grad_norm": 1.0918346643447876, "learning_rate": 4.492179130062139e-05, "loss": 0.364, "step": 484 }, { "epoch": 0.3110968569595895, "grad_norm": 1.630610704421997, "learning_rate": 4.491107778015856e-05, "loss": 0.6017, "step": 485 }, { "epoch": 0.31173829377806284, "grad_norm": 1.3767156600952148, "learning_rate": 4.490036425969574e-05, "loss": 0.4648, "step": 486 }, { "epoch": 0.31237973059653623, "grad_norm": 1.6161319017410278, "learning_rate": 4.4889650739232915e-05, "loss": 0.5757, "step": 487 }, { "epoch": 0.3130211674150096, "grad_norm": 1.4139721393585205, "learning_rate": 4.487893721877009e-05, "loss": 0.5151, "step": 488 }, { "epoch": 0.313662604233483, "grad_norm": 1.2944648265838623, "learning_rate": 4.4868223698307265e-05, "loss": 0.4332, "step": 489 }, { "epoch": 0.31430404105195636, "grad_norm": 1.461159348487854, "learning_rate": 4.485751017784444e-05, "loss": 0.5012, "step": 490 }, { "epoch": 0.31494547787042976, "grad_norm": 1.2473292350769043, "learning_rate": 4.484679665738162e-05, "loss": 0.44, "step": 491 }, { "epoch": 0.31558691468890315, "grad_norm": 1.417854905128479, "learning_rate": 4.483608313691879e-05, "loss": 0.5212, "step": 492 }, { "epoch": 0.31622835150737655, "grad_norm": 1.2230188846588135, "learning_rate": 4.482536961645597e-05, "loss": 0.4218, "step": 493 }, { "epoch": 0.3168697883258499, "grad_norm": 1.3240710496902466, "learning_rate": 4.481465609599315e-05, "loss": 0.4443, "step": 494 }, { "epoch": 0.3175112251443233, "grad_norm": 1.2055859565734863, "learning_rate": 4.480394257553032e-05, "loss": 0.4296, "step": 495 }, { "epoch": 0.3181526619627967, "grad_norm": 1.3216733932495117, "learning_rate": 4.47932290550675e-05, "loss": 0.4306, "step": 496 }, { "epoch": 0.31879409878127, "grad_norm": 1.386330008506775, "learning_rate": 4.4782515534604677e-05, "loss": 0.5051, "step": 497 }, { "epoch": 0.3194355355997434, "grad_norm": 1.4104686975479126, "learning_rate": 4.477180201414185e-05, "loss": 0.4916, "step": 498 }, { "epoch": 0.3200769724182168, "grad_norm": 1.6040958166122437, "learning_rate": 4.4761088493679026e-05, "loss": 0.5016, "step": 499 }, { "epoch": 0.3207184092366902, "grad_norm": 1.1831400394439697, "learning_rate": 4.4750374973216204e-05, "loss": 0.3899, "step": 500 }, { "epoch": 0.32135984605516354, "grad_norm": 1.5171695947647095, "learning_rate": 4.4739661452753376e-05, "loss": 0.4796, "step": 501 }, { "epoch": 0.32200128287363694, "grad_norm": 1.2565295696258545, "learning_rate": 4.4728947932290554e-05, "loss": 0.4223, "step": 502 }, { "epoch": 0.32264271969211034, "grad_norm": 1.4012917280197144, "learning_rate": 4.471823441182773e-05, "loss": 0.4518, "step": 503 }, { "epoch": 0.32328415651058373, "grad_norm": 1.4091616868972778, "learning_rate": 4.470752089136491e-05, "loss": 0.4184, "step": 504 }, { "epoch": 0.32392559332905707, "grad_norm": 1.5391875505447388, "learning_rate": 4.469680737090208e-05, "loss": 0.482, "step": 505 }, { "epoch": 0.32456703014753047, "grad_norm": 1.5720252990722656, "learning_rate": 4.468609385043926e-05, "loss": 0.552, "step": 506 }, { "epoch": 0.32520846696600386, "grad_norm": 1.3848786354064941, "learning_rate": 4.467538032997644e-05, "loss": 0.4815, "step": 507 }, { "epoch": 0.3258499037844772, "grad_norm": 1.428274154663086, "learning_rate": 4.46646668095136e-05, "loss": 0.5541, "step": 508 }, { "epoch": 0.3264913406029506, "grad_norm": 1.487172245979309, "learning_rate": 4.465395328905078e-05, "loss": 0.5902, "step": 509 }, { "epoch": 0.327132777421424, "grad_norm": 1.217198133468628, "learning_rate": 4.464323976858796e-05, "loss": 0.4208, "step": 510 }, { "epoch": 0.3277742142398974, "grad_norm": 1.2567338943481445, "learning_rate": 4.463252624812514e-05, "loss": 0.4517, "step": 511 }, { "epoch": 0.32841565105837073, "grad_norm": 1.2875603437423706, "learning_rate": 4.462181272766231e-05, "loss": 0.3936, "step": 512 }, { "epoch": 0.3290570878768441, "grad_norm": 1.3143458366394043, "learning_rate": 4.4611099207199486e-05, "loss": 0.486, "step": 513 }, { "epoch": 0.3296985246953175, "grad_norm": 1.5528936386108398, "learning_rate": 4.4600385686736664e-05, "loss": 0.5402, "step": 514 }, { "epoch": 0.3303399615137909, "grad_norm": 1.4599571228027344, "learning_rate": 4.4589672166273836e-05, "loss": 0.4847, "step": 515 }, { "epoch": 0.33098139833226425, "grad_norm": 1.2794709205627441, "learning_rate": 4.4578958645811014e-05, "loss": 0.4637, "step": 516 }, { "epoch": 0.33162283515073765, "grad_norm": 1.2259162664413452, "learning_rate": 4.456824512534819e-05, "loss": 0.3944, "step": 517 }, { "epoch": 0.33226427196921104, "grad_norm": 1.2429255247116089, "learning_rate": 4.4557531604885363e-05, "loss": 0.4232, "step": 518 }, { "epoch": 0.33290570878768444, "grad_norm": 1.397773265838623, "learning_rate": 4.454681808442254e-05, "loss": 0.4423, "step": 519 }, { "epoch": 0.3335471456061578, "grad_norm": 1.4750643968582153, "learning_rate": 4.453610456395972e-05, "loss": 0.4931, "step": 520 }, { "epoch": 0.3341885824246312, "grad_norm": 1.5362123250961304, "learning_rate": 4.45253910434969e-05, "loss": 0.4743, "step": 521 }, { "epoch": 0.33483001924310457, "grad_norm": 1.5249499082565308, "learning_rate": 4.451467752303407e-05, "loss": 0.4539, "step": 522 }, { "epoch": 0.3354714560615779, "grad_norm": 1.6878784894943237, "learning_rate": 4.450396400257125e-05, "loss": 0.5248, "step": 523 }, { "epoch": 0.3361128928800513, "grad_norm": 1.6690584421157837, "learning_rate": 4.4493250482108426e-05, "loss": 0.6147, "step": 524 }, { "epoch": 0.3367543296985247, "grad_norm": 1.414570927619934, "learning_rate": 4.44825369616456e-05, "loss": 0.48, "step": 525 }, { "epoch": 0.3373957665169981, "grad_norm": 1.2832664251327515, "learning_rate": 4.4471823441182775e-05, "loss": 0.4768, "step": 526 }, { "epoch": 0.33803720333547144, "grad_norm": 1.415837287902832, "learning_rate": 4.446110992071995e-05, "loss": 0.4058, "step": 527 }, { "epoch": 0.33867864015394483, "grad_norm": 1.3919284343719482, "learning_rate": 4.4450396400257125e-05, "loss": 0.4663, "step": 528 }, { "epoch": 0.3393200769724182, "grad_norm": 1.254813313484192, "learning_rate": 4.44396828797943e-05, "loss": 0.4316, "step": 529 }, { "epoch": 0.3399615137908916, "grad_norm": 1.3527708053588867, "learning_rate": 4.442896935933148e-05, "loss": 0.445, "step": 530 }, { "epoch": 0.34060295060936496, "grad_norm": 1.3240491151809692, "learning_rate": 4.441825583886865e-05, "loss": 0.3972, "step": 531 }, { "epoch": 0.34124438742783836, "grad_norm": 1.2502055168151855, "learning_rate": 4.440754231840583e-05, "loss": 0.3733, "step": 532 }, { "epoch": 0.34188582424631175, "grad_norm": 1.6138560771942139, "learning_rate": 4.439682879794301e-05, "loss": 0.376, "step": 533 }, { "epoch": 0.3425272610647851, "grad_norm": 1.3693591356277466, "learning_rate": 4.438611527748019e-05, "loss": 0.4076, "step": 534 }, { "epoch": 0.3431686978832585, "grad_norm": 1.473752737045288, "learning_rate": 4.437540175701736e-05, "loss": 0.4931, "step": 535 }, { "epoch": 0.3438101347017319, "grad_norm": 1.6008069515228271, "learning_rate": 4.4364688236554536e-05, "loss": 0.5252, "step": 536 }, { "epoch": 0.3444515715202053, "grad_norm": 1.5811138153076172, "learning_rate": 4.4353974716091714e-05, "loss": 0.4407, "step": 537 }, { "epoch": 0.3450930083386786, "grad_norm": 1.8310914039611816, "learning_rate": 4.4343261195628886e-05, "loss": 0.4769, "step": 538 }, { "epoch": 0.345734445157152, "grad_norm": 1.6980938911437988, "learning_rate": 4.4332547675166064e-05, "loss": 0.5908, "step": 539 }, { "epoch": 0.3463758819756254, "grad_norm": 1.4620975255966187, "learning_rate": 4.432183415470324e-05, "loss": 0.4678, "step": 540 }, { "epoch": 0.3470173187940988, "grad_norm": 1.4264655113220215, "learning_rate": 4.4311120634240413e-05, "loss": 0.4615, "step": 541 }, { "epoch": 0.34765875561257215, "grad_norm": 1.370735764503479, "learning_rate": 4.4300407113777585e-05, "loss": 0.393, "step": 542 }, { "epoch": 0.34830019243104554, "grad_norm": 1.3949134349822998, "learning_rate": 4.428969359331476e-05, "loss": 0.5096, "step": 543 }, { "epoch": 0.34894162924951894, "grad_norm": 1.384234070777893, "learning_rate": 4.427898007285194e-05, "loss": 0.5346, "step": 544 }, { "epoch": 0.3495830660679923, "grad_norm": 1.5887359380722046, "learning_rate": 4.426826655238911e-05, "loss": 0.5629, "step": 545 }, { "epoch": 0.35022450288646567, "grad_norm": 1.4185655117034912, "learning_rate": 4.425755303192629e-05, "loss": 0.5031, "step": 546 }, { "epoch": 0.35086593970493907, "grad_norm": 1.081157922744751, "learning_rate": 4.424683951146347e-05, "loss": 0.375, "step": 547 }, { "epoch": 0.35150737652341246, "grad_norm": 1.1129094362258911, "learning_rate": 4.423612599100064e-05, "loss": 0.4026, "step": 548 }, { "epoch": 0.3521488133418858, "grad_norm": 1.2502634525299072, "learning_rate": 4.422541247053782e-05, "loss": 0.4365, "step": 549 }, { "epoch": 0.3527902501603592, "grad_norm": 1.7710344791412354, "learning_rate": 4.4214698950074996e-05, "loss": 0.6804, "step": 550 }, { "epoch": 0.3534316869788326, "grad_norm": 1.0841896533966064, "learning_rate": 4.4203985429612175e-05, "loss": 0.3934, "step": 551 }, { "epoch": 0.354073123797306, "grad_norm": 1.300932765007019, "learning_rate": 4.4193271909149346e-05, "loss": 0.4001, "step": 552 }, { "epoch": 0.35471456061577933, "grad_norm": 1.5507783889770508, "learning_rate": 4.4182558388686524e-05, "loss": 0.5474, "step": 553 }, { "epoch": 0.3553559974342527, "grad_norm": 1.5066304206848145, "learning_rate": 4.41718448682237e-05, "loss": 0.5237, "step": 554 }, { "epoch": 0.3559974342527261, "grad_norm": 1.573716163635254, "learning_rate": 4.4161131347760874e-05, "loss": 0.5283, "step": 555 }, { "epoch": 0.35663887107119946, "grad_norm": 1.642425298690796, "learning_rate": 4.415041782729805e-05, "loss": 0.6799, "step": 556 }, { "epoch": 0.35728030788967285, "grad_norm": 1.3902350664138794, "learning_rate": 4.413970430683523e-05, "loss": 0.4946, "step": 557 }, { "epoch": 0.35792174470814625, "grad_norm": 1.3653048276901245, "learning_rate": 4.41289907863724e-05, "loss": 0.4849, "step": 558 }, { "epoch": 0.35856318152661965, "grad_norm": 1.442947506904602, "learning_rate": 4.411827726590958e-05, "loss": 0.4994, "step": 559 }, { "epoch": 0.359204618345093, "grad_norm": 1.4463330507278442, "learning_rate": 4.410756374544676e-05, "loss": 0.5171, "step": 560 }, { "epoch": 0.3598460551635664, "grad_norm": 1.6812576055526733, "learning_rate": 4.409685022498393e-05, "loss": 0.5273, "step": 561 }, { "epoch": 0.3604874919820398, "grad_norm": 1.51375412940979, "learning_rate": 4.408613670452111e-05, "loss": 0.6341, "step": 562 }, { "epoch": 0.36112892880051317, "grad_norm": 1.261888861656189, "learning_rate": 4.4075423184058285e-05, "loss": 0.4298, "step": 563 }, { "epoch": 0.3617703656189865, "grad_norm": 1.20806884765625, "learning_rate": 4.4064709663595463e-05, "loss": 0.4112, "step": 564 }, { "epoch": 0.3624118024374599, "grad_norm": 1.4263348579406738, "learning_rate": 4.4053996143132635e-05, "loss": 0.53, "step": 565 }, { "epoch": 0.3630532392559333, "grad_norm": 1.3374727964401245, "learning_rate": 4.404328262266981e-05, "loss": 0.5195, "step": 566 }, { "epoch": 0.3636946760744067, "grad_norm": 1.3475596904754639, "learning_rate": 4.403256910220699e-05, "loss": 0.4988, "step": 567 }, { "epoch": 0.36433611289288004, "grad_norm": 1.360692024230957, "learning_rate": 4.402185558174416e-05, "loss": 0.5134, "step": 568 }, { "epoch": 0.36497754971135343, "grad_norm": 1.5242480039596558, "learning_rate": 4.401114206128134e-05, "loss": 0.6013, "step": 569 }, { "epoch": 0.36561898652982683, "grad_norm": 1.4276233911514282, "learning_rate": 4.400042854081852e-05, "loss": 0.5216, "step": 570 }, { "epoch": 0.36626042334830017, "grad_norm": 1.7506054639816284, "learning_rate": 4.398971502035569e-05, "loss": 0.5708, "step": 571 }, { "epoch": 0.36690186016677356, "grad_norm": 1.2901298999786377, "learning_rate": 4.397900149989287e-05, "loss": 0.4119, "step": 572 }, { "epoch": 0.36754329698524696, "grad_norm": 1.409049391746521, "learning_rate": 4.3968287979430046e-05, "loss": 0.4719, "step": 573 }, { "epoch": 0.36818473380372035, "grad_norm": 1.3611568212509155, "learning_rate": 4.395757445896722e-05, "loss": 0.4105, "step": 574 }, { "epoch": 0.3688261706221937, "grad_norm": 1.3916494846343994, "learning_rate": 4.3946860938504396e-05, "loss": 0.4875, "step": 575 }, { "epoch": 0.3694676074406671, "grad_norm": 1.4193522930145264, "learning_rate": 4.3936147418041574e-05, "loss": 0.5046, "step": 576 }, { "epoch": 0.3701090442591405, "grad_norm": 1.4327878952026367, "learning_rate": 4.3925433897578745e-05, "loss": 0.4398, "step": 577 }, { "epoch": 0.3707504810776139, "grad_norm": 1.4063422679901123, "learning_rate": 4.391472037711592e-05, "loss": 0.4608, "step": 578 }, { "epoch": 0.3713919178960872, "grad_norm": 1.6685892343521118, "learning_rate": 4.3904006856653095e-05, "loss": 0.5403, "step": 579 }, { "epoch": 0.3720333547145606, "grad_norm": 1.4026646614074707, "learning_rate": 4.389329333619027e-05, "loss": 0.5012, "step": 580 }, { "epoch": 0.372674791533034, "grad_norm": 1.610607385635376, "learning_rate": 4.388257981572745e-05, "loss": 0.6178, "step": 581 }, { "epoch": 0.37331622835150735, "grad_norm": 1.418107271194458, "learning_rate": 4.387186629526462e-05, "loss": 0.5587, "step": 582 }, { "epoch": 0.37395766516998075, "grad_norm": 1.4899518489837646, "learning_rate": 4.38611527748018e-05, "loss": 0.4763, "step": 583 }, { "epoch": 0.37459910198845414, "grad_norm": 1.367907166481018, "learning_rate": 4.385043925433898e-05, "loss": 0.489, "step": 584 }, { "epoch": 0.37524053880692754, "grad_norm": 1.4670310020446777, "learning_rate": 4.383972573387615e-05, "loss": 0.5983, "step": 585 }, { "epoch": 0.3758819756254009, "grad_norm": 1.280769944190979, "learning_rate": 4.382901221341333e-05, "loss": 0.4592, "step": 586 }, { "epoch": 0.3765234124438743, "grad_norm": 1.2375593185424805, "learning_rate": 4.381829869295051e-05, "loss": 0.4589, "step": 587 }, { "epoch": 0.37716484926234767, "grad_norm": 1.3491277694702148, "learning_rate": 4.380758517248768e-05, "loss": 0.4337, "step": 588 }, { "epoch": 0.37780628608082106, "grad_norm": 1.1822655200958252, "learning_rate": 4.3796871652024856e-05, "loss": 0.3866, "step": 589 }, { "epoch": 0.3784477228992944, "grad_norm": 1.342938780784607, "learning_rate": 4.3786158131562034e-05, "loss": 0.5446, "step": 590 }, { "epoch": 0.3790891597177678, "grad_norm": 1.0920994281768799, "learning_rate": 4.3775444611099206e-05, "loss": 0.402, "step": 591 }, { "epoch": 0.3797305965362412, "grad_norm": 1.214938998222351, "learning_rate": 4.3764731090636384e-05, "loss": 0.4321, "step": 592 }, { "epoch": 0.38037203335471453, "grad_norm": 1.9551986455917358, "learning_rate": 4.375401757017356e-05, "loss": 0.6784, "step": 593 }, { "epoch": 0.38101347017318793, "grad_norm": 1.2754671573638916, "learning_rate": 4.374330404971074e-05, "loss": 0.4995, "step": 594 }, { "epoch": 0.3816549069916613, "grad_norm": 1.3707493543624878, "learning_rate": 4.373259052924791e-05, "loss": 0.4927, "step": 595 }, { "epoch": 0.3822963438101347, "grad_norm": 1.4337764978408813, "learning_rate": 4.372187700878509e-05, "loss": 0.4858, "step": 596 }, { "epoch": 0.38293778062860806, "grad_norm": 1.340712070465088, "learning_rate": 4.371116348832227e-05, "loss": 0.4245, "step": 597 }, { "epoch": 0.38357921744708146, "grad_norm": 1.3596079349517822, "learning_rate": 4.370044996785944e-05, "loss": 0.4675, "step": 598 }, { "epoch": 0.38422065426555485, "grad_norm": 1.3523589372634888, "learning_rate": 4.368973644739662e-05, "loss": 0.4388, "step": 599 }, { "epoch": 0.38486209108402825, "grad_norm": 1.2949239015579224, "learning_rate": 4.3679022926933795e-05, "loss": 0.4917, "step": 600 }, { "epoch": 0.3855035279025016, "grad_norm": 1.159661889076233, "learning_rate": 4.366830940647097e-05, "loss": 0.4222, "step": 601 }, { "epoch": 0.386144964720975, "grad_norm": 1.3051742315292358, "learning_rate": 4.3657595886008145e-05, "loss": 0.4125, "step": 602 }, { "epoch": 0.3867864015394484, "grad_norm": 1.1150778532028198, "learning_rate": 4.364688236554532e-05, "loss": 0.3988, "step": 603 }, { "epoch": 0.38742783835792177, "grad_norm": 1.3768811225891113, "learning_rate": 4.36361688450825e-05, "loss": 0.4646, "step": 604 }, { "epoch": 0.3880692751763951, "grad_norm": 1.4145660400390625, "learning_rate": 4.362545532461967e-05, "loss": 0.4691, "step": 605 }, { "epoch": 0.3887107119948685, "grad_norm": 1.5981329679489136, "learning_rate": 4.361474180415685e-05, "loss": 0.6238, "step": 606 }, { "epoch": 0.3893521488133419, "grad_norm": 1.3452757596969604, "learning_rate": 4.360402828369403e-05, "loss": 0.548, "step": 607 }, { "epoch": 0.38999358563181524, "grad_norm": 1.8776661157608032, "learning_rate": 4.35933147632312e-05, "loss": 0.4559, "step": 608 }, { "epoch": 0.39063502245028864, "grad_norm": 1.1440383195877075, "learning_rate": 4.358260124276838e-05, "loss": 0.3936, "step": 609 }, { "epoch": 0.39127645926876203, "grad_norm": 1.2057170867919922, "learning_rate": 4.357188772230556e-05, "loss": 0.3811, "step": 610 }, { "epoch": 0.39191789608723543, "grad_norm": 1.3704719543457031, "learning_rate": 4.356117420184273e-05, "loss": 0.4704, "step": 611 }, { "epoch": 0.39255933290570877, "grad_norm": 1.2422642707824707, "learning_rate": 4.35504606813799e-05, "loss": 0.4366, "step": 612 }, { "epoch": 0.39320076972418216, "grad_norm": 1.3668595552444458, "learning_rate": 4.353974716091708e-05, "loss": 0.4036, "step": 613 }, { "epoch": 0.39384220654265556, "grad_norm": 1.1881579160690308, "learning_rate": 4.3529033640454256e-05, "loss": 0.4199, "step": 614 }, { "epoch": 0.39448364336112896, "grad_norm": 1.4034414291381836, "learning_rate": 4.351832011999143e-05, "loss": 0.4606, "step": 615 }, { "epoch": 0.3951250801796023, "grad_norm": 1.6402472257614136, "learning_rate": 4.3507606599528605e-05, "loss": 0.5982, "step": 616 }, { "epoch": 0.3957665169980757, "grad_norm": 1.5913333892822266, "learning_rate": 4.349689307906578e-05, "loss": 0.5526, "step": 617 }, { "epoch": 0.3964079538165491, "grad_norm": 1.5372097492218018, "learning_rate": 4.3486179558602955e-05, "loss": 0.4888, "step": 618 }, { "epoch": 0.3970493906350224, "grad_norm": 1.457160234451294, "learning_rate": 4.347546603814013e-05, "loss": 0.4778, "step": 619 }, { "epoch": 0.3976908274534958, "grad_norm": 1.2194206714630127, "learning_rate": 4.346475251767731e-05, "loss": 0.52, "step": 620 }, { "epoch": 0.3983322642719692, "grad_norm": 1.6855413913726807, "learning_rate": 4.345403899721448e-05, "loss": 0.6103, "step": 621 }, { "epoch": 0.3989737010904426, "grad_norm": 1.4891718626022339, "learning_rate": 4.344332547675166e-05, "loss": 0.5691, "step": 622 }, { "epoch": 0.39961513790891595, "grad_norm": 1.3337441682815552, "learning_rate": 4.343261195628884e-05, "loss": 0.4992, "step": 623 }, { "epoch": 0.40025657472738935, "grad_norm": 1.1811940670013428, "learning_rate": 4.342189843582602e-05, "loss": 0.4423, "step": 624 }, { "epoch": 0.40089801154586274, "grad_norm": 1.236438274383545, "learning_rate": 4.341118491536319e-05, "loss": 0.4912, "step": 625 }, { "epoch": 0.40153944836433614, "grad_norm": 1.446531891822815, "learning_rate": 4.3400471394900366e-05, "loss": 0.4952, "step": 626 }, { "epoch": 0.4021808851828095, "grad_norm": 1.1634560823440552, "learning_rate": 4.3389757874437544e-05, "loss": 0.4281, "step": 627 }, { "epoch": 0.4028223220012829, "grad_norm": 1.4928056001663208, "learning_rate": 4.3379044353974716e-05, "loss": 0.5181, "step": 628 }, { "epoch": 0.40346375881975627, "grad_norm": 1.3717461824417114, "learning_rate": 4.3368330833511894e-05, "loss": 0.439, "step": 629 }, { "epoch": 0.4041051956382296, "grad_norm": 1.3095824718475342, "learning_rate": 4.335761731304907e-05, "loss": 0.397, "step": 630 }, { "epoch": 0.404746632456703, "grad_norm": 1.4073431491851807, "learning_rate": 4.3346903792586244e-05, "loss": 0.4243, "step": 631 }, { "epoch": 0.4053880692751764, "grad_norm": 1.56426203250885, "learning_rate": 4.333619027212342e-05, "loss": 0.5968, "step": 632 }, { "epoch": 0.4060295060936498, "grad_norm": 1.4814070463180542, "learning_rate": 4.33254767516606e-05, "loss": 0.4555, "step": 633 }, { "epoch": 0.40667094291212313, "grad_norm": 1.3381128311157227, "learning_rate": 4.331476323119778e-05, "loss": 0.4282, "step": 634 }, { "epoch": 0.40731237973059653, "grad_norm": 1.3868017196655273, "learning_rate": 4.330404971073495e-05, "loss": 0.4883, "step": 635 }, { "epoch": 0.4079538165490699, "grad_norm": 1.2420063018798828, "learning_rate": 4.329333619027213e-05, "loss": 0.4168, "step": 636 }, { "epoch": 0.4085952533675433, "grad_norm": 1.1991676092147827, "learning_rate": 4.3282622669809306e-05, "loss": 0.4041, "step": 637 }, { "epoch": 0.40923669018601666, "grad_norm": 1.1373746395111084, "learning_rate": 4.327190914934648e-05, "loss": 0.4251, "step": 638 }, { "epoch": 0.40987812700449006, "grad_norm": 1.2478983402252197, "learning_rate": 4.3261195628883655e-05, "loss": 0.5293, "step": 639 }, { "epoch": 0.41051956382296345, "grad_norm": 1.371132254600525, "learning_rate": 4.325048210842083e-05, "loss": 0.4742, "step": 640 }, { "epoch": 0.4111610006414368, "grad_norm": 1.3154553174972534, "learning_rate": 4.3239768587958005e-05, "loss": 0.5298, "step": 641 }, { "epoch": 0.4118024374599102, "grad_norm": 1.251438856124878, "learning_rate": 4.322905506749518e-05, "loss": 0.4677, "step": 642 }, { "epoch": 0.4124438742783836, "grad_norm": 1.147836446762085, "learning_rate": 4.321834154703236e-05, "loss": 0.3857, "step": 643 }, { "epoch": 0.413085311096857, "grad_norm": 1.2261364459991455, "learning_rate": 4.320762802656953e-05, "loss": 0.4688, "step": 644 }, { "epoch": 0.4137267479153303, "grad_norm": 1.2125319242477417, "learning_rate": 4.3196914506106704e-05, "loss": 0.4296, "step": 645 }, { "epoch": 0.4143681847338037, "grad_norm": 1.2789543867111206, "learning_rate": 4.318620098564388e-05, "loss": 0.4083, "step": 646 }, { "epoch": 0.4150096215522771, "grad_norm": 1.4948880672454834, "learning_rate": 4.317548746518106e-05, "loss": 0.4655, "step": 647 }, { "epoch": 0.4156510583707505, "grad_norm": 1.376426339149475, "learning_rate": 4.316477394471823e-05, "loss": 0.5104, "step": 648 }, { "epoch": 0.41629249518922384, "grad_norm": 1.451549768447876, "learning_rate": 4.315406042425541e-05, "loss": 0.4341, "step": 649 }, { "epoch": 0.41693393200769724, "grad_norm": 1.4137637615203857, "learning_rate": 4.314334690379259e-05, "loss": 0.4119, "step": 650 }, { "epoch": 0.41757536882617063, "grad_norm": 1.3314013481140137, "learning_rate": 4.313263338332976e-05, "loss": 0.4308, "step": 651 }, { "epoch": 0.41821680564464403, "grad_norm": 1.4554474353790283, "learning_rate": 4.312191986286694e-05, "loss": 0.5511, "step": 652 }, { "epoch": 0.41885824246311737, "grad_norm": 1.4591790437698364, "learning_rate": 4.3111206342404115e-05, "loss": 0.542, "step": 653 }, { "epoch": 0.41949967928159076, "grad_norm": 1.203815221786499, "learning_rate": 4.3100492821941294e-05, "loss": 0.4415, "step": 654 }, { "epoch": 0.42014111610006416, "grad_norm": 1.5107988119125366, "learning_rate": 4.3089779301478465e-05, "loss": 0.5253, "step": 655 }, { "epoch": 0.4207825529185375, "grad_norm": 1.2913750410079956, "learning_rate": 4.307906578101564e-05, "loss": 0.5376, "step": 656 }, { "epoch": 0.4214239897370109, "grad_norm": 1.4411437511444092, "learning_rate": 4.306835226055282e-05, "loss": 0.5499, "step": 657 }, { "epoch": 0.4220654265554843, "grad_norm": 1.082398533821106, "learning_rate": 4.305763874008999e-05, "loss": 0.3898, "step": 658 }, { "epoch": 0.4227068633739577, "grad_norm": 1.3062257766723633, "learning_rate": 4.304692521962717e-05, "loss": 0.5035, "step": 659 }, { "epoch": 0.423348300192431, "grad_norm": 1.1961442232131958, "learning_rate": 4.303621169916435e-05, "loss": 0.4855, "step": 660 }, { "epoch": 0.4239897370109044, "grad_norm": 1.2047052383422852, "learning_rate": 4.302549817870152e-05, "loss": 0.5005, "step": 661 }, { "epoch": 0.4246311738293778, "grad_norm": 1.452361822128296, "learning_rate": 4.30147846582387e-05, "loss": 0.4208, "step": 662 }, { "epoch": 0.4252726106478512, "grad_norm": 1.0641976594924927, "learning_rate": 4.3004071137775877e-05, "loss": 0.3823, "step": 663 }, { "epoch": 0.42591404746632455, "grad_norm": 1.4622794389724731, "learning_rate": 4.2993357617313055e-05, "loss": 0.5715, "step": 664 }, { "epoch": 0.42655548428479795, "grad_norm": 1.3241081237792969, "learning_rate": 4.2982644096850226e-05, "loss": 0.4678, "step": 665 }, { "epoch": 0.42719692110327134, "grad_norm": 1.2142091989517212, "learning_rate": 4.2971930576387404e-05, "loss": 0.4042, "step": 666 }, { "epoch": 0.4278383579217447, "grad_norm": 1.5751606225967407, "learning_rate": 4.296121705592458e-05, "loss": 0.6015, "step": 667 }, { "epoch": 0.4284797947402181, "grad_norm": 1.2297223806381226, "learning_rate": 4.2950503535461754e-05, "loss": 0.4699, "step": 668 }, { "epoch": 0.4291212315586915, "grad_norm": 1.4452649354934692, "learning_rate": 4.293979001499893e-05, "loss": 0.5021, "step": 669 }, { "epoch": 0.42976266837716487, "grad_norm": 1.2511813640594482, "learning_rate": 4.292907649453611e-05, "loss": 0.414, "step": 670 }, { "epoch": 0.4304041051956382, "grad_norm": 1.3499305248260498, "learning_rate": 4.291836297407328e-05, "loss": 0.4959, "step": 671 }, { "epoch": 0.4310455420141116, "grad_norm": 1.4469267129898071, "learning_rate": 4.290764945361046e-05, "loss": 0.5662, "step": 672 }, { "epoch": 0.431686978832585, "grad_norm": 1.3142863512039185, "learning_rate": 4.289693593314764e-05, "loss": 0.4467, "step": 673 }, { "epoch": 0.4323284156510584, "grad_norm": 1.4183611869812012, "learning_rate": 4.288622241268481e-05, "loss": 0.54, "step": 674 }, { "epoch": 0.43296985246953174, "grad_norm": 1.4550334215164185, "learning_rate": 4.287550889222199e-05, "loss": 0.5213, "step": 675 }, { "epoch": 0.43361128928800513, "grad_norm": 1.2371617555618286, "learning_rate": 4.2864795371759165e-05, "loss": 0.4446, "step": 676 }, { "epoch": 0.4342527261064785, "grad_norm": 1.3688544034957886, "learning_rate": 4.2854081851296344e-05, "loss": 0.575, "step": 677 }, { "epoch": 0.43489416292495187, "grad_norm": 1.2038682699203491, "learning_rate": 4.2843368330833515e-05, "loss": 0.4279, "step": 678 }, { "epoch": 0.43553559974342526, "grad_norm": 1.120154857635498, "learning_rate": 4.283265481037069e-05, "loss": 0.4265, "step": 679 }, { "epoch": 0.43617703656189866, "grad_norm": 1.3668910264968872, "learning_rate": 4.2821941289907864e-05, "loss": 0.4838, "step": 680 }, { "epoch": 0.43681847338037205, "grad_norm": 1.348271369934082, "learning_rate": 4.281122776944504e-05, "loss": 0.3871, "step": 681 }, { "epoch": 0.4374599101988454, "grad_norm": 1.2377501726150513, "learning_rate": 4.2800514248982214e-05, "loss": 0.4858, "step": 682 }, { "epoch": 0.4381013470173188, "grad_norm": 1.3041191101074219, "learning_rate": 4.278980072851939e-05, "loss": 0.4701, "step": 683 }, { "epoch": 0.4387427838357922, "grad_norm": 1.3529102802276611, "learning_rate": 4.277908720805657e-05, "loss": 0.5371, "step": 684 }, { "epoch": 0.4393842206542656, "grad_norm": 1.236303448677063, "learning_rate": 4.276837368759374e-05, "loss": 0.4133, "step": 685 }, { "epoch": 0.4400256574727389, "grad_norm": 1.2306972742080688, "learning_rate": 4.275766016713092e-05, "loss": 0.4683, "step": 686 }, { "epoch": 0.4406670942912123, "grad_norm": 1.5021259784698486, "learning_rate": 4.27469466466681e-05, "loss": 0.5125, "step": 687 }, { "epoch": 0.4413085311096857, "grad_norm": 1.6686437129974365, "learning_rate": 4.273623312620527e-05, "loss": 0.5702, "step": 688 }, { "epoch": 0.44194996792815905, "grad_norm": 1.4889203310012817, "learning_rate": 4.272551960574245e-05, "loss": 0.5037, "step": 689 }, { "epoch": 0.44259140474663244, "grad_norm": 1.4493929147720337, "learning_rate": 4.2714806085279626e-05, "loss": 0.5045, "step": 690 }, { "epoch": 0.44323284156510584, "grad_norm": 1.206022024154663, "learning_rate": 4.27040925648168e-05, "loss": 0.3981, "step": 691 }, { "epoch": 0.44387427838357923, "grad_norm": 1.4577784538269043, "learning_rate": 4.2693379044353975e-05, "loss": 0.5633, "step": 692 }, { "epoch": 0.4445157152020526, "grad_norm": 1.0807557106018066, "learning_rate": 4.268266552389115e-05, "loss": 0.3949, "step": 693 }, { "epoch": 0.44515715202052597, "grad_norm": 1.3417600393295288, "learning_rate": 4.267195200342833e-05, "loss": 0.5531, "step": 694 }, { "epoch": 0.44579858883899937, "grad_norm": 1.0704056024551392, "learning_rate": 4.26612384829655e-05, "loss": 0.3737, "step": 695 }, { "epoch": 0.44644002565747276, "grad_norm": 1.160280704498291, "learning_rate": 4.265052496250268e-05, "loss": 0.4236, "step": 696 }, { "epoch": 0.4470814624759461, "grad_norm": 1.1598244905471802, "learning_rate": 4.263981144203986e-05, "loss": 0.4311, "step": 697 }, { "epoch": 0.4477228992944195, "grad_norm": 1.2300828695297241, "learning_rate": 4.262909792157703e-05, "loss": 0.459, "step": 698 }, { "epoch": 0.4483643361128929, "grad_norm": 1.7568156719207764, "learning_rate": 4.261838440111421e-05, "loss": 0.42, "step": 699 }, { "epoch": 0.4490057729313663, "grad_norm": 1.3146709203720093, "learning_rate": 4.260767088065139e-05, "loss": 0.4584, "step": 700 }, { "epoch": 0.4496472097498396, "grad_norm": 1.4753499031066895, "learning_rate": 4.259695736018856e-05, "loss": 0.5687, "step": 701 }, { "epoch": 0.450288646568313, "grad_norm": 1.204638957977295, "learning_rate": 4.2586243839725736e-05, "loss": 0.4533, "step": 702 }, { "epoch": 0.4509300833867864, "grad_norm": 1.1578384637832642, "learning_rate": 4.2575530319262914e-05, "loss": 0.3812, "step": 703 }, { "epoch": 0.45157152020525976, "grad_norm": 1.4107757806777954, "learning_rate": 4.2564816798800086e-05, "loss": 0.4513, "step": 704 }, { "epoch": 0.45221295702373315, "grad_norm": 1.5860992670059204, "learning_rate": 4.2554103278337264e-05, "loss": 0.5834, "step": 705 }, { "epoch": 0.45285439384220655, "grad_norm": 1.2096140384674072, "learning_rate": 4.254338975787444e-05, "loss": 0.3783, "step": 706 }, { "epoch": 0.45349583066067994, "grad_norm": 1.484701156616211, "learning_rate": 4.253267623741162e-05, "loss": 0.5251, "step": 707 }, { "epoch": 0.4541372674791533, "grad_norm": 1.4345386028289795, "learning_rate": 4.252196271694879e-05, "loss": 0.4031, "step": 708 }, { "epoch": 0.4547787042976267, "grad_norm": 1.4926538467407227, "learning_rate": 4.251124919648597e-05, "loss": 0.4177, "step": 709 }, { "epoch": 0.4554201411161001, "grad_norm": 1.1257978677749634, "learning_rate": 4.250053567602315e-05, "loss": 0.371, "step": 710 }, { "epoch": 0.45606157793457347, "grad_norm": 1.6188085079193115, "learning_rate": 4.248982215556032e-05, "loss": 0.4718, "step": 711 }, { "epoch": 0.4567030147530468, "grad_norm": 1.4205244779586792, "learning_rate": 4.24791086350975e-05, "loss": 0.4693, "step": 712 }, { "epoch": 0.4573444515715202, "grad_norm": 1.2922439575195312, "learning_rate": 4.2468395114634676e-05, "loss": 0.446, "step": 713 }, { "epoch": 0.4579858883899936, "grad_norm": 1.342252254486084, "learning_rate": 4.245768159417185e-05, "loss": 0.451, "step": 714 }, { "epoch": 0.45862732520846694, "grad_norm": 1.2729089260101318, "learning_rate": 4.244696807370902e-05, "loss": 0.4361, "step": 715 }, { "epoch": 0.45926876202694034, "grad_norm": 1.2568392753601074, "learning_rate": 4.2436254553246196e-05, "loss": 0.4341, "step": 716 }, { "epoch": 0.45991019884541373, "grad_norm": 1.4083093404769897, "learning_rate": 4.2425541032783375e-05, "loss": 0.5778, "step": 717 }, { "epoch": 0.4605516356638871, "grad_norm": 1.090470552444458, "learning_rate": 4.2414827512320546e-05, "loss": 0.3609, "step": 718 }, { "epoch": 0.46119307248236047, "grad_norm": 1.4944792985916138, "learning_rate": 4.2404113991857724e-05, "loss": 0.5323, "step": 719 }, { "epoch": 0.46183450930083386, "grad_norm": 1.1001687049865723, "learning_rate": 4.23934004713949e-05, "loss": 0.3723, "step": 720 }, { "epoch": 0.46247594611930726, "grad_norm": 1.3190070390701294, "learning_rate": 4.2382686950932074e-05, "loss": 0.476, "step": 721 }, { "epoch": 0.46311738293778065, "grad_norm": 1.412632942199707, "learning_rate": 4.237197343046925e-05, "loss": 0.5234, "step": 722 }, { "epoch": 0.463758819756254, "grad_norm": 1.3116475343704224, "learning_rate": 4.236125991000643e-05, "loss": 0.4377, "step": 723 }, { "epoch": 0.4644002565747274, "grad_norm": 1.3904707431793213, "learning_rate": 4.235054638954361e-05, "loss": 0.4731, "step": 724 }, { "epoch": 0.4650416933932008, "grad_norm": 1.249496579170227, "learning_rate": 4.233983286908078e-05, "loss": 0.4747, "step": 725 }, { "epoch": 0.4656831302116741, "grad_norm": 1.2780790328979492, "learning_rate": 4.232911934861796e-05, "loss": 0.4914, "step": 726 }, { "epoch": 0.4663245670301475, "grad_norm": 1.5581682920455933, "learning_rate": 4.2318405828155136e-05, "loss": 0.6051, "step": 727 }, { "epoch": 0.4669660038486209, "grad_norm": 1.0071502923965454, "learning_rate": 4.230769230769231e-05, "loss": 0.3577, "step": 728 }, { "epoch": 0.4676074406670943, "grad_norm": 1.3384190797805786, "learning_rate": 4.2296978787229485e-05, "loss": 0.4778, "step": 729 }, { "epoch": 0.46824887748556765, "grad_norm": 1.3645994663238525, "learning_rate": 4.2286265266766663e-05, "loss": 0.4172, "step": 730 }, { "epoch": 0.46889031430404104, "grad_norm": 1.4770419597625732, "learning_rate": 4.2275551746303835e-05, "loss": 0.4924, "step": 731 }, { "epoch": 0.46953175112251444, "grad_norm": 1.057324767112732, "learning_rate": 4.226483822584101e-05, "loss": 0.3649, "step": 732 }, { "epoch": 0.47017318794098784, "grad_norm": 1.1195363998413086, "learning_rate": 4.225412470537819e-05, "loss": 0.378, "step": 733 }, { "epoch": 0.4708146247594612, "grad_norm": 1.6304171085357666, "learning_rate": 4.224341118491536e-05, "loss": 0.596, "step": 734 }, { "epoch": 0.47145606157793457, "grad_norm": 1.5911062955856323, "learning_rate": 4.223269766445254e-05, "loss": 0.5163, "step": 735 }, { "epoch": 0.47209749839640797, "grad_norm": 1.3995252847671509, "learning_rate": 4.222198414398972e-05, "loss": 0.501, "step": 736 }, { "epoch": 0.47273893521488136, "grad_norm": 1.3473520278930664, "learning_rate": 4.22112706235269e-05, "loss": 0.3977, "step": 737 }, { "epoch": 0.4733803720333547, "grad_norm": 1.3058971166610718, "learning_rate": 4.220055710306407e-05, "loss": 0.4724, "step": 738 }, { "epoch": 0.4740218088518281, "grad_norm": 1.5070905685424805, "learning_rate": 4.2189843582601246e-05, "loss": 0.5595, "step": 739 }, { "epoch": 0.4746632456703015, "grad_norm": 1.484187126159668, "learning_rate": 4.2179130062138425e-05, "loss": 0.5547, "step": 740 }, { "epoch": 0.47530468248877483, "grad_norm": 1.2915997505187988, "learning_rate": 4.2168416541675596e-05, "loss": 0.4675, "step": 741 }, { "epoch": 0.4759461193072482, "grad_norm": 1.2221107482910156, "learning_rate": 4.2157703021212774e-05, "loss": 0.4211, "step": 742 }, { "epoch": 0.4765875561257216, "grad_norm": 1.3003102540969849, "learning_rate": 4.214698950074995e-05, "loss": 0.4977, "step": 743 }, { "epoch": 0.477228992944195, "grad_norm": 1.31505286693573, "learning_rate": 4.2136275980287124e-05, "loss": 0.4529, "step": 744 }, { "epoch": 0.47787042976266836, "grad_norm": 1.393858790397644, "learning_rate": 4.21255624598243e-05, "loss": 0.4687, "step": 745 }, { "epoch": 0.47851186658114175, "grad_norm": 1.1530884504318237, "learning_rate": 4.211484893936148e-05, "loss": 0.4416, "step": 746 }, { "epoch": 0.47915330339961515, "grad_norm": 1.0763828754425049, "learning_rate": 4.210413541889866e-05, "loss": 0.3941, "step": 747 }, { "epoch": 0.47979474021808854, "grad_norm": 1.4018279314041138, "learning_rate": 4.209342189843582e-05, "loss": 0.5317, "step": 748 }, { "epoch": 0.4804361770365619, "grad_norm": 1.3721460103988647, "learning_rate": 4.2082708377973e-05, "loss": 0.521, "step": 749 }, { "epoch": 0.4810776138550353, "grad_norm": 1.1628996133804321, "learning_rate": 4.207199485751018e-05, "loss": 0.4273, "step": 750 }, { "epoch": 0.4817190506735087, "grad_norm": 1.158600926399231, "learning_rate": 4.206128133704735e-05, "loss": 0.4194, "step": 751 }, { "epoch": 0.482360487491982, "grad_norm": 1.3068856000900269, "learning_rate": 4.205056781658453e-05, "loss": 0.4697, "step": 752 }, { "epoch": 0.4830019243104554, "grad_norm": 1.1982241868972778, "learning_rate": 4.203985429612171e-05, "loss": 0.3937, "step": 753 }, { "epoch": 0.4836433611289288, "grad_norm": 1.2753545045852661, "learning_rate": 4.2029140775658885e-05, "loss": 0.3974, "step": 754 }, { "epoch": 0.4842847979474022, "grad_norm": 1.3104493618011475, "learning_rate": 4.2018427255196056e-05, "loss": 0.505, "step": 755 }, { "epoch": 0.48492623476587554, "grad_norm": 1.3677289485931396, "learning_rate": 4.2007713734733234e-05, "loss": 0.5079, "step": 756 }, { "epoch": 0.48556767158434894, "grad_norm": 1.1045199632644653, "learning_rate": 4.199700021427041e-05, "loss": 0.3441, "step": 757 }, { "epoch": 0.48620910840282233, "grad_norm": 1.2459588050842285, "learning_rate": 4.1986286693807584e-05, "loss": 0.4512, "step": 758 }, { "epoch": 0.4868505452212957, "grad_norm": 1.630593180656433, "learning_rate": 4.197557317334476e-05, "loss": 0.5926, "step": 759 }, { "epoch": 0.48749198203976907, "grad_norm": 1.7605259418487549, "learning_rate": 4.196485965288194e-05, "loss": 0.6167, "step": 760 }, { "epoch": 0.48813341885824246, "grad_norm": 1.4542274475097656, "learning_rate": 4.195414613241911e-05, "loss": 0.462, "step": 761 }, { "epoch": 0.48877485567671586, "grad_norm": 1.2446599006652832, "learning_rate": 4.194343261195629e-05, "loss": 0.4364, "step": 762 }, { "epoch": 0.4894162924951892, "grad_norm": 1.4795591831207275, "learning_rate": 4.193271909149347e-05, "loss": 0.5114, "step": 763 }, { "epoch": 0.4900577293136626, "grad_norm": 1.3626521825790405, "learning_rate": 4.192200557103064e-05, "loss": 0.4816, "step": 764 }, { "epoch": 0.490699166132136, "grad_norm": 1.284379243850708, "learning_rate": 4.191129205056782e-05, "loss": 0.477, "step": 765 }, { "epoch": 0.4913406029506094, "grad_norm": 1.2010862827301025, "learning_rate": 4.1900578530104995e-05, "loss": 0.388, "step": 766 }, { "epoch": 0.4919820397690827, "grad_norm": 1.8307101726531982, "learning_rate": 4.1889865009642174e-05, "loss": 0.5398, "step": 767 }, { "epoch": 0.4926234765875561, "grad_norm": 1.1729912757873535, "learning_rate": 4.1879151489179345e-05, "loss": 0.4062, "step": 768 }, { "epoch": 0.4932649134060295, "grad_norm": 1.0755279064178467, "learning_rate": 4.186843796871652e-05, "loss": 0.3705, "step": 769 }, { "epoch": 0.4939063502245029, "grad_norm": 1.393713116645813, "learning_rate": 4.18577244482537e-05, "loss": 0.5219, "step": 770 }, { "epoch": 0.49454778704297625, "grad_norm": 1.1047090291976929, "learning_rate": 4.184701092779087e-05, "loss": 0.4061, "step": 771 }, { "epoch": 0.49518922386144965, "grad_norm": 1.1136821508407593, "learning_rate": 4.183629740732805e-05, "loss": 0.4097, "step": 772 }, { "epoch": 0.49583066067992304, "grad_norm": 1.0769213438034058, "learning_rate": 4.182558388686523e-05, "loss": 0.3601, "step": 773 }, { "epoch": 0.4964720974983964, "grad_norm": 1.0824848413467407, "learning_rate": 4.18148703664024e-05, "loss": 0.3965, "step": 774 }, { "epoch": 0.4971135343168698, "grad_norm": 1.1884863376617432, "learning_rate": 4.180415684593958e-05, "loss": 0.4332, "step": 775 }, { "epoch": 0.49775497113534317, "grad_norm": 1.1648573875427246, "learning_rate": 4.1793443325476757e-05, "loss": 0.4939, "step": 776 }, { "epoch": 0.49839640795381657, "grad_norm": 1.3294246196746826, "learning_rate": 4.1782729805013935e-05, "loss": 0.4728, "step": 777 }, { "epoch": 0.4990378447722899, "grad_norm": 1.077837586402893, "learning_rate": 4.1772016284551106e-05, "loss": 0.3821, "step": 778 }, { "epoch": 0.4996792815907633, "grad_norm": 1.3848767280578613, "learning_rate": 4.1761302764088284e-05, "loss": 0.5009, "step": 779 }, { "epoch": 0.5003207184092366, "grad_norm": 1.0822575092315674, "learning_rate": 4.175058924362546e-05, "loss": 0.3919, "step": 780 }, { "epoch": 0.50096215522771, "grad_norm": 1.3909189701080322, "learning_rate": 4.1739875723162634e-05, "loss": 0.5353, "step": 781 }, { "epoch": 0.5016035920461834, "grad_norm": 1.326123833656311, "learning_rate": 4.172916220269981e-05, "loss": 0.3982, "step": 782 }, { "epoch": 0.5022450288646568, "grad_norm": 1.2433537244796753, "learning_rate": 4.171844868223698e-05, "loss": 0.457, "step": 783 }, { "epoch": 0.5028864656831302, "grad_norm": 1.3176020383834839, "learning_rate": 4.170773516177416e-05, "loss": 0.3859, "step": 784 }, { "epoch": 0.5035279025016036, "grad_norm": 1.4606759548187256, "learning_rate": 4.169702164131133e-05, "loss": 0.5201, "step": 785 }, { "epoch": 0.504169339320077, "grad_norm": 1.4640673398971558, "learning_rate": 4.168630812084851e-05, "loss": 0.505, "step": 786 }, { "epoch": 0.5048107761385503, "grad_norm": 1.3853683471679688, "learning_rate": 4.167559460038569e-05, "loss": 0.4189, "step": 787 }, { "epoch": 0.5054522129570237, "grad_norm": 1.3774045705795288, "learning_rate": 4.166488107992286e-05, "loss": 0.4409, "step": 788 }, { "epoch": 0.5060936497754971, "grad_norm": 1.507002592086792, "learning_rate": 4.165416755946004e-05, "loss": 0.5571, "step": 789 }, { "epoch": 0.5067350865939705, "grad_norm": 1.3353524208068848, "learning_rate": 4.164345403899722e-05, "loss": 0.484, "step": 790 }, { "epoch": 0.5073765234124439, "grad_norm": 1.5265083312988281, "learning_rate": 4.163274051853439e-05, "loss": 0.5267, "step": 791 }, { "epoch": 0.5080179602309173, "grad_norm": 1.561117172241211, "learning_rate": 4.1622026998071566e-05, "loss": 0.6541, "step": 792 }, { "epoch": 0.5086593970493907, "grad_norm": 1.4025194644927979, "learning_rate": 4.1611313477608744e-05, "loss": 0.5838, "step": 793 }, { "epoch": 0.5093008338678641, "grad_norm": 1.234157919883728, "learning_rate": 4.160059995714592e-05, "loss": 0.4463, "step": 794 }, { "epoch": 0.5099422706863374, "grad_norm": 1.3999847173690796, "learning_rate": 4.1589886436683094e-05, "loss": 0.5149, "step": 795 }, { "epoch": 0.5105837075048107, "grad_norm": 1.2007912397384644, "learning_rate": 4.157917291622027e-05, "loss": 0.3863, "step": 796 }, { "epoch": 0.5112251443232841, "grad_norm": 1.1875790357589722, "learning_rate": 4.156845939575745e-05, "loss": 0.4373, "step": 797 }, { "epoch": 0.5118665811417575, "grad_norm": 1.2985470294952393, "learning_rate": 4.155774587529462e-05, "loss": 0.4475, "step": 798 }, { "epoch": 0.5125080179602309, "grad_norm": 1.313909649848938, "learning_rate": 4.15470323548318e-05, "loss": 0.4746, "step": 799 }, { "epoch": 0.5131494547787043, "grad_norm": 1.3383748531341553, "learning_rate": 4.153631883436898e-05, "loss": 0.5335, "step": 800 }, { "epoch": 0.5137908915971777, "grad_norm": 1.2042173147201538, "learning_rate": 4.152560531390615e-05, "loss": 0.4058, "step": 801 }, { "epoch": 0.514432328415651, "grad_norm": 1.5084099769592285, "learning_rate": 4.151489179344333e-05, "loss": 0.5757, "step": 802 }, { "epoch": 0.5150737652341244, "grad_norm": 1.2693897485733032, "learning_rate": 4.1504178272980506e-05, "loss": 0.5115, "step": 803 }, { "epoch": 0.5157152020525978, "grad_norm": 1.555182933807373, "learning_rate": 4.149346475251768e-05, "loss": 0.5145, "step": 804 }, { "epoch": 0.5163566388710712, "grad_norm": 1.2531101703643799, "learning_rate": 4.1482751232054855e-05, "loss": 0.4116, "step": 805 }, { "epoch": 0.5169980756895446, "grad_norm": 1.1960363388061523, "learning_rate": 4.147203771159203e-05, "loss": 0.4499, "step": 806 }, { "epoch": 0.517639512508018, "grad_norm": 1.2779021263122559, "learning_rate": 4.146132419112921e-05, "loss": 0.4833, "step": 807 }, { "epoch": 0.5182809493264914, "grad_norm": 1.207776427268982, "learning_rate": 4.145061067066638e-05, "loss": 0.3617, "step": 808 }, { "epoch": 0.5189223861449648, "grad_norm": 1.092637062072754, "learning_rate": 4.143989715020356e-05, "loss": 0.3812, "step": 809 }, { "epoch": 0.5195638229634381, "grad_norm": 1.1895534992218018, "learning_rate": 4.142918362974074e-05, "loss": 0.4333, "step": 810 }, { "epoch": 0.5202052597819115, "grad_norm": 1.5927592515945435, "learning_rate": 4.141847010927791e-05, "loss": 0.624, "step": 811 }, { "epoch": 0.5208466966003849, "grad_norm": 1.1472618579864502, "learning_rate": 4.140775658881509e-05, "loss": 0.3985, "step": 812 }, { "epoch": 0.5214881334188582, "grad_norm": 1.5903352499008179, "learning_rate": 4.139704306835227e-05, "loss": 0.4684, "step": 813 }, { "epoch": 0.5221295702373316, "grad_norm": 1.3882107734680176, "learning_rate": 4.138632954788944e-05, "loss": 0.5062, "step": 814 }, { "epoch": 0.522771007055805, "grad_norm": 1.4023207426071167, "learning_rate": 4.1375616027426616e-05, "loss": 0.4379, "step": 815 }, { "epoch": 0.5234124438742784, "grad_norm": 1.2989726066589355, "learning_rate": 4.1364902506963794e-05, "loss": 0.4681, "step": 816 }, { "epoch": 0.5240538806927517, "grad_norm": 1.5836584568023682, "learning_rate": 4.1354188986500966e-05, "loss": 0.5774, "step": 817 }, { "epoch": 0.5246953175112251, "grad_norm": 1.6587045192718506, "learning_rate": 4.134347546603814e-05, "loss": 0.6019, "step": 818 }, { "epoch": 0.5253367543296985, "grad_norm": 1.406650424003601, "learning_rate": 4.1332761945575315e-05, "loss": 0.4534, "step": 819 }, { "epoch": 0.5259781911481719, "grad_norm": 1.332986831665039, "learning_rate": 4.1322048425112493e-05, "loss": 0.4376, "step": 820 }, { "epoch": 0.5266196279666453, "grad_norm": 1.3276114463806152, "learning_rate": 4.1311334904649665e-05, "loss": 0.4627, "step": 821 }, { "epoch": 0.5272610647851187, "grad_norm": 1.2846254110336304, "learning_rate": 4.130062138418684e-05, "loss": 0.4786, "step": 822 }, { "epoch": 0.5279025016035921, "grad_norm": 1.4415920972824097, "learning_rate": 4.128990786372402e-05, "loss": 0.5013, "step": 823 }, { "epoch": 0.5285439384220654, "grad_norm": 1.376891016960144, "learning_rate": 4.12791943432612e-05, "loss": 0.4913, "step": 824 }, { "epoch": 0.5291853752405388, "grad_norm": 1.0928715467453003, "learning_rate": 4.126848082279837e-05, "loss": 0.4224, "step": 825 }, { "epoch": 0.5298268120590122, "grad_norm": 1.0921342372894287, "learning_rate": 4.125776730233555e-05, "loss": 0.3664, "step": 826 }, { "epoch": 0.5304682488774856, "grad_norm": 1.2860875129699707, "learning_rate": 4.124705378187273e-05, "loss": 0.5167, "step": 827 }, { "epoch": 0.531109685695959, "grad_norm": 1.2210197448730469, "learning_rate": 4.12363402614099e-05, "loss": 0.4129, "step": 828 }, { "epoch": 0.5317511225144324, "grad_norm": 1.5183844566345215, "learning_rate": 4.1225626740947077e-05, "loss": 0.6443, "step": 829 }, { "epoch": 0.5323925593329057, "grad_norm": 1.2901687622070312, "learning_rate": 4.1214913220484255e-05, "loss": 0.4952, "step": 830 }, { "epoch": 0.5330339961513791, "grad_norm": 1.2214730978012085, "learning_rate": 4.1204199700021426e-05, "loss": 0.5007, "step": 831 }, { "epoch": 0.5336754329698524, "grad_norm": 1.4725204706192017, "learning_rate": 4.1193486179558604e-05, "loss": 0.4715, "step": 832 }, { "epoch": 0.5343168697883258, "grad_norm": 1.5509145259857178, "learning_rate": 4.118277265909578e-05, "loss": 0.4902, "step": 833 }, { "epoch": 0.5349583066067992, "grad_norm": 1.6656055450439453, "learning_rate": 4.1172059138632954e-05, "loss": 0.5439, "step": 834 }, { "epoch": 0.5355997434252726, "grad_norm": 8.651101112365723, "learning_rate": 4.116134561817013e-05, "loss": 0.5291, "step": 835 }, { "epoch": 0.536241180243746, "grad_norm": 1.5183433294296265, "learning_rate": 4.115063209770731e-05, "loss": 0.4644, "step": 836 }, { "epoch": 0.5368826170622194, "grad_norm": 1.0932656526565552, "learning_rate": 4.113991857724449e-05, "loss": 0.3602, "step": 837 }, { "epoch": 0.5375240538806928, "grad_norm": 1.5199846029281616, "learning_rate": 4.112920505678166e-05, "loss": 0.412, "step": 838 }, { "epoch": 0.5381654906991661, "grad_norm": 1.3120222091674805, "learning_rate": 4.111849153631884e-05, "loss": 0.4485, "step": 839 }, { "epoch": 0.5388069275176395, "grad_norm": 1.5059500932693481, "learning_rate": 4.1107778015856016e-05, "loss": 0.5297, "step": 840 }, { "epoch": 0.5394483643361129, "grad_norm": 1.388665795326233, "learning_rate": 4.109706449539319e-05, "loss": 0.5221, "step": 841 }, { "epoch": 0.5400898011545863, "grad_norm": 1.032963514328003, "learning_rate": 4.1086350974930365e-05, "loss": 0.3436, "step": 842 }, { "epoch": 0.5407312379730597, "grad_norm": 1.4057632684707642, "learning_rate": 4.1075637454467543e-05, "loss": 0.5482, "step": 843 }, { "epoch": 0.5413726747915331, "grad_norm": 1.3029431104660034, "learning_rate": 4.1064923934004715e-05, "loss": 0.4548, "step": 844 }, { "epoch": 0.5420141116100065, "grad_norm": 1.2597070932388306, "learning_rate": 4.105421041354189e-05, "loss": 0.5244, "step": 845 }, { "epoch": 0.5426555484284797, "grad_norm": 1.3447906970977783, "learning_rate": 4.104349689307907e-05, "loss": 0.578, "step": 846 }, { "epoch": 0.5432969852469531, "grad_norm": 1.090559720993042, "learning_rate": 4.103278337261624e-05, "loss": 0.3768, "step": 847 }, { "epoch": 0.5439384220654265, "grad_norm": 1.2750370502471924, "learning_rate": 4.102206985215342e-05, "loss": 0.4762, "step": 848 }, { "epoch": 0.5445798588838999, "grad_norm": 1.337522029876709, "learning_rate": 4.10113563316906e-05, "loss": 0.5515, "step": 849 }, { "epoch": 0.5452212957023733, "grad_norm": 1.5000765323638916, "learning_rate": 4.100064281122778e-05, "loss": 0.4875, "step": 850 }, { "epoch": 0.5458627325208467, "grad_norm": 1.3014552593231201, "learning_rate": 4.098992929076494e-05, "loss": 0.4412, "step": 851 }, { "epoch": 0.5465041693393201, "grad_norm": 1.122015357017517, "learning_rate": 4.097921577030212e-05, "loss": 0.4095, "step": 852 }, { "epoch": 0.5471456061577935, "grad_norm": 1.2016229629516602, "learning_rate": 4.09685022498393e-05, "loss": 0.378, "step": 853 }, { "epoch": 0.5477870429762668, "grad_norm": 1.4457942247390747, "learning_rate": 4.0957788729376476e-05, "loss": 0.4585, "step": 854 }, { "epoch": 0.5484284797947402, "grad_norm": 1.36692214012146, "learning_rate": 4.094707520891365e-05, "loss": 0.4318, "step": 855 }, { "epoch": 0.5490699166132136, "grad_norm": 1.2974785566329956, "learning_rate": 4.0936361688450826e-05, "loss": 0.41, "step": 856 }, { "epoch": 0.549711353431687, "grad_norm": 1.2137665748596191, "learning_rate": 4.0925648167988004e-05, "loss": 0.3426, "step": 857 }, { "epoch": 0.5503527902501604, "grad_norm": 1.4634791612625122, "learning_rate": 4.0914934647525175e-05, "loss": 0.4679, "step": 858 }, { "epoch": 0.5509942270686338, "grad_norm": 1.336010217666626, "learning_rate": 4.090422112706235e-05, "loss": 0.457, "step": 859 }, { "epoch": 0.5516356638871072, "grad_norm": 1.440920352935791, "learning_rate": 4.089350760659953e-05, "loss": 0.5212, "step": 860 }, { "epoch": 0.5522771007055804, "grad_norm": 1.0569688081741333, "learning_rate": 4.08827940861367e-05, "loss": 0.3668, "step": 861 }, { "epoch": 0.5529185375240538, "grad_norm": 1.4445664882659912, "learning_rate": 4.087208056567388e-05, "loss": 0.5229, "step": 862 }, { "epoch": 0.5535599743425272, "grad_norm": 1.3622400760650635, "learning_rate": 4.086136704521106e-05, "loss": 0.5364, "step": 863 }, { "epoch": 0.5542014111610006, "grad_norm": 1.4694647789001465, "learning_rate": 4.085065352474823e-05, "loss": 0.4986, "step": 864 }, { "epoch": 0.554842847979474, "grad_norm": 1.3481063842773438, "learning_rate": 4.083994000428541e-05, "loss": 0.5152, "step": 865 }, { "epoch": 0.5554842847979474, "grad_norm": 1.137320876121521, "learning_rate": 4.082922648382259e-05, "loss": 0.4176, "step": 866 }, { "epoch": 0.5561257216164208, "grad_norm": 1.380717396736145, "learning_rate": 4.0818512963359765e-05, "loss": 0.5358, "step": 867 }, { "epoch": 0.5567671584348942, "grad_norm": 1.2629718780517578, "learning_rate": 4.0807799442896936e-05, "loss": 0.4745, "step": 868 }, { "epoch": 0.5574085952533675, "grad_norm": 1.3871471881866455, "learning_rate": 4.0797085922434114e-05, "loss": 0.5158, "step": 869 }, { "epoch": 0.5580500320718409, "grad_norm": 1.4508618116378784, "learning_rate": 4.078637240197129e-05, "loss": 0.5034, "step": 870 }, { "epoch": 0.5586914688903143, "grad_norm": 1.1893222332000732, "learning_rate": 4.0775658881508464e-05, "loss": 0.4231, "step": 871 }, { "epoch": 0.5593329057087877, "grad_norm": 1.332793116569519, "learning_rate": 4.076494536104564e-05, "loss": 0.4012, "step": 872 }, { "epoch": 0.5599743425272611, "grad_norm": 1.3900114297866821, "learning_rate": 4.075423184058282e-05, "loss": 0.5153, "step": 873 }, { "epoch": 0.5606157793457345, "grad_norm": 1.3905940055847168, "learning_rate": 4.074351832011999e-05, "loss": 0.4464, "step": 874 }, { "epoch": 0.5612572161642079, "grad_norm": 1.4020575284957886, "learning_rate": 4.073280479965717e-05, "loss": 0.5588, "step": 875 }, { "epoch": 0.5618986529826812, "grad_norm": 1.1343992948532104, "learning_rate": 4.072209127919435e-05, "loss": 0.3829, "step": 876 }, { "epoch": 0.5625400898011546, "grad_norm": 1.1779731512069702, "learning_rate": 4.071137775873152e-05, "loss": 0.4156, "step": 877 }, { "epoch": 0.563181526619628, "grad_norm": 1.205591082572937, "learning_rate": 4.07006642382687e-05, "loss": 0.4729, "step": 878 }, { "epoch": 0.5638229634381013, "grad_norm": 1.3715996742248535, "learning_rate": 4.0689950717805876e-05, "loss": 0.466, "step": 879 }, { "epoch": 0.5644644002565747, "grad_norm": 1.241431474685669, "learning_rate": 4.0679237197343054e-05, "loss": 0.4801, "step": 880 }, { "epoch": 0.5651058370750481, "grad_norm": 1.1296281814575195, "learning_rate": 4.0668523676880225e-05, "loss": 0.3854, "step": 881 }, { "epoch": 0.5657472738935215, "grad_norm": 1.0511831045150757, "learning_rate": 4.06578101564174e-05, "loss": 0.3504, "step": 882 }, { "epoch": 0.5663887107119948, "grad_norm": 1.2262699604034424, "learning_rate": 4.064709663595458e-05, "loss": 0.406, "step": 883 }, { "epoch": 0.5670301475304682, "grad_norm": 1.5853757858276367, "learning_rate": 4.063638311549175e-05, "loss": 0.5069, "step": 884 }, { "epoch": 0.5676715843489416, "grad_norm": 1.073994517326355, "learning_rate": 4.062566959502893e-05, "loss": 0.3417, "step": 885 }, { "epoch": 0.568313021167415, "grad_norm": 1.5912362337112427, "learning_rate": 4.06149560745661e-05, "loss": 0.5588, "step": 886 }, { "epoch": 0.5689544579858884, "grad_norm": 1.1669434309005737, "learning_rate": 4.060424255410328e-05, "loss": 0.4158, "step": 887 }, { "epoch": 0.5695958948043618, "grad_norm": 1.5763205289840698, "learning_rate": 4.059352903364045e-05, "loss": 0.628, "step": 888 }, { "epoch": 0.5702373316228352, "grad_norm": 1.308627963066101, "learning_rate": 4.058281551317763e-05, "loss": 0.4699, "step": 889 }, { "epoch": 0.5708787684413086, "grad_norm": 1.2786086797714233, "learning_rate": 4.057210199271481e-05, "loss": 0.4685, "step": 890 }, { "epoch": 0.5715202052597819, "grad_norm": 1.2460139989852905, "learning_rate": 4.056138847225198e-05, "loss": 0.3886, "step": 891 }, { "epoch": 0.5721616420782553, "grad_norm": 1.4151033163070679, "learning_rate": 4.055067495178916e-05, "loss": 0.5925, "step": 892 }, { "epoch": 0.5728030788967287, "grad_norm": 1.0729212760925293, "learning_rate": 4.0539961431326336e-05, "loss": 0.3822, "step": 893 }, { "epoch": 0.573444515715202, "grad_norm": 1.401073932647705, "learning_rate": 4.052924791086351e-05, "loss": 0.4823, "step": 894 }, { "epoch": 0.5740859525336754, "grad_norm": 1.1776906251907349, "learning_rate": 4.0518534390400685e-05, "loss": 0.3664, "step": 895 }, { "epoch": 0.5747273893521488, "grad_norm": 1.2522739171981812, "learning_rate": 4.050782086993786e-05, "loss": 0.4203, "step": 896 }, { "epoch": 0.5753688261706222, "grad_norm": 1.2894084453582764, "learning_rate": 4.049710734947504e-05, "loss": 0.5211, "step": 897 }, { "epoch": 0.5760102629890955, "grad_norm": 1.6159788370132446, "learning_rate": 4.048639382901221e-05, "loss": 0.4748, "step": 898 }, { "epoch": 0.5766516998075689, "grad_norm": 1.1975713968276978, "learning_rate": 4.047568030854939e-05, "loss": 0.4952, "step": 899 }, { "epoch": 0.5772931366260423, "grad_norm": 1.3848521709442139, "learning_rate": 4.046496678808657e-05, "loss": 0.5413, "step": 900 }, { "epoch": 0.5779345734445157, "grad_norm": 1.3215417861938477, "learning_rate": 4.045425326762374e-05, "loss": 0.5128, "step": 901 }, { "epoch": 0.5785760102629891, "grad_norm": 1.1821900606155396, "learning_rate": 4.044353974716092e-05, "loss": 0.4257, "step": 902 }, { "epoch": 0.5792174470814625, "grad_norm": 1.0449192523956299, "learning_rate": 4.04328262266981e-05, "loss": 0.3675, "step": 903 }, { "epoch": 0.5798588838999359, "grad_norm": 1.3802460432052612, "learning_rate": 4.042211270623527e-05, "loss": 0.468, "step": 904 }, { "epoch": 0.5805003207184093, "grad_norm": 1.334312915802002, "learning_rate": 4.0411399185772446e-05, "loss": 0.4915, "step": 905 }, { "epoch": 0.5811417575368826, "grad_norm": 1.3020806312561035, "learning_rate": 4.0400685665309625e-05, "loss": 0.4822, "step": 906 }, { "epoch": 0.581783194355356, "grad_norm": 1.5378156900405884, "learning_rate": 4.03899721448468e-05, "loss": 0.5692, "step": 907 }, { "epoch": 0.5824246311738294, "grad_norm": 1.176268458366394, "learning_rate": 4.0379258624383974e-05, "loss": 0.3655, "step": 908 }, { "epoch": 0.5830660679923028, "grad_norm": 1.3597474098205566, "learning_rate": 4.036854510392115e-05, "loss": 0.4813, "step": 909 }, { "epoch": 0.5837075048107762, "grad_norm": 1.1760586500167847, "learning_rate": 4.035783158345833e-05, "loss": 0.422, "step": 910 }, { "epoch": 0.5843489416292496, "grad_norm": 1.2299644947052002, "learning_rate": 4.03471180629955e-05, "loss": 0.4144, "step": 911 }, { "epoch": 0.584990378447723, "grad_norm": 1.7226940393447876, "learning_rate": 4.033640454253268e-05, "loss": 0.5044, "step": 912 }, { "epoch": 0.5856318152661962, "grad_norm": 1.3750571012496948, "learning_rate": 4.032569102206986e-05, "loss": 0.4805, "step": 913 }, { "epoch": 0.5862732520846696, "grad_norm": 1.2084760665893555, "learning_rate": 4.031497750160703e-05, "loss": 0.4138, "step": 914 }, { "epoch": 0.586914688903143, "grad_norm": 1.3225609064102173, "learning_rate": 4.030426398114421e-05, "loss": 0.4188, "step": 915 }, { "epoch": 0.5875561257216164, "grad_norm": 1.2927405834197998, "learning_rate": 4.0293550460681386e-05, "loss": 0.4641, "step": 916 }, { "epoch": 0.5881975625400898, "grad_norm": 1.1465972661972046, "learning_rate": 4.028283694021856e-05, "loss": 0.4137, "step": 917 }, { "epoch": 0.5888389993585632, "grad_norm": 1.2223292589187622, "learning_rate": 4.0272123419755735e-05, "loss": 0.4064, "step": 918 }, { "epoch": 0.5894804361770366, "grad_norm": 1.4366458654403687, "learning_rate": 4.026140989929291e-05, "loss": 0.548, "step": 919 }, { "epoch": 0.5901218729955099, "grad_norm": 1.1827925443649292, "learning_rate": 4.0250696378830085e-05, "loss": 0.3951, "step": 920 }, { "epoch": 0.5907633098139833, "grad_norm": 1.2374598979949951, "learning_rate": 4.0239982858367256e-05, "loss": 0.3899, "step": 921 }, { "epoch": 0.5914047466324567, "grad_norm": 1.4156516790390015, "learning_rate": 4.0229269337904434e-05, "loss": 0.4503, "step": 922 }, { "epoch": 0.5920461834509301, "grad_norm": 1.1548502445220947, "learning_rate": 4.021855581744161e-05, "loss": 0.4294, "step": 923 }, { "epoch": 0.5926876202694035, "grad_norm": 1.097030520439148, "learning_rate": 4.0207842296978784e-05, "loss": 0.4078, "step": 924 }, { "epoch": 0.5933290570878769, "grad_norm": 1.361930251121521, "learning_rate": 4.019712877651596e-05, "loss": 0.4688, "step": 925 }, { "epoch": 0.5939704939063503, "grad_norm": 1.2212930917739868, "learning_rate": 4.018641525605314e-05, "loss": 0.4338, "step": 926 }, { "epoch": 0.5946119307248237, "grad_norm": 1.2715790271759033, "learning_rate": 4.017570173559032e-05, "loss": 0.4795, "step": 927 }, { "epoch": 0.5952533675432969, "grad_norm": 1.3388895988464355, "learning_rate": 4.016498821512749e-05, "loss": 0.4612, "step": 928 }, { "epoch": 0.5958948043617703, "grad_norm": 0.8194640278816223, "learning_rate": 4.015427469466467e-05, "loss": 0.2959, "step": 929 }, { "epoch": 0.5965362411802437, "grad_norm": 1.5512464046478271, "learning_rate": 4.0143561174201846e-05, "loss": 0.5245, "step": 930 }, { "epoch": 0.5971776779987171, "grad_norm": 1.418296456336975, "learning_rate": 4.013284765373902e-05, "loss": 0.3837, "step": 931 }, { "epoch": 0.5978191148171905, "grad_norm": 0.9616816639900208, "learning_rate": 4.0122134133276195e-05, "loss": 0.3469, "step": 932 }, { "epoch": 0.5984605516356639, "grad_norm": 1.2837295532226562, "learning_rate": 4.0111420612813374e-05, "loss": 0.4797, "step": 933 }, { "epoch": 0.5991019884541373, "grad_norm": 1.281262755393982, "learning_rate": 4.0100707092350545e-05, "loss": 0.4788, "step": 934 }, { "epoch": 0.5997434252726106, "grad_norm": 1.205741286277771, "learning_rate": 4.008999357188772e-05, "loss": 0.3719, "step": 935 }, { "epoch": 0.600384862091084, "grad_norm": 1.6864328384399414, "learning_rate": 4.00792800514249e-05, "loss": 0.5594, "step": 936 }, { "epoch": 0.6010262989095574, "grad_norm": 1.4764186143875122, "learning_rate": 4.006856653096208e-05, "loss": 0.4431, "step": 937 }, { "epoch": 0.6016677357280308, "grad_norm": 1.2038589715957642, "learning_rate": 4.005785301049925e-05, "loss": 0.3925, "step": 938 }, { "epoch": 0.6023091725465042, "grad_norm": 1.1469310522079468, "learning_rate": 4.004713949003643e-05, "loss": 0.3869, "step": 939 }, { "epoch": 0.6029506093649776, "grad_norm": 1.1887799501419067, "learning_rate": 4.003642596957361e-05, "loss": 0.3744, "step": 940 }, { "epoch": 0.603592046183451, "grad_norm": 1.3979735374450684, "learning_rate": 4.002571244911078e-05, "loss": 0.4762, "step": 941 }, { "epoch": 0.6042334830019244, "grad_norm": 1.2185084819793701, "learning_rate": 4.0014998928647957e-05, "loss": 0.4274, "step": 942 }, { "epoch": 0.6048749198203976, "grad_norm": 1.2345707416534424, "learning_rate": 4.0004285408185135e-05, "loss": 0.45, "step": 943 }, { "epoch": 0.605516356638871, "grad_norm": 1.5587877035140991, "learning_rate": 3.9993571887722306e-05, "loss": 0.5743, "step": 944 }, { "epoch": 0.6061577934573444, "grad_norm": 1.3357875347137451, "learning_rate": 3.9982858367259484e-05, "loss": 0.4447, "step": 945 }, { "epoch": 0.6067992302758178, "grad_norm": 1.4396666288375854, "learning_rate": 3.997214484679666e-05, "loss": 0.5487, "step": 946 }, { "epoch": 0.6074406670942912, "grad_norm": 1.0936567783355713, "learning_rate": 3.9961431326333834e-05, "loss": 0.4349, "step": 947 }, { "epoch": 0.6080821039127646, "grad_norm": 1.2504557371139526, "learning_rate": 3.995071780587101e-05, "loss": 0.4598, "step": 948 }, { "epoch": 0.608723540731238, "grad_norm": 1.4477324485778809, "learning_rate": 3.994000428540819e-05, "loss": 0.4831, "step": 949 }, { "epoch": 0.6093649775497113, "grad_norm": 1.5358695983886719, "learning_rate": 3.992929076494537e-05, "loss": 0.5736, "step": 950 }, { "epoch": 0.6100064143681847, "grad_norm": 1.2252250909805298, "learning_rate": 3.991857724448254e-05, "loss": 0.4429, "step": 951 }, { "epoch": 0.6106478511866581, "grad_norm": 1.3017523288726807, "learning_rate": 3.990786372401972e-05, "loss": 0.5019, "step": 952 }, { "epoch": 0.6112892880051315, "grad_norm": 1.2566688060760498, "learning_rate": 3.9897150203556896e-05, "loss": 0.4165, "step": 953 }, { "epoch": 0.6119307248236049, "grad_norm": 1.200238585472107, "learning_rate": 3.988643668309407e-05, "loss": 0.3963, "step": 954 }, { "epoch": 0.6125721616420783, "grad_norm": 1.287919044494629, "learning_rate": 3.987572316263124e-05, "loss": 0.4597, "step": 955 }, { "epoch": 0.6132135984605517, "grad_norm": 1.528546690940857, "learning_rate": 3.986500964216842e-05, "loss": 0.5698, "step": 956 }, { "epoch": 0.613855035279025, "grad_norm": 1.0881909132003784, "learning_rate": 3.9854296121705595e-05, "loss": 0.3714, "step": 957 }, { "epoch": 0.6144964720974984, "grad_norm": 1.4619585275650024, "learning_rate": 3.9843582601242766e-05, "loss": 0.6199, "step": 958 }, { "epoch": 0.6151379089159718, "grad_norm": 1.2883695363998413, "learning_rate": 3.9832869080779944e-05, "loss": 0.427, "step": 959 }, { "epoch": 0.6157793457344451, "grad_norm": 1.3852620124816895, "learning_rate": 3.982215556031712e-05, "loss": 0.5039, "step": 960 }, { "epoch": 0.6164207825529185, "grad_norm": 1.2993477582931519, "learning_rate": 3.9811442039854294e-05, "loss": 0.439, "step": 961 }, { "epoch": 0.6170622193713919, "grad_norm": 1.2904140949249268, "learning_rate": 3.980072851939147e-05, "loss": 0.465, "step": 962 }, { "epoch": 0.6177036561898653, "grad_norm": 1.3402550220489502, "learning_rate": 3.979001499892865e-05, "loss": 0.4586, "step": 963 }, { "epoch": 0.6183450930083387, "grad_norm": 1.241959571838379, "learning_rate": 3.977930147846582e-05, "loss": 0.3636, "step": 964 }, { "epoch": 0.618986529826812, "grad_norm": 1.3350021839141846, "learning_rate": 3.9768587958003e-05, "loss": 0.4226, "step": 965 }, { "epoch": 0.6196279666452854, "grad_norm": 1.1267197132110596, "learning_rate": 3.975787443754018e-05, "loss": 0.3884, "step": 966 }, { "epoch": 0.6202694034637588, "grad_norm": 1.3216506242752075, "learning_rate": 3.9747160917077356e-05, "loss": 0.5573, "step": 967 }, { "epoch": 0.6209108402822322, "grad_norm": 1.2043335437774658, "learning_rate": 3.973644739661453e-05, "loss": 0.3987, "step": 968 }, { "epoch": 0.6215522771007056, "grad_norm": 1.5777662992477417, "learning_rate": 3.9725733876151706e-05, "loss": 0.6184, "step": 969 }, { "epoch": 0.622193713919179, "grad_norm": 1.0910344123840332, "learning_rate": 3.9715020355688884e-05, "loss": 0.4067, "step": 970 }, { "epoch": 0.6228351507376524, "grad_norm": 1.2690272331237793, "learning_rate": 3.9704306835226055e-05, "loss": 0.4883, "step": 971 }, { "epoch": 0.6234765875561257, "grad_norm": 1.403475046157837, "learning_rate": 3.969359331476323e-05, "loss": 0.5429, "step": 972 }, { "epoch": 0.6241180243745991, "grad_norm": 1.3963898420333862, "learning_rate": 3.968287979430041e-05, "loss": 0.4754, "step": 973 }, { "epoch": 0.6247594611930725, "grad_norm": 1.3634183406829834, "learning_rate": 3.967216627383758e-05, "loss": 0.4782, "step": 974 }, { "epoch": 0.6254008980115459, "grad_norm": 1.1733626127243042, "learning_rate": 3.966145275337476e-05, "loss": 0.4085, "step": 975 }, { "epoch": 0.6260423348300193, "grad_norm": 0.9425251483917236, "learning_rate": 3.965073923291194e-05, "loss": 0.3567, "step": 976 }, { "epoch": 0.6266837716484926, "grad_norm": 1.3632889986038208, "learning_rate": 3.964002571244911e-05, "loss": 0.5498, "step": 977 }, { "epoch": 0.627325208466966, "grad_norm": 1.252289056777954, "learning_rate": 3.962931219198629e-05, "loss": 0.5705, "step": 978 }, { "epoch": 0.6279666452854393, "grad_norm": 1.2713955640792847, "learning_rate": 3.961859867152347e-05, "loss": 0.4826, "step": 979 }, { "epoch": 0.6286080821039127, "grad_norm": 1.2929147481918335, "learning_rate": 3.9607885151060645e-05, "loss": 0.5021, "step": 980 }, { "epoch": 0.6292495189223861, "grad_norm": 1.0491588115692139, "learning_rate": 3.9597171630597816e-05, "loss": 0.3698, "step": 981 }, { "epoch": 0.6298909557408595, "grad_norm": 1.3283371925354004, "learning_rate": 3.9586458110134994e-05, "loss": 0.4637, "step": 982 }, { "epoch": 0.6305323925593329, "grad_norm": 1.2893418073654175, "learning_rate": 3.957574458967217e-05, "loss": 0.4872, "step": 983 }, { "epoch": 0.6311738293778063, "grad_norm": 1.0841765403747559, "learning_rate": 3.9565031069209344e-05, "loss": 0.4158, "step": 984 }, { "epoch": 0.6318152661962797, "grad_norm": 1.3234872817993164, "learning_rate": 3.955431754874652e-05, "loss": 0.5043, "step": 985 }, { "epoch": 0.6324567030147531, "grad_norm": 1.1865918636322021, "learning_rate": 3.95436040282837e-05, "loss": 0.4934, "step": 986 }, { "epoch": 0.6330981398332264, "grad_norm": 1.5995619297027588, "learning_rate": 3.953289050782087e-05, "loss": 0.5106, "step": 987 }, { "epoch": 0.6337395766516998, "grad_norm": 1.2161054611206055, "learning_rate": 3.952217698735805e-05, "loss": 0.4118, "step": 988 }, { "epoch": 0.6343810134701732, "grad_norm": 1.3398590087890625, "learning_rate": 3.951146346689522e-05, "loss": 0.4894, "step": 989 }, { "epoch": 0.6350224502886466, "grad_norm": 0.9474010467529297, "learning_rate": 3.95007499464324e-05, "loss": 0.3623, "step": 990 }, { "epoch": 0.63566388710712, "grad_norm": 1.439287543296814, "learning_rate": 3.949003642596957e-05, "loss": 0.5558, "step": 991 }, { "epoch": 0.6363053239255934, "grad_norm": 1.2617021799087524, "learning_rate": 3.947932290550675e-05, "loss": 0.4338, "step": 992 }, { "epoch": 0.6369467607440668, "grad_norm": 1.0970075130462646, "learning_rate": 3.946860938504393e-05, "loss": 0.425, "step": 993 }, { "epoch": 0.63758819756254, "grad_norm": 1.4124846458435059, "learning_rate": 3.94578958645811e-05, "loss": 0.4563, "step": 994 }, { "epoch": 0.6382296343810134, "grad_norm": 1.3978347778320312, "learning_rate": 3.9447182344118276e-05, "loss": 0.4628, "step": 995 }, { "epoch": 0.6388710711994868, "grad_norm": 1.1034553050994873, "learning_rate": 3.9436468823655455e-05, "loss": 0.4391, "step": 996 }, { "epoch": 0.6395125080179602, "grad_norm": 1.1866822242736816, "learning_rate": 3.942575530319263e-05, "loss": 0.4547, "step": 997 }, { "epoch": 0.6401539448364336, "grad_norm": 1.1259336471557617, "learning_rate": 3.9415041782729804e-05, "loss": 0.4426, "step": 998 }, { "epoch": 0.640795381654907, "grad_norm": 1.297776222229004, "learning_rate": 3.940432826226698e-05, "loss": 0.4719, "step": 999 }, { "epoch": 0.6414368184733804, "grad_norm": 1.4236356019973755, "learning_rate": 3.939361474180416e-05, "loss": 0.5263, "step": 1000 }, { "epoch": 0.6420782552918538, "grad_norm": 1.3605397939682007, "learning_rate": 3.938290122134133e-05, "loss": 0.5243, "step": 1001 }, { "epoch": 0.6427196921103271, "grad_norm": 1.5246000289916992, "learning_rate": 3.937218770087851e-05, "loss": 0.4842, "step": 1002 }, { "epoch": 0.6433611289288005, "grad_norm": 1.131611943244934, "learning_rate": 3.936147418041569e-05, "loss": 0.432, "step": 1003 }, { "epoch": 0.6440025657472739, "grad_norm": 1.7059141397476196, "learning_rate": 3.935076065995286e-05, "loss": 0.6318, "step": 1004 }, { "epoch": 0.6446440025657473, "grad_norm": 1.3596726655960083, "learning_rate": 3.934004713949004e-05, "loss": 0.4175, "step": 1005 }, { "epoch": 0.6452854393842207, "grad_norm": 1.1616674661636353, "learning_rate": 3.9329333619027216e-05, "loss": 0.3858, "step": 1006 }, { "epoch": 0.6459268762026941, "grad_norm": 1.1389764547348022, "learning_rate": 3.931862009856439e-05, "loss": 0.3816, "step": 1007 }, { "epoch": 0.6465683130211675, "grad_norm": 1.343776822090149, "learning_rate": 3.9307906578101565e-05, "loss": 0.4922, "step": 1008 }, { "epoch": 0.6472097498396407, "grad_norm": 1.2048218250274658, "learning_rate": 3.9297193057638743e-05, "loss": 0.4045, "step": 1009 }, { "epoch": 0.6478511866581141, "grad_norm": 1.4092684984207153, "learning_rate": 3.928647953717592e-05, "loss": 0.4977, "step": 1010 }, { "epoch": 0.6484926234765875, "grad_norm": 1.1653190851211548, "learning_rate": 3.927576601671309e-05, "loss": 0.4106, "step": 1011 }, { "epoch": 0.6491340602950609, "grad_norm": 1.3463717699050903, "learning_rate": 3.926505249625027e-05, "loss": 0.5463, "step": 1012 }, { "epoch": 0.6497754971135343, "grad_norm": 1.2555195093154907, "learning_rate": 3.925433897578745e-05, "loss": 0.4729, "step": 1013 }, { "epoch": 0.6504169339320077, "grad_norm": 1.2835227251052856, "learning_rate": 3.924362545532462e-05, "loss": 0.5097, "step": 1014 }, { "epoch": 0.6510583707504811, "grad_norm": 1.0698812007904053, "learning_rate": 3.92329119348618e-05, "loss": 0.4042, "step": 1015 }, { "epoch": 0.6516998075689544, "grad_norm": 1.1693459749221802, "learning_rate": 3.922219841439898e-05, "loss": 0.3965, "step": 1016 }, { "epoch": 0.6523412443874278, "grad_norm": 1.364894151687622, "learning_rate": 3.921148489393615e-05, "loss": 0.3473, "step": 1017 }, { "epoch": 0.6529826812059012, "grad_norm": 0.9167343378067017, "learning_rate": 3.9200771373473326e-05, "loss": 0.3324, "step": 1018 }, { "epoch": 0.6536241180243746, "grad_norm": 1.4452552795410156, "learning_rate": 3.9190057853010505e-05, "loss": 0.5028, "step": 1019 }, { "epoch": 0.654265554842848, "grad_norm": 1.330351710319519, "learning_rate": 3.917934433254768e-05, "loss": 0.4576, "step": 1020 }, { "epoch": 0.6549069916613214, "grad_norm": 1.3711036443710327, "learning_rate": 3.9168630812084854e-05, "loss": 0.4623, "step": 1021 }, { "epoch": 0.6555484284797948, "grad_norm": 1.4441359043121338, "learning_rate": 3.915791729162203e-05, "loss": 0.534, "step": 1022 }, { "epoch": 0.6561898652982682, "grad_norm": 1.0871295928955078, "learning_rate": 3.9147203771159204e-05, "loss": 0.3598, "step": 1023 }, { "epoch": 0.6568313021167415, "grad_norm": 1.2877275943756104, "learning_rate": 3.9136490250696375e-05, "loss": 0.3897, "step": 1024 }, { "epoch": 0.6574727389352149, "grad_norm": 1.5600671768188477, "learning_rate": 3.912577673023355e-05, "loss": 0.5709, "step": 1025 }, { "epoch": 0.6581141757536882, "grad_norm": 1.5129671096801758, "learning_rate": 3.911506320977073e-05, "loss": 0.5223, "step": 1026 }, { "epoch": 0.6587556125721616, "grad_norm": 1.6196608543395996, "learning_rate": 3.910434968930791e-05, "loss": 0.5505, "step": 1027 }, { "epoch": 0.659397049390635, "grad_norm": 1.411557674407959, "learning_rate": 3.909363616884508e-05, "loss": 0.5484, "step": 1028 }, { "epoch": 0.6600384862091084, "grad_norm": 1.2283834218978882, "learning_rate": 3.908292264838226e-05, "loss": 0.4299, "step": 1029 }, { "epoch": 0.6606799230275818, "grad_norm": 1.3428598642349243, "learning_rate": 3.907220912791944e-05, "loss": 0.4613, "step": 1030 }, { "epoch": 0.6613213598460551, "grad_norm": 1.1043540239334106, "learning_rate": 3.906149560745661e-05, "loss": 0.3809, "step": 1031 }, { "epoch": 0.6619627966645285, "grad_norm": 1.1642018556594849, "learning_rate": 3.905078208699379e-05, "loss": 0.4244, "step": 1032 }, { "epoch": 0.6626042334830019, "grad_norm": 1.299756407737732, "learning_rate": 3.9040068566530965e-05, "loss": 0.5412, "step": 1033 }, { "epoch": 0.6632456703014753, "grad_norm": 1.3185372352600098, "learning_rate": 3.9029355046068136e-05, "loss": 0.4088, "step": 1034 }, { "epoch": 0.6638871071199487, "grad_norm": 1.1347286701202393, "learning_rate": 3.9018641525605314e-05, "loss": 0.3927, "step": 1035 }, { "epoch": 0.6645285439384221, "grad_norm": 1.303605318069458, "learning_rate": 3.900792800514249e-05, "loss": 0.3703, "step": 1036 }, { "epoch": 0.6651699807568955, "grad_norm": 1.309631586074829, "learning_rate": 3.8997214484679664e-05, "loss": 0.4566, "step": 1037 }, { "epoch": 0.6658114175753689, "grad_norm": 1.1371171474456787, "learning_rate": 3.898650096421684e-05, "loss": 0.4131, "step": 1038 }, { "epoch": 0.6664528543938422, "grad_norm": 1.0537163019180298, "learning_rate": 3.897578744375402e-05, "loss": 0.3818, "step": 1039 }, { "epoch": 0.6670942912123156, "grad_norm": 1.0710943937301636, "learning_rate": 3.89650739232912e-05, "loss": 0.389, "step": 1040 }, { "epoch": 0.667735728030789, "grad_norm": 1.3136922121047974, "learning_rate": 3.895436040282837e-05, "loss": 0.441, "step": 1041 }, { "epoch": 0.6683771648492624, "grad_norm": 1.1545578241348267, "learning_rate": 3.894364688236555e-05, "loss": 0.4079, "step": 1042 }, { "epoch": 0.6690186016677357, "grad_norm": 1.2992953062057495, "learning_rate": 3.8932933361902726e-05, "loss": 0.4905, "step": 1043 }, { "epoch": 0.6696600384862091, "grad_norm": 1.2386518716812134, "learning_rate": 3.89222198414399e-05, "loss": 0.4112, "step": 1044 }, { "epoch": 0.6703014753046825, "grad_norm": 1.261354684829712, "learning_rate": 3.8911506320977075e-05, "loss": 0.4913, "step": 1045 }, { "epoch": 0.6709429121231558, "grad_norm": 1.143173336982727, "learning_rate": 3.8900792800514254e-05, "loss": 0.43, "step": 1046 }, { "epoch": 0.6715843489416292, "grad_norm": 1.1224448680877686, "learning_rate": 3.8890079280051425e-05, "loss": 0.4081, "step": 1047 }, { "epoch": 0.6722257857601026, "grad_norm": 1.5319575071334839, "learning_rate": 3.88793657595886e-05, "loss": 0.5757, "step": 1048 }, { "epoch": 0.672867222578576, "grad_norm": 1.361948847770691, "learning_rate": 3.886865223912578e-05, "loss": 0.5078, "step": 1049 }, { "epoch": 0.6735086593970494, "grad_norm": 1.4247140884399414, "learning_rate": 3.885793871866296e-05, "loss": 0.5444, "step": 1050 }, { "epoch": 0.6741500962155228, "grad_norm": 1.1457871198654175, "learning_rate": 3.884722519820013e-05, "loss": 0.4284, "step": 1051 }, { "epoch": 0.6747915330339962, "grad_norm": 1.2001783847808838, "learning_rate": 3.883651167773731e-05, "loss": 0.4828, "step": 1052 }, { "epoch": 0.6754329698524695, "grad_norm": 1.1662466526031494, "learning_rate": 3.882579815727449e-05, "loss": 0.418, "step": 1053 }, { "epoch": 0.6760744066709429, "grad_norm": 1.3747336864471436, "learning_rate": 3.881508463681166e-05, "loss": 0.477, "step": 1054 }, { "epoch": 0.6767158434894163, "grad_norm": 1.1026560068130493, "learning_rate": 3.880437111634884e-05, "loss": 0.4593, "step": 1055 }, { "epoch": 0.6773572803078897, "grad_norm": 1.3179329633712769, "learning_rate": 3.8793657595886015e-05, "loss": 0.4604, "step": 1056 }, { "epoch": 0.6779987171263631, "grad_norm": 1.0841675996780396, "learning_rate": 3.8782944075423186e-05, "loss": 0.4024, "step": 1057 }, { "epoch": 0.6786401539448365, "grad_norm": 1.2682255506515503, "learning_rate": 3.877223055496036e-05, "loss": 0.4521, "step": 1058 }, { "epoch": 0.6792815907633099, "grad_norm": 1.4913151264190674, "learning_rate": 3.8761517034497536e-05, "loss": 0.6894, "step": 1059 }, { "epoch": 0.6799230275817832, "grad_norm": 1.2807573080062866, "learning_rate": 3.8750803514034714e-05, "loss": 0.4848, "step": 1060 }, { "epoch": 0.6805644644002565, "grad_norm": 1.38847815990448, "learning_rate": 3.8740089993571885e-05, "loss": 0.5088, "step": 1061 }, { "epoch": 0.6812059012187299, "grad_norm": 1.297224760055542, "learning_rate": 3.872937647310906e-05, "loss": 0.5525, "step": 1062 }, { "epoch": 0.6818473380372033, "grad_norm": 1.5125054121017456, "learning_rate": 3.871866295264624e-05, "loss": 0.6569, "step": 1063 }, { "epoch": 0.6824887748556767, "grad_norm": 1.1970688104629517, "learning_rate": 3.870794943218341e-05, "loss": 0.4488, "step": 1064 }, { "epoch": 0.6831302116741501, "grad_norm": 1.165023922920227, "learning_rate": 3.869723591172059e-05, "loss": 0.3671, "step": 1065 }, { "epoch": 0.6837716484926235, "grad_norm": 1.3013148307800293, "learning_rate": 3.868652239125777e-05, "loss": 0.4601, "step": 1066 }, { "epoch": 0.6844130853110969, "grad_norm": 1.5365447998046875, "learning_rate": 3.867580887079495e-05, "loss": 0.6448, "step": 1067 }, { "epoch": 0.6850545221295702, "grad_norm": 1.3636821508407593, "learning_rate": 3.866509535033212e-05, "loss": 0.3971, "step": 1068 }, { "epoch": 0.6856959589480436, "grad_norm": 1.2537068128585815, "learning_rate": 3.86543818298693e-05, "loss": 0.4577, "step": 1069 }, { "epoch": 0.686337395766517, "grad_norm": 1.2861231565475464, "learning_rate": 3.8643668309406475e-05, "loss": 0.4564, "step": 1070 }, { "epoch": 0.6869788325849904, "grad_norm": 1.3716822862625122, "learning_rate": 3.8632954788943646e-05, "loss": 0.484, "step": 1071 }, { "epoch": 0.6876202694034638, "grad_norm": 1.2120386362075806, "learning_rate": 3.8622241268480825e-05, "loss": 0.4582, "step": 1072 }, { "epoch": 0.6882617062219372, "grad_norm": 1.239851474761963, "learning_rate": 3.8611527748018e-05, "loss": 0.448, "step": 1073 }, { "epoch": 0.6889031430404106, "grad_norm": 1.3657523393630981, "learning_rate": 3.8600814227555174e-05, "loss": 0.4631, "step": 1074 }, { "epoch": 0.689544579858884, "grad_norm": 1.1932767629623413, "learning_rate": 3.859010070709235e-05, "loss": 0.472, "step": 1075 }, { "epoch": 0.6901860166773572, "grad_norm": 1.4575055837631226, "learning_rate": 3.857938718662953e-05, "loss": 0.4724, "step": 1076 }, { "epoch": 0.6908274534958306, "grad_norm": 1.2992273569107056, "learning_rate": 3.85686736661667e-05, "loss": 0.4998, "step": 1077 }, { "epoch": 0.691468890314304, "grad_norm": 1.3318414688110352, "learning_rate": 3.855796014570388e-05, "loss": 0.4106, "step": 1078 }, { "epoch": 0.6921103271327774, "grad_norm": 1.2102125883102417, "learning_rate": 3.854724662524106e-05, "loss": 0.4712, "step": 1079 }, { "epoch": 0.6927517639512508, "grad_norm": 1.2621413469314575, "learning_rate": 3.8536533104778236e-05, "loss": 0.4462, "step": 1080 }, { "epoch": 0.6933932007697242, "grad_norm": 1.3429797887802124, "learning_rate": 3.852581958431541e-05, "loss": 0.5058, "step": 1081 }, { "epoch": 0.6940346375881976, "grad_norm": 1.2602695226669312, "learning_rate": 3.8515106063852586e-05, "loss": 0.4765, "step": 1082 }, { "epoch": 0.6946760744066709, "grad_norm": 1.0386359691619873, "learning_rate": 3.8504392543389764e-05, "loss": 0.3618, "step": 1083 }, { "epoch": 0.6953175112251443, "grad_norm": 1.3359767198562622, "learning_rate": 3.8493679022926935e-05, "loss": 0.4424, "step": 1084 }, { "epoch": 0.6959589480436177, "grad_norm": 1.1742463111877441, "learning_rate": 3.848296550246411e-05, "loss": 0.3804, "step": 1085 }, { "epoch": 0.6966003848620911, "grad_norm": 1.1688586473464966, "learning_rate": 3.847225198200129e-05, "loss": 0.4347, "step": 1086 }, { "epoch": 0.6972418216805645, "grad_norm": 1.3494518995285034, "learning_rate": 3.846153846153846e-05, "loss": 0.4417, "step": 1087 }, { "epoch": 0.6978832584990379, "grad_norm": 1.2400418519973755, "learning_rate": 3.845082494107564e-05, "loss": 0.471, "step": 1088 }, { "epoch": 0.6985246953175113, "grad_norm": 1.5455831289291382, "learning_rate": 3.844011142061282e-05, "loss": 0.5609, "step": 1089 }, { "epoch": 0.6991661321359846, "grad_norm": 1.3504935503005981, "learning_rate": 3.842939790014999e-05, "loss": 0.4547, "step": 1090 }, { "epoch": 0.699807568954458, "grad_norm": 1.0179890394210815, "learning_rate": 3.841868437968716e-05, "loss": 0.3764, "step": 1091 }, { "epoch": 0.7004490057729313, "grad_norm": 1.2095506191253662, "learning_rate": 3.840797085922434e-05, "loss": 0.3908, "step": 1092 }, { "epoch": 0.7010904425914047, "grad_norm": 1.527044653892517, "learning_rate": 3.839725733876152e-05, "loss": 0.4989, "step": 1093 }, { "epoch": 0.7017318794098781, "grad_norm": 1.1942849159240723, "learning_rate": 3.838654381829869e-05, "loss": 0.4112, "step": 1094 }, { "epoch": 0.7023733162283515, "grad_norm": 1.344535231590271, "learning_rate": 3.837583029783587e-05, "loss": 0.4641, "step": 1095 }, { "epoch": 0.7030147530468249, "grad_norm": 1.1275808811187744, "learning_rate": 3.8365116777373046e-05, "loss": 0.3577, "step": 1096 }, { "epoch": 0.7036561898652983, "grad_norm": 1.1432324647903442, "learning_rate": 3.8354403256910224e-05, "loss": 0.4069, "step": 1097 }, { "epoch": 0.7042976266837716, "grad_norm": 1.359153151512146, "learning_rate": 3.8343689736447395e-05, "loss": 0.4918, "step": 1098 }, { "epoch": 0.704939063502245, "grad_norm": 1.139015555381775, "learning_rate": 3.8332976215984574e-05, "loss": 0.4753, "step": 1099 }, { "epoch": 0.7055805003207184, "grad_norm": 0.9990530014038086, "learning_rate": 3.832226269552175e-05, "loss": 0.3506, "step": 1100 }, { "epoch": 0.7062219371391918, "grad_norm": 1.358556866645813, "learning_rate": 3.831154917505892e-05, "loss": 0.5239, "step": 1101 }, { "epoch": 0.7068633739576652, "grad_norm": 1.1609642505645752, "learning_rate": 3.83008356545961e-05, "loss": 0.4218, "step": 1102 }, { "epoch": 0.7075048107761386, "grad_norm": 1.070440411567688, "learning_rate": 3.829012213413328e-05, "loss": 0.3985, "step": 1103 }, { "epoch": 0.708146247594612, "grad_norm": 1.139501929283142, "learning_rate": 3.827940861367045e-05, "loss": 0.3797, "step": 1104 }, { "epoch": 0.7087876844130853, "grad_norm": 1.0836081504821777, "learning_rate": 3.826869509320763e-05, "loss": 0.4185, "step": 1105 }, { "epoch": 0.7094291212315587, "grad_norm": 1.312418818473816, "learning_rate": 3.825798157274481e-05, "loss": 0.5048, "step": 1106 }, { "epoch": 0.710070558050032, "grad_norm": 1.0163265466690063, "learning_rate": 3.824726805228198e-05, "loss": 0.351, "step": 1107 }, { "epoch": 0.7107119948685054, "grad_norm": 1.0763452053070068, "learning_rate": 3.8236554531819157e-05, "loss": 0.4297, "step": 1108 }, { "epoch": 0.7113534316869788, "grad_norm": 1.0293697118759155, "learning_rate": 3.8225841011356335e-05, "loss": 0.3643, "step": 1109 }, { "epoch": 0.7119948685054522, "grad_norm": 1.1435558795928955, "learning_rate": 3.821512749089351e-05, "loss": 0.4283, "step": 1110 }, { "epoch": 0.7126363053239256, "grad_norm": 1.2606201171875, "learning_rate": 3.8204413970430684e-05, "loss": 0.4367, "step": 1111 }, { "epoch": 0.7132777421423989, "grad_norm": 1.2227904796600342, "learning_rate": 3.819370044996786e-05, "loss": 0.4333, "step": 1112 }, { "epoch": 0.7139191789608723, "grad_norm": 1.099092721939087, "learning_rate": 3.818298692950504e-05, "loss": 0.3658, "step": 1113 }, { "epoch": 0.7145606157793457, "grad_norm": 1.1824415922164917, "learning_rate": 3.817227340904221e-05, "loss": 0.3939, "step": 1114 }, { "epoch": 0.7152020525978191, "grad_norm": 1.0330287218093872, "learning_rate": 3.816155988857939e-05, "loss": 0.3389, "step": 1115 }, { "epoch": 0.7158434894162925, "grad_norm": 1.511753797531128, "learning_rate": 3.815084636811657e-05, "loss": 0.5752, "step": 1116 }, { "epoch": 0.7164849262347659, "grad_norm": 1.3854836225509644, "learning_rate": 3.814013284765374e-05, "loss": 0.4704, "step": 1117 }, { "epoch": 0.7171263630532393, "grad_norm": 1.5371021032333374, "learning_rate": 3.812941932719092e-05, "loss": 0.5442, "step": 1118 }, { "epoch": 0.7177677998717127, "grad_norm": 0.962840735912323, "learning_rate": 3.8118705806728096e-05, "loss": 0.3551, "step": 1119 }, { "epoch": 0.718409236690186, "grad_norm": 0.9484176635742188, "learning_rate": 3.810799228626527e-05, "loss": 0.3061, "step": 1120 }, { "epoch": 0.7190506735086594, "grad_norm": 1.061556339263916, "learning_rate": 3.8097278765802445e-05, "loss": 0.3797, "step": 1121 }, { "epoch": 0.7196921103271328, "grad_norm": 1.3126050233840942, "learning_rate": 3.8086565245339624e-05, "loss": 0.4481, "step": 1122 }, { "epoch": 0.7203335471456062, "grad_norm": 1.5290493965148926, "learning_rate": 3.80758517248768e-05, "loss": 0.5751, "step": 1123 }, { "epoch": 0.7209749839640796, "grad_norm": 1.3439395427703857, "learning_rate": 3.806513820441397e-05, "loss": 0.438, "step": 1124 }, { "epoch": 0.721616420782553, "grad_norm": 1.8515596389770508, "learning_rate": 3.805442468395115e-05, "loss": 0.6316, "step": 1125 }, { "epoch": 0.7222578576010263, "grad_norm": 1.1769061088562012, "learning_rate": 3.804371116348832e-05, "loss": 0.3847, "step": 1126 }, { "epoch": 0.7228992944194996, "grad_norm": 1.2564125061035156, "learning_rate": 3.80329976430255e-05, "loss": 0.4195, "step": 1127 }, { "epoch": 0.723540731237973, "grad_norm": 1.2171711921691895, "learning_rate": 3.802228412256267e-05, "loss": 0.4022, "step": 1128 }, { "epoch": 0.7241821680564464, "grad_norm": 1.6462509632110596, "learning_rate": 3.801157060209985e-05, "loss": 0.4284, "step": 1129 }, { "epoch": 0.7248236048749198, "grad_norm": 1.1755093336105347, "learning_rate": 3.800085708163703e-05, "loss": 0.4487, "step": 1130 }, { "epoch": 0.7254650416933932, "grad_norm": 1.0763304233551025, "learning_rate": 3.79901435611742e-05, "loss": 0.3536, "step": 1131 }, { "epoch": 0.7261064785118666, "grad_norm": 1.1821320056915283, "learning_rate": 3.797943004071138e-05, "loss": 0.4384, "step": 1132 }, { "epoch": 0.72674791533034, "grad_norm": 1.155545711517334, "learning_rate": 3.7968716520248556e-05, "loss": 0.4344, "step": 1133 }, { "epoch": 0.7273893521488134, "grad_norm": 1.3574512004852295, "learning_rate": 3.795800299978573e-05, "loss": 0.4982, "step": 1134 }, { "epoch": 0.7280307889672867, "grad_norm": 1.3821054697036743, "learning_rate": 3.7947289479322906e-05, "loss": 0.552, "step": 1135 }, { "epoch": 0.7286722257857601, "grad_norm": 1.0708322525024414, "learning_rate": 3.7936575958860084e-05, "loss": 0.3918, "step": 1136 }, { "epoch": 0.7293136626042335, "grad_norm": 1.0427716970443726, "learning_rate": 3.7925862438397255e-05, "loss": 0.3828, "step": 1137 }, { "epoch": 0.7299550994227069, "grad_norm": 1.110722541809082, "learning_rate": 3.791514891793443e-05, "loss": 0.4581, "step": 1138 }, { "epoch": 0.7305965362411803, "grad_norm": 1.3519303798675537, "learning_rate": 3.790443539747161e-05, "loss": 0.4459, "step": 1139 }, { "epoch": 0.7312379730596537, "grad_norm": 1.1868494749069214, "learning_rate": 3.789372187700879e-05, "loss": 0.4129, "step": 1140 }, { "epoch": 0.731879409878127, "grad_norm": 1.1005388498306274, "learning_rate": 3.788300835654596e-05, "loss": 0.3881, "step": 1141 }, { "epoch": 0.7325208466966003, "grad_norm": 1.1505495309829712, "learning_rate": 3.787229483608314e-05, "loss": 0.4045, "step": 1142 }, { "epoch": 0.7331622835150737, "grad_norm": 1.7534098625183105, "learning_rate": 3.786158131562032e-05, "loss": 0.4702, "step": 1143 }, { "epoch": 0.7338037203335471, "grad_norm": 1.2809951305389404, "learning_rate": 3.785086779515749e-05, "loss": 0.4474, "step": 1144 }, { "epoch": 0.7344451571520205, "grad_norm": 1.3113625049591064, "learning_rate": 3.784015427469467e-05, "loss": 0.506, "step": 1145 }, { "epoch": 0.7350865939704939, "grad_norm": 1.3460543155670166, "learning_rate": 3.7829440754231845e-05, "loss": 0.5224, "step": 1146 }, { "epoch": 0.7357280307889673, "grad_norm": 2.380023717880249, "learning_rate": 3.7818727233769016e-05, "loss": 0.5721, "step": 1147 }, { "epoch": 0.7363694676074407, "grad_norm": 1.1970770359039307, "learning_rate": 3.7808013713306194e-05, "loss": 0.4362, "step": 1148 }, { "epoch": 0.737010904425914, "grad_norm": 1.3605365753173828, "learning_rate": 3.779730019284337e-05, "loss": 0.4099, "step": 1149 }, { "epoch": 0.7376523412443874, "grad_norm": 1.2413519620895386, "learning_rate": 3.7786586672380544e-05, "loss": 0.4441, "step": 1150 }, { "epoch": 0.7382937780628608, "grad_norm": 1.1885340213775635, "learning_rate": 3.777587315191772e-05, "loss": 0.3879, "step": 1151 }, { "epoch": 0.7389352148813342, "grad_norm": 1.1902191638946533, "learning_rate": 3.77651596314549e-05, "loss": 0.4392, "step": 1152 }, { "epoch": 0.7395766516998076, "grad_norm": 1.3818460702896118, "learning_rate": 3.775444611099208e-05, "loss": 0.5412, "step": 1153 }, { "epoch": 0.740218088518281, "grad_norm": 1.3318384885787964, "learning_rate": 3.774373259052925e-05, "loss": 0.4831, "step": 1154 }, { "epoch": 0.7408595253367544, "grad_norm": 1.0206453800201416, "learning_rate": 3.773301907006643e-05, "loss": 0.3728, "step": 1155 }, { "epoch": 0.7415009621552278, "grad_norm": 1.213383674621582, "learning_rate": 3.7722305549603606e-05, "loss": 0.3816, "step": 1156 }, { "epoch": 0.742142398973701, "grad_norm": 1.155503749847412, "learning_rate": 3.771159202914078e-05, "loss": 0.4145, "step": 1157 }, { "epoch": 0.7427838357921744, "grad_norm": 1.1206563711166382, "learning_rate": 3.7700878508677956e-05, "loss": 0.4443, "step": 1158 }, { "epoch": 0.7434252726106478, "grad_norm": 1.4185236692428589, "learning_rate": 3.7690164988215134e-05, "loss": 0.4611, "step": 1159 }, { "epoch": 0.7440667094291212, "grad_norm": 1.4579391479492188, "learning_rate": 3.7679451467752305e-05, "loss": 0.3694, "step": 1160 }, { "epoch": 0.7447081462475946, "grad_norm": 1.6860296726226807, "learning_rate": 3.7668737947289476e-05, "loss": 0.6304, "step": 1161 }, { "epoch": 0.745349583066068, "grad_norm": 1.180160403251648, "learning_rate": 3.7658024426826655e-05, "loss": 0.4231, "step": 1162 }, { "epoch": 0.7459910198845414, "grad_norm": 1.4433215856552124, "learning_rate": 3.764731090636383e-05, "loss": 0.4559, "step": 1163 }, { "epoch": 0.7466324567030147, "grad_norm": 1.4613738059997559, "learning_rate": 3.7636597385901004e-05, "loss": 0.5602, "step": 1164 }, { "epoch": 0.7472738935214881, "grad_norm": 1.3459186553955078, "learning_rate": 3.762588386543818e-05, "loss": 0.4538, "step": 1165 }, { "epoch": 0.7479153303399615, "grad_norm": 1.665813684463501, "learning_rate": 3.761517034497536e-05, "loss": 0.4768, "step": 1166 }, { "epoch": 0.7485567671584349, "grad_norm": 1.3028192520141602, "learning_rate": 3.760445682451253e-05, "loss": 0.4303, "step": 1167 }, { "epoch": 0.7491982039769083, "grad_norm": 1.168064832687378, "learning_rate": 3.759374330404971e-05, "loss": 0.4315, "step": 1168 }, { "epoch": 0.7498396407953817, "grad_norm": 1.589247703552246, "learning_rate": 3.758302978358689e-05, "loss": 0.4978, "step": 1169 }, { "epoch": 0.7504810776138551, "grad_norm": 1.0153563022613525, "learning_rate": 3.7572316263124066e-05, "loss": 0.368, "step": 1170 }, { "epoch": 0.7511225144323285, "grad_norm": 1.275264024734497, "learning_rate": 3.756160274266124e-05, "loss": 0.5404, "step": 1171 }, { "epoch": 0.7517639512508018, "grad_norm": 1.227818250656128, "learning_rate": 3.7550889222198416e-05, "loss": 0.5056, "step": 1172 }, { "epoch": 0.7524053880692751, "grad_norm": 1.0738743543624878, "learning_rate": 3.7540175701735594e-05, "loss": 0.4003, "step": 1173 }, { "epoch": 0.7530468248877485, "grad_norm": 1.3961069583892822, "learning_rate": 3.7529462181272765e-05, "loss": 0.5222, "step": 1174 }, { "epoch": 0.7536882617062219, "grad_norm": 1.15729820728302, "learning_rate": 3.7518748660809943e-05, "loss": 0.4188, "step": 1175 }, { "epoch": 0.7543296985246953, "grad_norm": 1.1083964109420776, "learning_rate": 3.750803514034712e-05, "loss": 0.3697, "step": 1176 }, { "epoch": 0.7549711353431687, "grad_norm": 1.2570427656173706, "learning_rate": 3.749732161988429e-05, "loss": 0.5172, "step": 1177 }, { "epoch": 0.7556125721616421, "grad_norm": 1.171955943107605, "learning_rate": 3.748660809942147e-05, "loss": 0.4253, "step": 1178 }, { "epoch": 0.7562540089801154, "grad_norm": 1.0206798315048218, "learning_rate": 3.747589457895865e-05, "loss": 0.3635, "step": 1179 }, { "epoch": 0.7568954457985888, "grad_norm": 1.073226809501648, "learning_rate": 3.746518105849583e-05, "loss": 0.3579, "step": 1180 }, { "epoch": 0.7575368826170622, "grad_norm": 1.09874427318573, "learning_rate": 3.7454467538033e-05, "loss": 0.3704, "step": 1181 }, { "epoch": 0.7581783194355356, "grad_norm": 1.3540855646133423, "learning_rate": 3.744375401757018e-05, "loss": 0.4301, "step": 1182 }, { "epoch": 0.758819756254009, "grad_norm": 1.209734559059143, "learning_rate": 3.7433040497107355e-05, "loss": 0.4075, "step": 1183 }, { "epoch": 0.7594611930724824, "grad_norm": 1.42978835105896, "learning_rate": 3.7422326976644526e-05, "loss": 0.4864, "step": 1184 }, { "epoch": 0.7601026298909558, "grad_norm": 1.2536567449569702, "learning_rate": 3.7411613456181705e-05, "loss": 0.4272, "step": 1185 }, { "epoch": 0.7607440667094291, "grad_norm": 1.120833396911621, "learning_rate": 3.740089993571888e-05, "loss": 0.3793, "step": 1186 }, { "epoch": 0.7613855035279025, "grad_norm": 1.3219529390335083, "learning_rate": 3.7390186415256054e-05, "loss": 0.4436, "step": 1187 }, { "epoch": 0.7620269403463759, "grad_norm": 1.1832911968231201, "learning_rate": 3.737947289479323e-05, "loss": 0.4062, "step": 1188 }, { "epoch": 0.7626683771648493, "grad_norm": 1.2027345895767212, "learning_rate": 3.736875937433041e-05, "loss": 0.3648, "step": 1189 }, { "epoch": 0.7633098139833226, "grad_norm": 1.2253721952438354, "learning_rate": 3.735804585386758e-05, "loss": 0.4222, "step": 1190 }, { "epoch": 0.763951250801796, "grad_norm": 1.578138828277588, "learning_rate": 3.734733233340476e-05, "loss": 0.5429, "step": 1191 }, { "epoch": 0.7645926876202694, "grad_norm": 1.2755351066589355, "learning_rate": 3.733661881294194e-05, "loss": 0.4498, "step": 1192 }, { "epoch": 0.7652341244387428, "grad_norm": 1.3397070169448853, "learning_rate": 3.7325905292479116e-05, "loss": 0.5235, "step": 1193 }, { "epoch": 0.7658755612572161, "grad_norm": 1.183610439300537, "learning_rate": 3.731519177201628e-05, "loss": 0.3624, "step": 1194 }, { "epoch": 0.7665169980756895, "grad_norm": 1.2139278650283813, "learning_rate": 3.730447825155346e-05, "loss": 0.3976, "step": 1195 }, { "epoch": 0.7671584348941629, "grad_norm": 1.0537134408950806, "learning_rate": 3.729376473109064e-05, "loss": 0.4377, "step": 1196 }, { "epoch": 0.7677998717126363, "grad_norm": 1.2525177001953125, "learning_rate": 3.728305121062781e-05, "loss": 0.4258, "step": 1197 }, { "epoch": 0.7684413085311097, "grad_norm": 1.5108003616333008, "learning_rate": 3.727233769016499e-05, "loss": 0.5405, "step": 1198 }, { "epoch": 0.7690827453495831, "grad_norm": 1.2295806407928467, "learning_rate": 3.7261624169702165e-05, "loss": 0.4842, "step": 1199 }, { "epoch": 0.7697241821680565, "grad_norm": 1.1465166807174683, "learning_rate": 3.725091064923934e-05, "loss": 0.4557, "step": 1200 }, { "epoch": 0.7703656189865298, "grad_norm": 1.2438173294067383, "learning_rate": 3.7240197128776514e-05, "loss": 0.4205, "step": 1201 }, { "epoch": 0.7710070558050032, "grad_norm": 1.3431932926177979, "learning_rate": 3.722948360831369e-05, "loss": 0.5427, "step": 1202 }, { "epoch": 0.7716484926234766, "grad_norm": 1.2178674936294556, "learning_rate": 3.721877008785087e-05, "loss": 0.4776, "step": 1203 }, { "epoch": 0.77228992944195, "grad_norm": 1.0947680473327637, "learning_rate": 3.720805656738804e-05, "loss": 0.3811, "step": 1204 }, { "epoch": 0.7729313662604234, "grad_norm": 1.1898974180221558, "learning_rate": 3.719734304692522e-05, "loss": 0.4113, "step": 1205 }, { "epoch": 0.7735728030788968, "grad_norm": 1.4633293151855469, "learning_rate": 3.71866295264624e-05, "loss": 0.5661, "step": 1206 }, { "epoch": 0.7742142398973701, "grad_norm": 1.1548844575881958, "learning_rate": 3.717591600599957e-05, "loss": 0.4645, "step": 1207 }, { "epoch": 0.7748556767158435, "grad_norm": 1.1602681875228882, "learning_rate": 3.716520248553675e-05, "loss": 0.4305, "step": 1208 }, { "epoch": 0.7754971135343168, "grad_norm": 1.2642673254013062, "learning_rate": 3.7154488965073926e-05, "loss": 0.4924, "step": 1209 }, { "epoch": 0.7761385503527902, "grad_norm": 1.4365172386169434, "learning_rate": 3.7143775444611104e-05, "loss": 0.5208, "step": 1210 }, { "epoch": 0.7767799871712636, "grad_norm": 1.307018518447876, "learning_rate": 3.7133061924148275e-05, "loss": 0.4595, "step": 1211 }, { "epoch": 0.777421423989737, "grad_norm": 1.163933277130127, "learning_rate": 3.7122348403685454e-05, "loss": 0.4229, "step": 1212 }, { "epoch": 0.7780628608082104, "grad_norm": 1.2816798686981201, "learning_rate": 3.711163488322263e-05, "loss": 0.4408, "step": 1213 }, { "epoch": 0.7787042976266838, "grad_norm": 1.202072024345398, "learning_rate": 3.71009213627598e-05, "loss": 0.4046, "step": 1214 }, { "epoch": 0.7793457344451572, "grad_norm": 1.695486307144165, "learning_rate": 3.709020784229698e-05, "loss": 0.5437, "step": 1215 }, { "epoch": 0.7799871712636305, "grad_norm": 1.1456395387649536, "learning_rate": 3.707949432183416e-05, "loss": 0.4376, "step": 1216 }, { "epoch": 0.7806286080821039, "grad_norm": 1.3664106130599976, "learning_rate": 3.706878080137133e-05, "loss": 0.5624, "step": 1217 }, { "epoch": 0.7812700449005773, "grad_norm": 1.3274571895599365, "learning_rate": 3.705806728090851e-05, "loss": 0.4179, "step": 1218 }, { "epoch": 0.7819114817190507, "grad_norm": 1.2680270671844482, "learning_rate": 3.704735376044569e-05, "loss": 0.4254, "step": 1219 }, { "epoch": 0.7825529185375241, "grad_norm": 1.2166998386383057, "learning_rate": 3.703664023998286e-05, "loss": 0.4931, "step": 1220 }, { "epoch": 0.7831943553559975, "grad_norm": 0.9994713664054871, "learning_rate": 3.702592671952004e-05, "loss": 0.3297, "step": 1221 }, { "epoch": 0.7838357921744709, "grad_norm": 1.3439527750015259, "learning_rate": 3.7015213199057215e-05, "loss": 0.4514, "step": 1222 }, { "epoch": 0.7844772289929441, "grad_norm": 1.040511131286621, "learning_rate": 3.700449967859439e-05, "loss": 0.3604, "step": 1223 }, { "epoch": 0.7851186658114175, "grad_norm": 1.3185997009277344, "learning_rate": 3.6993786158131564e-05, "loss": 0.3821, "step": 1224 }, { "epoch": 0.7857601026298909, "grad_norm": 1.0463542938232422, "learning_rate": 3.698307263766874e-05, "loss": 0.4101, "step": 1225 }, { "epoch": 0.7864015394483643, "grad_norm": 1.1836129426956177, "learning_rate": 3.697235911720592e-05, "loss": 0.4393, "step": 1226 }, { "epoch": 0.7870429762668377, "grad_norm": 1.2741938829421997, "learning_rate": 3.696164559674309e-05, "loss": 0.4709, "step": 1227 }, { "epoch": 0.7876844130853111, "grad_norm": 1.084947109222412, "learning_rate": 3.695093207628027e-05, "loss": 0.4027, "step": 1228 }, { "epoch": 0.7883258499037845, "grad_norm": 1.0888118743896484, "learning_rate": 3.694021855581744e-05, "loss": 0.4293, "step": 1229 }, { "epoch": 0.7889672867222579, "grad_norm": 1.2168490886688232, "learning_rate": 3.692950503535462e-05, "loss": 0.4522, "step": 1230 }, { "epoch": 0.7896087235407312, "grad_norm": 1.039088487625122, "learning_rate": 3.691879151489179e-05, "loss": 0.3533, "step": 1231 }, { "epoch": 0.7902501603592046, "grad_norm": 1.1220341920852661, "learning_rate": 3.690807799442897e-05, "loss": 0.3933, "step": 1232 }, { "epoch": 0.790891597177678, "grad_norm": 1.3062716722488403, "learning_rate": 3.689736447396615e-05, "loss": 0.4657, "step": 1233 }, { "epoch": 0.7915330339961514, "grad_norm": 1.1984823942184448, "learning_rate": 3.688665095350332e-05, "loss": 0.4645, "step": 1234 }, { "epoch": 0.7921744708146248, "grad_norm": 0.9890860319137573, "learning_rate": 3.68759374330405e-05, "loss": 0.3412, "step": 1235 }, { "epoch": 0.7928159076330982, "grad_norm": 1.1147280931472778, "learning_rate": 3.6865223912577675e-05, "loss": 0.3978, "step": 1236 }, { "epoch": 0.7934573444515716, "grad_norm": 1.2585309743881226, "learning_rate": 3.6854510392114846e-05, "loss": 0.5169, "step": 1237 }, { "epoch": 0.7940987812700449, "grad_norm": 1.4564473628997803, "learning_rate": 3.6843796871652024e-05, "loss": 0.4719, "step": 1238 }, { "epoch": 0.7947402180885182, "grad_norm": 1.3545230627059937, "learning_rate": 3.68330833511892e-05, "loss": 0.4452, "step": 1239 }, { "epoch": 0.7953816549069916, "grad_norm": 1.2099623680114746, "learning_rate": 3.682236983072638e-05, "loss": 0.4651, "step": 1240 }, { "epoch": 0.796023091725465, "grad_norm": 1.134440541267395, "learning_rate": 3.681165631026355e-05, "loss": 0.385, "step": 1241 }, { "epoch": 0.7966645285439384, "grad_norm": 1.096327781677246, "learning_rate": 3.680094278980073e-05, "loss": 0.3833, "step": 1242 }, { "epoch": 0.7973059653624118, "grad_norm": 1.0990830659866333, "learning_rate": 3.679022926933791e-05, "loss": 0.3698, "step": 1243 }, { "epoch": 0.7979474021808852, "grad_norm": 1.1587586402893066, "learning_rate": 3.677951574887508e-05, "loss": 0.379, "step": 1244 }, { "epoch": 0.7985888389993585, "grad_norm": 1.322913646697998, "learning_rate": 3.676880222841226e-05, "loss": 0.4228, "step": 1245 }, { "epoch": 0.7992302758178319, "grad_norm": 1.26240873336792, "learning_rate": 3.6758088707949436e-05, "loss": 0.3963, "step": 1246 }, { "epoch": 0.7998717126363053, "grad_norm": 1.4368200302124023, "learning_rate": 3.674737518748661e-05, "loss": 0.538, "step": 1247 }, { "epoch": 0.8005131494547787, "grad_norm": 1.2090222835540771, "learning_rate": 3.6736661667023786e-05, "loss": 0.3821, "step": 1248 }, { "epoch": 0.8011545862732521, "grad_norm": 1.392318844795227, "learning_rate": 3.6725948146560964e-05, "loss": 0.416, "step": 1249 }, { "epoch": 0.8017960230917255, "grad_norm": 1.241204023361206, "learning_rate": 3.6715234626098135e-05, "loss": 0.3862, "step": 1250 }, { "epoch": 0.8024374599101989, "grad_norm": 1.3822706937789917, "learning_rate": 3.670452110563531e-05, "loss": 0.5331, "step": 1251 }, { "epoch": 0.8030788967286723, "grad_norm": 1.1920838356018066, "learning_rate": 3.669380758517249e-05, "loss": 0.3934, "step": 1252 }, { "epoch": 0.8037203335471456, "grad_norm": 1.4353491067886353, "learning_rate": 3.668309406470967e-05, "loss": 0.4726, "step": 1253 }, { "epoch": 0.804361770365619, "grad_norm": 1.2777469158172607, "learning_rate": 3.667238054424684e-05, "loss": 0.4324, "step": 1254 }, { "epoch": 0.8050032071840924, "grad_norm": 1.135968565940857, "learning_rate": 3.666166702378402e-05, "loss": 0.3976, "step": 1255 }, { "epoch": 0.8056446440025657, "grad_norm": 1.0746190547943115, "learning_rate": 3.66509535033212e-05, "loss": 0.3948, "step": 1256 }, { "epoch": 0.8062860808210391, "grad_norm": 1.1536766290664673, "learning_rate": 3.664023998285837e-05, "loss": 0.4186, "step": 1257 }, { "epoch": 0.8069275176395125, "grad_norm": 1.350358486175537, "learning_rate": 3.662952646239555e-05, "loss": 0.5515, "step": 1258 }, { "epoch": 0.8075689544579859, "grad_norm": 1.4384512901306152, "learning_rate": 3.6618812941932725e-05, "loss": 0.5292, "step": 1259 }, { "epoch": 0.8082103912764592, "grad_norm": 1.3189529180526733, "learning_rate": 3.6608099421469896e-05, "loss": 0.5506, "step": 1260 }, { "epoch": 0.8088518280949326, "grad_norm": 1.0947953462600708, "learning_rate": 3.6597385901007074e-05, "loss": 0.4341, "step": 1261 }, { "epoch": 0.809493264913406, "grad_norm": 1.348351001739502, "learning_rate": 3.658667238054425e-05, "loss": 0.472, "step": 1262 }, { "epoch": 0.8101347017318794, "grad_norm": 1.2500568628311157, "learning_rate": 3.6575958860081424e-05, "loss": 0.3671, "step": 1263 }, { "epoch": 0.8107761385503528, "grad_norm": 1.308712124824524, "learning_rate": 3.6565245339618595e-05, "loss": 0.5236, "step": 1264 }, { "epoch": 0.8114175753688262, "grad_norm": 1.0822592973709106, "learning_rate": 3.6554531819155774e-05, "loss": 0.423, "step": 1265 }, { "epoch": 0.8120590121872996, "grad_norm": 1.2644033432006836, "learning_rate": 3.654381829869295e-05, "loss": 0.4939, "step": 1266 }, { "epoch": 0.812700449005773, "grad_norm": 1.2622339725494385, "learning_rate": 3.653310477823012e-05, "loss": 0.4095, "step": 1267 }, { "epoch": 0.8133418858242463, "grad_norm": 1.1020022630691528, "learning_rate": 3.65223912577673e-05, "loss": 0.4106, "step": 1268 }, { "epoch": 0.8139833226427197, "grad_norm": 1.1881436109542847, "learning_rate": 3.651167773730448e-05, "loss": 0.3367, "step": 1269 }, { "epoch": 0.8146247594611931, "grad_norm": 1.2743650674819946, "learning_rate": 3.650096421684166e-05, "loss": 0.4595, "step": 1270 }, { "epoch": 0.8152661962796665, "grad_norm": 1.2757247686386108, "learning_rate": 3.649025069637883e-05, "loss": 0.4455, "step": 1271 }, { "epoch": 0.8159076330981399, "grad_norm": 1.1708858013153076, "learning_rate": 3.647953717591601e-05, "loss": 0.398, "step": 1272 }, { "epoch": 0.8165490699166132, "grad_norm": 1.6543279886245728, "learning_rate": 3.6468823655453185e-05, "loss": 0.6693, "step": 1273 }, { "epoch": 0.8171905067350866, "grad_norm": 1.4076522588729858, "learning_rate": 3.6458110134990357e-05, "loss": 0.5264, "step": 1274 }, { "epoch": 0.8178319435535599, "grad_norm": 1.3309688568115234, "learning_rate": 3.6447396614527535e-05, "loss": 0.4536, "step": 1275 }, { "epoch": 0.8184733803720333, "grad_norm": 1.6172213554382324, "learning_rate": 3.643668309406471e-05, "loss": 0.4418, "step": 1276 }, { "epoch": 0.8191148171905067, "grad_norm": 1.1154173612594604, "learning_rate": 3.6425969573601884e-05, "loss": 0.3812, "step": 1277 }, { "epoch": 0.8197562540089801, "grad_norm": 1.399690866470337, "learning_rate": 3.641525605313906e-05, "loss": 0.4648, "step": 1278 }, { "epoch": 0.8203976908274535, "grad_norm": 1.0137075185775757, "learning_rate": 3.640454253267624e-05, "loss": 0.3433, "step": 1279 }, { "epoch": 0.8210391276459269, "grad_norm": 1.188151478767395, "learning_rate": 3.639382901221341e-05, "loss": 0.4443, "step": 1280 }, { "epoch": 0.8216805644644003, "grad_norm": 0.9639889001846313, "learning_rate": 3.638311549175059e-05, "loss": 0.3533, "step": 1281 }, { "epoch": 0.8223220012828736, "grad_norm": 1.0599385499954224, "learning_rate": 3.637240197128777e-05, "loss": 0.3845, "step": 1282 }, { "epoch": 0.822963438101347, "grad_norm": 1.450360894203186, "learning_rate": 3.6361688450824946e-05, "loss": 0.603, "step": 1283 }, { "epoch": 0.8236048749198204, "grad_norm": 1.2312804460525513, "learning_rate": 3.635097493036212e-05, "loss": 0.4257, "step": 1284 }, { "epoch": 0.8242463117382938, "grad_norm": 1.3465194702148438, "learning_rate": 3.6340261409899296e-05, "loss": 0.5098, "step": 1285 }, { "epoch": 0.8248877485567672, "grad_norm": 0.9687694907188416, "learning_rate": 3.6329547889436474e-05, "loss": 0.3422, "step": 1286 }, { "epoch": 0.8255291853752406, "grad_norm": 1.0820081233978271, "learning_rate": 3.6318834368973645e-05, "loss": 0.3896, "step": 1287 }, { "epoch": 0.826170622193714, "grad_norm": 1.4262521266937256, "learning_rate": 3.6308120848510823e-05, "loss": 0.5888, "step": 1288 }, { "epoch": 0.8268120590121874, "grad_norm": 1.3093100786209106, "learning_rate": 3.6297407328048e-05, "loss": 0.4504, "step": 1289 }, { "epoch": 0.8274534958306606, "grad_norm": 1.1063042879104614, "learning_rate": 3.628669380758517e-05, "loss": 0.3591, "step": 1290 }, { "epoch": 0.828094932649134, "grad_norm": 1.125356912612915, "learning_rate": 3.627598028712235e-05, "loss": 0.388, "step": 1291 }, { "epoch": 0.8287363694676074, "grad_norm": 0.9333001971244812, "learning_rate": 3.626526676665953e-05, "loss": 0.3349, "step": 1292 }, { "epoch": 0.8293778062860808, "grad_norm": 1.0782759189605713, "learning_rate": 3.625455324619671e-05, "loss": 0.3865, "step": 1293 }, { "epoch": 0.8300192431045542, "grad_norm": 1.2450157403945923, "learning_rate": 3.624383972573388e-05, "loss": 0.4343, "step": 1294 }, { "epoch": 0.8306606799230276, "grad_norm": 1.457427978515625, "learning_rate": 3.623312620527106e-05, "loss": 0.5367, "step": 1295 }, { "epoch": 0.831302116741501, "grad_norm": 1.0607439279556274, "learning_rate": 3.6222412684808235e-05, "loss": 0.3179, "step": 1296 }, { "epoch": 0.8319435535599743, "grad_norm": 1.293769359588623, "learning_rate": 3.62116991643454e-05, "loss": 0.423, "step": 1297 }, { "epoch": 0.8325849903784477, "grad_norm": 1.1589988470077515, "learning_rate": 3.620098564388258e-05, "loss": 0.4364, "step": 1298 }, { "epoch": 0.8332264271969211, "grad_norm": 1.2471736669540405, "learning_rate": 3.6190272123419756e-05, "loss": 0.473, "step": 1299 }, { "epoch": 0.8338678640153945, "grad_norm": 1.056077003479004, "learning_rate": 3.6179558602956934e-05, "loss": 0.3718, "step": 1300 }, { "epoch": 0.8345093008338679, "grad_norm": 1.1283036470413208, "learning_rate": 3.6168845082494106e-05, "loss": 0.452, "step": 1301 }, { "epoch": 0.8351507376523413, "grad_norm": 1.2855172157287598, "learning_rate": 3.6158131562031284e-05, "loss": 0.3933, "step": 1302 }, { "epoch": 0.8357921744708147, "grad_norm": 1.4239755868911743, "learning_rate": 3.614741804156846e-05, "loss": 0.5423, "step": 1303 }, { "epoch": 0.8364336112892881, "grad_norm": 1.106705665588379, "learning_rate": 3.613670452110563e-05, "loss": 0.381, "step": 1304 }, { "epoch": 0.8370750481077613, "grad_norm": 1.3987032175064087, "learning_rate": 3.612599100064281e-05, "loss": 0.5657, "step": 1305 }, { "epoch": 0.8377164849262347, "grad_norm": 1.269360899925232, "learning_rate": 3.611527748017999e-05, "loss": 0.5045, "step": 1306 }, { "epoch": 0.8383579217447081, "grad_norm": 1.304186463356018, "learning_rate": 3.610456395971716e-05, "loss": 0.4178, "step": 1307 }, { "epoch": 0.8389993585631815, "grad_norm": 1.0038466453552246, "learning_rate": 3.609385043925434e-05, "loss": 0.3697, "step": 1308 }, { "epoch": 0.8396407953816549, "grad_norm": 1.217729091644287, "learning_rate": 3.608313691879152e-05, "loss": 0.4148, "step": 1309 }, { "epoch": 0.8402822322001283, "grad_norm": 1.004557728767395, "learning_rate": 3.607242339832869e-05, "loss": 0.3663, "step": 1310 }, { "epoch": 0.8409236690186017, "grad_norm": 1.0046577453613281, "learning_rate": 3.606170987786587e-05, "loss": 0.4012, "step": 1311 }, { "epoch": 0.841565105837075, "grad_norm": 1.0771732330322266, "learning_rate": 3.6050996357403045e-05, "loss": 0.3983, "step": 1312 }, { "epoch": 0.8422065426555484, "grad_norm": 1.2151448726654053, "learning_rate": 3.604028283694022e-05, "loss": 0.4746, "step": 1313 }, { "epoch": 0.8428479794740218, "grad_norm": 1.371724247932434, "learning_rate": 3.6029569316477394e-05, "loss": 0.4567, "step": 1314 }, { "epoch": 0.8434894162924952, "grad_norm": 1.17530357837677, "learning_rate": 3.601885579601457e-05, "loss": 0.4272, "step": 1315 }, { "epoch": 0.8441308531109686, "grad_norm": 1.1833417415618896, "learning_rate": 3.600814227555175e-05, "loss": 0.4287, "step": 1316 }, { "epoch": 0.844772289929442, "grad_norm": 1.2373700141906738, "learning_rate": 3.599742875508892e-05, "loss": 0.4104, "step": 1317 }, { "epoch": 0.8454137267479154, "grad_norm": 1.125473141670227, "learning_rate": 3.59867152346261e-05, "loss": 0.445, "step": 1318 }, { "epoch": 0.8460551635663887, "grad_norm": 1.0092172622680664, "learning_rate": 3.597600171416328e-05, "loss": 0.3453, "step": 1319 }, { "epoch": 0.846696600384862, "grad_norm": 1.2834059000015259, "learning_rate": 3.596528819370045e-05, "loss": 0.4642, "step": 1320 }, { "epoch": 0.8473380372033354, "grad_norm": 1.3266371488571167, "learning_rate": 3.595457467323763e-05, "loss": 0.4918, "step": 1321 }, { "epoch": 0.8479794740218088, "grad_norm": 1.1090730428695679, "learning_rate": 3.5943861152774806e-05, "loss": 0.4335, "step": 1322 }, { "epoch": 0.8486209108402822, "grad_norm": 1.1641976833343506, "learning_rate": 3.5933147632311984e-05, "loss": 0.4603, "step": 1323 }, { "epoch": 0.8492623476587556, "grad_norm": 1.4762920141220093, "learning_rate": 3.5922434111849156e-05, "loss": 0.5789, "step": 1324 }, { "epoch": 0.849903784477229, "grad_norm": 1.1487139463424683, "learning_rate": 3.5911720591386334e-05, "loss": 0.4247, "step": 1325 }, { "epoch": 0.8505452212957024, "grad_norm": 1.3110415935516357, "learning_rate": 3.590100707092351e-05, "loss": 0.5086, "step": 1326 }, { "epoch": 0.8511866581141757, "grad_norm": 1.1736412048339844, "learning_rate": 3.589029355046068e-05, "loss": 0.4121, "step": 1327 }, { "epoch": 0.8518280949326491, "grad_norm": 1.091137170791626, "learning_rate": 3.587958002999786e-05, "loss": 0.4224, "step": 1328 }, { "epoch": 0.8524695317511225, "grad_norm": 1.1411383152008057, "learning_rate": 3.586886650953504e-05, "loss": 0.4352, "step": 1329 }, { "epoch": 0.8531109685695959, "grad_norm": 1.168563961982727, "learning_rate": 3.585815298907221e-05, "loss": 0.4552, "step": 1330 }, { "epoch": 0.8537524053880693, "grad_norm": 1.421905517578125, "learning_rate": 3.584743946860939e-05, "loss": 0.5068, "step": 1331 }, { "epoch": 0.8543938422065427, "grad_norm": 1.5715810060501099, "learning_rate": 3.583672594814656e-05, "loss": 0.5837, "step": 1332 }, { "epoch": 0.8550352790250161, "grad_norm": 1.355263113975525, "learning_rate": 3.582601242768374e-05, "loss": 0.5025, "step": 1333 }, { "epoch": 0.8556767158434894, "grad_norm": 1.3533897399902344, "learning_rate": 3.581529890722091e-05, "loss": 0.4624, "step": 1334 }, { "epoch": 0.8563181526619628, "grad_norm": 1.313255786895752, "learning_rate": 3.580458538675809e-05, "loss": 0.4134, "step": 1335 }, { "epoch": 0.8569595894804362, "grad_norm": 1.1491698026657104, "learning_rate": 3.5793871866295266e-05, "loss": 0.4197, "step": 1336 }, { "epoch": 0.8576010262989096, "grad_norm": 1.4836511611938477, "learning_rate": 3.578315834583244e-05, "loss": 0.5476, "step": 1337 }, { "epoch": 0.858242463117383, "grad_norm": 0.9136806726455688, "learning_rate": 3.5772444825369616e-05, "loss": 0.3811, "step": 1338 }, { "epoch": 0.8588838999358563, "grad_norm": 1.3284391164779663, "learning_rate": 3.5761731304906794e-05, "loss": 0.4917, "step": 1339 }, { "epoch": 0.8595253367543297, "grad_norm": 1.1878312826156616, "learning_rate": 3.575101778444397e-05, "loss": 0.4332, "step": 1340 }, { "epoch": 0.8601667735728031, "grad_norm": 1.056952714920044, "learning_rate": 3.5740304263981143e-05, "loss": 0.4211, "step": 1341 }, { "epoch": 0.8608082103912764, "grad_norm": 1.2794196605682373, "learning_rate": 3.572959074351832e-05, "loss": 0.4861, "step": 1342 }, { "epoch": 0.8614496472097498, "grad_norm": 1.335671305656433, "learning_rate": 3.57188772230555e-05, "loss": 0.5039, "step": 1343 }, { "epoch": 0.8620910840282232, "grad_norm": 1.0212416648864746, "learning_rate": 3.570816370259267e-05, "loss": 0.3636, "step": 1344 }, { "epoch": 0.8627325208466966, "grad_norm": 0.8712825775146484, "learning_rate": 3.569745018212985e-05, "loss": 0.2927, "step": 1345 }, { "epoch": 0.86337395766517, "grad_norm": 1.2823173999786377, "learning_rate": 3.568673666166703e-05, "loss": 0.3885, "step": 1346 }, { "epoch": 0.8640153944836434, "grad_norm": 1.2773710489273071, "learning_rate": 3.56760231412042e-05, "loss": 0.413, "step": 1347 }, { "epoch": 0.8646568313021168, "grad_norm": 1.0762258768081665, "learning_rate": 3.566530962074138e-05, "loss": 0.4266, "step": 1348 }, { "epoch": 0.8652982681205901, "grad_norm": 1.4064565896987915, "learning_rate": 3.5654596100278555e-05, "loss": 0.471, "step": 1349 }, { "epoch": 0.8659397049390635, "grad_norm": 1.2153288125991821, "learning_rate": 3.5643882579815726e-05, "loss": 0.4259, "step": 1350 }, { "epoch": 0.8665811417575369, "grad_norm": 1.280938982963562, "learning_rate": 3.5633169059352905e-05, "loss": 0.4328, "step": 1351 }, { "epoch": 0.8672225785760103, "grad_norm": 1.4693936109542847, "learning_rate": 3.562245553889008e-05, "loss": 0.5969, "step": 1352 }, { "epoch": 0.8678640153944837, "grad_norm": 1.567034125328064, "learning_rate": 3.561174201842726e-05, "loss": 0.5476, "step": 1353 }, { "epoch": 0.868505452212957, "grad_norm": 1.1075841188430786, "learning_rate": 3.560102849796443e-05, "loss": 0.3808, "step": 1354 }, { "epoch": 0.8691468890314304, "grad_norm": 1.3853063583374023, "learning_rate": 3.559031497750161e-05, "loss": 0.5132, "step": 1355 }, { "epoch": 0.8697883258499037, "grad_norm": 1.2431472539901733, "learning_rate": 3.557960145703879e-05, "loss": 0.4417, "step": 1356 }, { "epoch": 0.8704297626683771, "grad_norm": 1.2675364017486572, "learning_rate": 3.556888793657596e-05, "loss": 0.4137, "step": 1357 }, { "epoch": 0.8710711994868505, "grad_norm": 1.0582603216171265, "learning_rate": 3.555817441611314e-05, "loss": 0.3468, "step": 1358 }, { "epoch": 0.8717126363053239, "grad_norm": 1.2529672384262085, "learning_rate": 3.5547460895650316e-05, "loss": 0.3801, "step": 1359 }, { "epoch": 0.8723540731237973, "grad_norm": 1.3560267686843872, "learning_rate": 3.553674737518749e-05, "loss": 0.5041, "step": 1360 }, { "epoch": 0.8729955099422707, "grad_norm": 1.146608829498291, "learning_rate": 3.5526033854724666e-05, "loss": 0.3483, "step": 1361 }, { "epoch": 0.8736369467607441, "grad_norm": 1.16940438747406, "learning_rate": 3.5515320334261844e-05, "loss": 0.4402, "step": 1362 }, { "epoch": 0.8742783835792175, "grad_norm": 1.3136053085327148, "learning_rate": 3.5504606813799015e-05, "loss": 0.5824, "step": 1363 }, { "epoch": 0.8749198203976908, "grad_norm": 1.2033991813659668, "learning_rate": 3.549389329333619e-05, "loss": 0.4097, "step": 1364 }, { "epoch": 0.8755612572161642, "grad_norm": 1.241545557975769, "learning_rate": 3.548317977287337e-05, "loss": 0.457, "step": 1365 }, { "epoch": 0.8762026940346376, "grad_norm": 1.000309705734253, "learning_rate": 3.547246625241054e-05, "loss": 0.3364, "step": 1366 }, { "epoch": 0.876844130853111, "grad_norm": 1.032185435295105, "learning_rate": 3.5461752731947714e-05, "loss": 0.3701, "step": 1367 }, { "epoch": 0.8774855676715844, "grad_norm": 1.2594916820526123, "learning_rate": 3.545103921148489e-05, "loss": 0.4432, "step": 1368 }, { "epoch": 0.8781270044900578, "grad_norm": 1.1493149995803833, "learning_rate": 3.544032569102207e-05, "loss": 0.41, "step": 1369 }, { "epoch": 0.8787684413085312, "grad_norm": 1.3157507181167603, "learning_rate": 3.542961217055925e-05, "loss": 0.4656, "step": 1370 }, { "epoch": 0.8794098781270044, "grad_norm": 1.1358764171600342, "learning_rate": 3.541889865009642e-05, "loss": 0.361, "step": 1371 }, { "epoch": 0.8800513149454778, "grad_norm": 1.3330280780792236, "learning_rate": 3.54081851296336e-05, "loss": 0.4615, "step": 1372 }, { "epoch": 0.8806927517639512, "grad_norm": 1.0165965557098389, "learning_rate": 3.5397471609170776e-05, "loss": 0.3897, "step": 1373 }, { "epoch": 0.8813341885824246, "grad_norm": 1.2654017210006714, "learning_rate": 3.538675808870795e-05, "loss": 0.4579, "step": 1374 }, { "epoch": 0.881975625400898, "grad_norm": 1.2479932308197021, "learning_rate": 3.5376044568245126e-05, "loss": 0.4437, "step": 1375 }, { "epoch": 0.8826170622193714, "grad_norm": 1.4770394563674927, "learning_rate": 3.5365331047782304e-05, "loss": 0.5727, "step": 1376 }, { "epoch": 0.8832584990378448, "grad_norm": 1.079190731048584, "learning_rate": 3.5354617527319475e-05, "loss": 0.3777, "step": 1377 }, { "epoch": 0.8838999358563181, "grad_norm": 1.3013262748718262, "learning_rate": 3.5343904006856654e-05, "loss": 0.4516, "step": 1378 }, { "epoch": 0.8845413726747915, "grad_norm": 1.091715693473816, "learning_rate": 3.533319048639383e-05, "loss": 0.3513, "step": 1379 }, { "epoch": 0.8851828094932649, "grad_norm": 1.3752143383026123, "learning_rate": 3.5322476965931e-05, "loss": 0.5855, "step": 1380 }, { "epoch": 0.8858242463117383, "grad_norm": 1.3995963335037231, "learning_rate": 3.531176344546818e-05, "loss": 0.5289, "step": 1381 }, { "epoch": 0.8864656831302117, "grad_norm": 1.0981346368789673, "learning_rate": 3.530104992500536e-05, "loss": 0.4174, "step": 1382 }, { "epoch": 0.8871071199486851, "grad_norm": 1.188820242881775, "learning_rate": 3.529033640454254e-05, "loss": 0.389, "step": 1383 }, { "epoch": 0.8877485567671585, "grad_norm": 1.1232380867004395, "learning_rate": 3.527962288407971e-05, "loss": 0.354, "step": 1384 }, { "epoch": 0.8883899935856319, "grad_norm": 1.1085604429244995, "learning_rate": 3.526890936361689e-05, "loss": 0.351, "step": 1385 }, { "epoch": 0.8890314304041051, "grad_norm": 1.2353515625, "learning_rate": 3.5258195843154065e-05, "loss": 0.4745, "step": 1386 }, { "epoch": 0.8896728672225785, "grad_norm": 1.3009014129638672, "learning_rate": 3.5247482322691237e-05, "loss": 0.4855, "step": 1387 }, { "epoch": 0.8903143040410519, "grad_norm": 1.3954253196716309, "learning_rate": 3.5236768802228415e-05, "loss": 0.4645, "step": 1388 }, { "epoch": 0.8909557408595253, "grad_norm": 1.3225860595703125, "learning_rate": 3.522605528176559e-05, "loss": 0.5187, "step": 1389 }, { "epoch": 0.8915971776779987, "grad_norm": 1.2177236080169678, "learning_rate": 3.5215341761302764e-05, "loss": 0.4057, "step": 1390 }, { "epoch": 0.8922386144964721, "grad_norm": 1.161612629890442, "learning_rate": 3.520462824083994e-05, "loss": 0.4301, "step": 1391 }, { "epoch": 0.8928800513149455, "grad_norm": 1.1627649068832397, "learning_rate": 3.519391472037712e-05, "loss": 0.3512, "step": 1392 }, { "epoch": 0.8935214881334188, "grad_norm": 1.0093971490859985, "learning_rate": 3.518320119991429e-05, "loss": 0.4232, "step": 1393 }, { "epoch": 0.8941629249518922, "grad_norm": 1.2771202325820923, "learning_rate": 3.517248767945147e-05, "loss": 0.4341, "step": 1394 }, { "epoch": 0.8948043617703656, "grad_norm": 1.350734829902649, "learning_rate": 3.516177415898865e-05, "loss": 0.4666, "step": 1395 }, { "epoch": 0.895445798588839, "grad_norm": 1.6287397146224976, "learning_rate": 3.5151060638525826e-05, "loss": 0.7483, "step": 1396 }, { "epoch": 0.8960872354073124, "grad_norm": 1.1388802528381348, "learning_rate": 3.5140347118063e-05, "loss": 0.4286, "step": 1397 }, { "epoch": 0.8967286722257858, "grad_norm": 0.8768635988235474, "learning_rate": 3.5129633597600176e-05, "loss": 0.3126, "step": 1398 }, { "epoch": 0.8973701090442592, "grad_norm": 1.2920732498168945, "learning_rate": 3.5118920077137354e-05, "loss": 0.4244, "step": 1399 }, { "epoch": 0.8980115458627326, "grad_norm": 1.0402652025222778, "learning_rate": 3.5108206556674525e-05, "loss": 0.3785, "step": 1400 }, { "epoch": 0.8986529826812059, "grad_norm": 1.0762907266616821, "learning_rate": 3.50974930362117e-05, "loss": 0.3599, "step": 1401 }, { "epoch": 0.8992944194996793, "grad_norm": 0.9122511744499207, "learning_rate": 3.5086779515748875e-05, "loss": 0.3181, "step": 1402 }, { "epoch": 0.8999358563181526, "grad_norm": 1.0915861129760742, "learning_rate": 3.507606599528605e-05, "loss": 0.3774, "step": 1403 }, { "epoch": 0.900577293136626, "grad_norm": 1.11159086227417, "learning_rate": 3.5065352474823224e-05, "loss": 0.3962, "step": 1404 }, { "epoch": 0.9012187299550994, "grad_norm": 1.129893183708191, "learning_rate": 3.50546389543604e-05, "loss": 0.444, "step": 1405 }, { "epoch": 0.9018601667735728, "grad_norm": 1.5131349563598633, "learning_rate": 3.504392543389758e-05, "loss": 0.5087, "step": 1406 }, { "epoch": 0.9025016035920462, "grad_norm": 1.1436340808868408, "learning_rate": 3.503321191343475e-05, "loss": 0.3499, "step": 1407 }, { "epoch": 0.9031430404105195, "grad_norm": 1.443771243095398, "learning_rate": 3.502249839297193e-05, "loss": 0.437, "step": 1408 }, { "epoch": 0.9037844772289929, "grad_norm": 1.4456332921981812, "learning_rate": 3.501178487250911e-05, "loss": 0.4732, "step": 1409 }, { "epoch": 0.9044259140474663, "grad_norm": 1.2813087701797485, "learning_rate": 3.500107135204628e-05, "loss": 0.4258, "step": 1410 }, { "epoch": 0.9050673508659397, "grad_norm": 1.2978564500808716, "learning_rate": 3.499035783158346e-05, "loss": 0.4284, "step": 1411 }, { "epoch": 0.9057087876844131, "grad_norm": 1.2177964448928833, "learning_rate": 3.4979644311120636e-05, "loss": 0.4612, "step": 1412 }, { "epoch": 0.9063502245028865, "grad_norm": 1.4674562215805054, "learning_rate": 3.4968930790657814e-05, "loss": 0.4573, "step": 1413 }, { "epoch": 0.9069916613213599, "grad_norm": 1.153295874595642, "learning_rate": 3.4958217270194986e-05, "loss": 0.4428, "step": 1414 }, { "epoch": 0.9076330981398332, "grad_norm": 1.1469570398330688, "learning_rate": 3.4947503749732164e-05, "loss": 0.4046, "step": 1415 }, { "epoch": 0.9082745349583066, "grad_norm": 1.2772934436798096, "learning_rate": 3.493679022926934e-05, "loss": 0.4743, "step": 1416 }, { "epoch": 0.90891597177678, "grad_norm": 1.3536537885665894, "learning_rate": 3.492607670880651e-05, "loss": 0.535, "step": 1417 }, { "epoch": 0.9095574085952534, "grad_norm": 1.1429961919784546, "learning_rate": 3.491536318834369e-05, "loss": 0.389, "step": 1418 }, { "epoch": 0.9101988454137268, "grad_norm": 1.155817985534668, "learning_rate": 3.490464966788087e-05, "loss": 0.4618, "step": 1419 }, { "epoch": 0.9108402822322001, "grad_norm": 1.4844298362731934, "learning_rate": 3.489393614741804e-05, "loss": 0.6629, "step": 1420 }, { "epoch": 0.9114817190506735, "grad_norm": 1.2616537809371948, "learning_rate": 3.488322262695522e-05, "loss": 0.4169, "step": 1421 }, { "epoch": 0.9121231558691469, "grad_norm": 1.4028006792068481, "learning_rate": 3.48725091064924e-05, "loss": 0.5297, "step": 1422 }, { "epoch": 0.9127645926876202, "grad_norm": 1.0676428079605103, "learning_rate": 3.486179558602957e-05, "loss": 0.403, "step": 1423 }, { "epoch": 0.9134060295060936, "grad_norm": 2.969416618347168, "learning_rate": 3.485108206556675e-05, "loss": 0.4672, "step": 1424 }, { "epoch": 0.914047466324567, "grad_norm": 1.074312686920166, "learning_rate": 3.4840368545103925e-05, "loss": 0.3551, "step": 1425 }, { "epoch": 0.9146889031430404, "grad_norm": 1.2340816259384155, "learning_rate": 3.48296550246411e-05, "loss": 0.4561, "step": 1426 }, { "epoch": 0.9153303399615138, "grad_norm": 1.2310305833816528, "learning_rate": 3.4818941504178274e-05, "loss": 0.4989, "step": 1427 }, { "epoch": 0.9159717767799872, "grad_norm": 1.4233126640319824, "learning_rate": 3.480822798371545e-05, "loss": 0.4735, "step": 1428 }, { "epoch": 0.9166132135984606, "grad_norm": 1.2440595626831055, "learning_rate": 3.479751446325263e-05, "loss": 0.419, "step": 1429 }, { "epoch": 0.9172546504169339, "grad_norm": 1.4350582361221313, "learning_rate": 3.47868009427898e-05, "loss": 0.4664, "step": 1430 }, { "epoch": 0.9178960872354073, "grad_norm": 1.449074625968933, "learning_rate": 3.477608742232698e-05, "loss": 0.4413, "step": 1431 }, { "epoch": 0.9185375240538807, "grad_norm": 1.1677863597869873, "learning_rate": 3.476537390186416e-05, "loss": 0.4333, "step": 1432 }, { "epoch": 0.9191789608723541, "grad_norm": 1.2238701581954956, "learning_rate": 3.475466038140133e-05, "loss": 0.4443, "step": 1433 }, { "epoch": 0.9198203976908275, "grad_norm": 1.396475911140442, "learning_rate": 3.474394686093851e-05, "loss": 0.548, "step": 1434 }, { "epoch": 0.9204618345093009, "grad_norm": 1.1883496046066284, "learning_rate": 3.473323334047568e-05, "loss": 0.3934, "step": 1435 }, { "epoch": 0.9211032713277743, "grad_norm": 1.2977975606918335, "learning_rate": 3.472251982001286e-05, "loss": 0.4908, "step": 1436 }, { "epoch": 0.9217447081462476, "grad_norm": 1.3239063024520874, "learning_rate": 3.471180629955003e-05, "loss": 0.4482, "step": 1437 }, { "epoch": 0.9223861449647209, "grad_norm": 1.1045584678649902, "learning_rate": 3.470109277908721e-05, "loss": 0.3716, "step": 1438 }, { "epoch": 0.9230275817831943, "grad_norm": 1.2022833824157715, "learning_rate": 3.4690379258624385e-05, "loss": 0.4563, "step": 1439 }, { "epoch": 0.9236690186016677, "grad_norm": 1.127478003501892, "learning_rate": 3.4679665738161556e-05, "loss": 0.4017, "step": 1440 }, { "epoch": 0.9243104554201411, "grad_norm": 1.5157753229141235, "learning_rate": 3.4668952217698735e-05, "loss": 0.6435, "step": 1441 }, { "epoch": 0.9249518922386145, "grad_norm": 0.985283374786377, "learning_rate": 3.465823869723591e-05, "loss": 0.37, "step": 1442 }, { "epoch": 0.9255933290570879, "grad_norm": 1.0868579149246216, "learning_rate": 3.464752517677309e-05, "loss": 0.3522, "step": 1443 }, { "epoch": 0.9262347658755613, "grad_norm": 1.1324695348739624, "learning_rate": 3.463681165631026e-05, "loss": 0.3861, "step": 1444 }, { "epoch": 0.9268762026940346, "grad_norm": 1.52614164352417, "learning_rate": 3.462609813584744e-05, "loss": 0.5775, "step": 1445 }, { "epoch": 0.927517639512508, "grad_norm": 1.1970676183700562, "learning_rate": 3.461538461538462e-05, "loss": 0.4713, "step": 1446 }, { "epoch": 0.9281590763309814, "grad_norm": 1.1674381494522095, "learning_rate": 3.460467109492179e-05, "loss": 0.4238, "step": 1447 }, { "epoch": 0.9288005131494548, "grad_norm": 1.0862704515457153, "learning_rate": 3.459395757445897e-05, "loss": 0.3729, "step": 1448 }, { "epoch": 0.9294419499679282, "grad_norm": 1.4649041891098022, "learning_rate": 3.4583244053996146e-05, "loss": 0.4767, "step": 1449 }, { "epoch": 0.9300833867864016, "grad_norm": 1.2568931579589844, "learning_rate": 3.457253053353332e-05, "loss": 0.4401, "step": 1450 }, { "epoch": 0.930724823604875, "grad_norm": 1.0767163038253784, "learning_rate": 3.4561817013070496e-05, "loss": 0.3785, "step": 1451 }, { "epoch": 0.9313662604233482, "grad_norm": 1.1949728727340698, "learning_rate": 3.4551103492607674e-05, "loss": 0.4538, "step": 1452 }, { "epoch": 0.9320076972418216, "grad_norm": 1.372469425201416, "learning_rate": 3.454038997214485e-05, "loss": 0.4367, "step": 1453 }, { "epoch": 0.932649134060295, "grad_norm": 1.1734060049057007, "learning_rate": 3.4529676451682023e-05, "loss": 0.4405, "step": 1454 }, { "epoch": 0.9332905708787684, "grad_norm": 1.156927227973938, "learning_rate": 3.45189629312192e-05, "loss": 0.4335, "step": 1455 }, { "epoch": 0.9339320076972418, "grad_norm": 1.1099722385406494, "learning_rate": 3.450824941075638e-05, "loss": 0.3913, "step": 1456 }, { "epoch": 0.9345734445157152, "grad_norm": 1.1643191576004028, "learning_rate": 3.449753589029355e-05, "loss": 0.4226, "step": 1457 }, { "epoch": 0.9352148813341886, "grad_norm": 1.3728033304214478, "learning_rate": 3.448682236983073e-05, "loss": 0.4212, "step": 1458 }, { "epoch": 0.935856318152662, "grad_norm": 1.0322200059890747, "learning_rate": 3.447610884936791e-05, "loss": 0.3992, "step": 1459 }, { "epoch": 0.9364977549711353, "grad_norm": 1.1653929948806763, "learning_rate": 3.446539532890508e-05, "loss": 0.4091, "step": 1460 }, { "epoch": 0.9371391917896087, "grad_norm": 1.1583338975906372, "learning_rate": 3.445468180844226e-05, "loss": 0.4576, "step": 1461 }, { "epoch": 0.9377806286080821, "grad_norm": 1.3023165464401245, "learning_rate": 3.4443968287979435e-05, "loss": 0.3972, "step": 1462 }, { "epoch": 0.9384220654265555, "grad_norm": 1.079431414604187, "learning_rate": 3.4433254767516606e-05, "loss": 0.3817, "step": 1463 }, { "epoch": 0.9390635022450289, "grad_norm": 1.0074185132980347, "learning_rate": 3.4422541247053785e-05, "loss": 0.3251, "step": 1464 }, { "epoch": 0.9397049390635023, "grad_norm": 1.079647421836853, "learning_rate": 3.441182772659096e-05, "loss": 0.4115, "step": 1465 }, { "epoch": 0.9403463758819757, "grad_norm": 0.9395571351051331, "learning_rate": 3.440111420612814e-05, "loss": 0.3129, "step": 1466 }, { "epoch": 0.940987812700449, "grad_norm": 1.2454732656478882, "learning_rate": 3.439040068566531e-05, "loss": 0.4766, "step": 1467 }, { "epoch": 0.9416292495189224, "grad_norm": 1.2707446813583374, "learning_rate": 3.437968716520249e-05, "loss": 0.4361, "step": 1468 }, { "epoch": 0.9422706863373957, "grad_norm": 1.3941679000854492, "learning_rate": 3.436897364473966e-05, "loss": 0.4633, "step": 1469 }, { "epoch": 0.9429121231558691, "grad_norm": 1.3077425956726074, "learning_rate": 3.435826012427683e-05, "loss": 0.4331, "step": 1470 }, { "epoch": 0.9435535599743425, "grad_norm": 1.200730800628662, "learning_rate": 3.434754660381401e-05, "loss": 0.4029, "step": 1471 }, { "epoch": 0.9441949967928159, "grad_norm": 1.1924530267715454, "learning_rate": 3.433683308335119e-05, "loss": 0.458, "step": 1472 }, { "epoch": 0.9448364336112893, "grad_norm": 1.1162751913070679, "learning_rate": 3.432611956288837e-05, "loss": 0.4093, "step": 1473 }, { "epoch": 0.9454778704297627, "grad_norm": 1.3162943124771118, "learning_rate": 3.431540604242554e-05, "loss": 0.4701, "step": 1474 }, { "epoch": 0.946119307248236, "grad_norm": 1.1370679140090942, "learning_rate": 3.430469252196272e-05, "loss": 0.3911, "step": 1475 }, { "epoch": 0.9467607440667094, "grad_norm": 1.0785884857177734, "learning_rate": 3.4293979001499895e-05, "loss": 0.4066, "step": 1476 }, { "epoch": 0.9474021808851828, "grad_norm": 1.3394076824188232, "learning_rate": 3.428326548103707e-05, "loss": 0.5458, "step": 1477 }, { "epoch": 0.9480436177036562, "grad_norm": 1.2105019092559814, "learning_rate": 3.4272551960574245e-05, "loss": 0.4722, "step": 1478 }, { "epoch": 0.9486850545221296, "grad_norm": 1.225311279296875, "learning_rate": 3.426183844011142e-05, "loss": 0.4859, "step": 1479 }, { "epoch": 0.949326491340603, "grad_norm": 1.2117393016815186, "learning_rate": 3.4251124919648594e-05, "loss": 0.525, "step": 1480 }, { "epoch": 0.9499679281590764, "grad_norm": 1.2004717588424683, "learning_rate": 3.424041139918577e-05, "loss": 0.4046, "step": 1481 }, { "epoch": 0.9506093649775497, "grad_norm": 1.2227466106414795, "learning_rate": 3.422969787872295e-05, "loss": 0.4174, "step": 1482 }, { "epoch": 0.9512508017960231, "grad_norm": 1.0807464122772217, "learning_rate": 3.421898435826013e-05, "loss": 0.3949, "step": 1483 }, { "epoch": 0.9518922386144965, "grad_norm": 1.144384503364563, "learning_rate": 3.42082708377973e-05, "loss": 0.4182, "step": 1484 }, { "epoch": 0.9525336754329699, "grad_norm": 1.277060627937317, "learning_rate": 3.419755731733448e-05, "loss": 0.5319, "step": 1485 }, { "epoch": 0.9531751122514432, "grad_norm": 1.0632539987564087, "learning_rate": 3.4186843796871656e-05, "loss": 0.3816, "step": 1486 }, { "epoch": 0.9538165490699166, "grad_norm": 1.077534556388855, "learning_rate": 3.417613027640883e-05, "loss": 0.3463, "step": 1487 }, { "epoch": 0.95445798588839, "grad_norm": 1.1041734218597412, "learning_rate": 3.4165416755946006e-05, "loss": 0.407, "step": 1488 }, { "epoch": 0.9550994227068633, "grad_norm": 1.1637133359909058, "learning_rate": 3.4154703235483184e-05, "loss": 0.4371, "step": 1489 }, { "epoch": 0.9557408595253367, "grad_norm": 1.2822209596633911, "learning_rate": 3.4143989715020356e-05, "loss": 0.4346, "step": 1490 }, { "epoch": 0.9563822963438101, "grad_norm": 1.011036992073059, "learning_rate": 3.4133276194557534e-05, "loss": 0.3726, "step": 1491 }, { "epoch": 0.9570237331622835, "grad_norm": 1.3253637552261353, "learning_rate": 3.412256267409471e-05, "loss": 0.4108, "step": 1492 }, { "epoch": 0.9576651699807569, "grad_norm": 1.0183583498001099, "learning_rate": 3.411184915363188e-05, "loss": 0.3581, "step": 1493 }, { "epoch": 0.9583066067992303, "grad_norm": 1.2252857685089111, "learning_rate": 3.410113563316906e-05, "loss": 0.4651, "step": 1494 }, { "epoch": 0.9589480436177037, "grad_norm": 1.0212311744689941, "learning_rate": 3.409042211270624e-05, "loss": 0.3702, "step": 1495 }, { "epoch": 0.9595894804361771, "grad_norm": 1.4126304388046265, "learning_rate": 3.407970859224342e-05, "loss": 0.5184, "step": 1496 }, { "epoch": 0.9602309172546504, "grad_norm": 1.3240644931793213, "learning_rate": 3.406899507178059e-05, "loss": 0.4891, "step": 1497 }, { "epoch": 0.9608723540731238, "grad_norm": 0.9317913055419922, "learning_rate": 3.405828155131777e-05, "loss": 0.3688, "step": 1498 }, { "epoch": 0.9615137908915972, "grad_norm": 1.2497907876968384, "learning_rate": 3.4047568030854945e-05, "loss": 0.3924, "step": 1499 }, { "epoch": 0.9621552277100706, "grad_norm": 1.1223868131637573, "learning_rate": 3.403685451039212e-05, "loss": 0.4299, "step": 1500 }, { "epoch": 0.962796664528544, "grad_norm": 1.3462743759155273, "learning_rate": 3.4026140989929295e-05, "loss": 0.5251, "step": 1501 }, { "epoch": 0.9634381013470174, "grad_norm": 1.0431034564971924, "learning_rate": 3.401542746946647e-05, "loss": 0.3879, "step": 1502 }, { "epoch": 0.9640795381654907, "grad_norm": 1.2478803396224976, "learning_rate": 3.4004713949003644e-05, "loss": 0.3809, "step": 1503 }, { "epoch": 0.964720974983964, "grad_norm": 1.0675264596939087, "learning_rate": 3.3994000428540816e-05, "loss": 0.3979, "step": 1504 }, { "epoch": 0.9653624118024374, "grad_norm": 1.1529419422149658, "learning_rate": 3.3983286908077994e-05, "loss": 0.4672, "step": 1505 }, { "epoch": 0.9660038486209108, "grad_norm": 1.0833505392074585, "learning_rate": 3.397257338761517e-05, "loss": 0.4371, "step": 1506 }, { "epoch": 0.9666452854393842, "grad_norm": 1.38273024559021, "learning_rate": 3.396185986715234e-05, "loss": 0.5831, "step": 1507 }, { "epoch": 0.9672867222578576, "grad_norm": 1.1137045621871948, "learning_rate": 3.395114634668952e-05, "loss": 0.3997, "step": 1508 }, { "epoch": 0.967928159076331, "grad_norm": 1.2682740688323975, "learning_rate": 3.39404328262267e-05, "loss": 0.4337, "step": 1509 }, { "epoch": 0.9685695958948044, "grad_norm": 1.0573724508285522, "learning_rate": 3.392971930576387e-05, "loss": 0.3477, "step": 1510 }, { "epoch": 0.9692110327132777, "grad_norm": 1.1143664121627808, "learning_rate": 3.391900578530105e-05, "loss": 0.4131, "step": 1511 }, { "epoch": 0.9698524695317511, "grad_norm": 1.1251217126846313, "learning_rate": 3.390829226483823e-05, "loss": 0.403, "step": 1512 }, { "epoch": 0.9704939063502245, "grad_norm": 1.253141164779663, "learning_rate": 3.3897578744375405e-05, "loss": 0.5177, "step": 1513 }, { "epoch": 0.9711353431686979, "grad_norm": 1.26360023021698, "learning_rate": 3.388686522391258e-05, "loss": 0.4369, "step": 1514 }, { "epoch": 0.9717767799871713, "grad_norm": 1.1826064586639404, "learning_rate": 3.3876151703449755e-05, "loss": 0.4404, "step": 1515 }, { "epoch": 0.9724182168056447, "grad_norm": 1.2624759674072266, "learning_rate": 3.386543818298693e-05, "loss": 0.4203, "step": 1516 }, { "epoch": 0.9730596536241181, "grad_norm": 1.0649871826171875, "learning_rate": 3.3854724662524105e-05, "loss": 0.4334, "step": 1517 }, { "epoch": 0.9737010904425915, "grad_norm": 1.014760971069336, "learning_rate": 3.384401114206128e-05, "loss": 0.3615, "step": 1518 }, { "epoch": 0.9743425272610647, "grad_norm": 1.2541155815124512, "learning_rate": 3.383329762159846e-05, "loss": 0.5604, "step": 1519 }, { "epoch": 0.9749839640795381, "grad_norm": 1.201366662979126, "learning_rate": 3.382258410113563e-05, "loss": 0.4173, "step": 1520 }, { "epoch": 0.9756254008980115, "grad_norm": 1.1050529479980469, "learning_rate": 3.381187058067281e-05, "loss": 0.3994, "step": 1521 }, { "epoch": 0.9762668377164849, "grad_norm": 1.1332107782363892, "learning_rate": 3.380115706020999e-05, "loss": 0.3473, "step": 1522 }, { "epoch": 0.9769082745349583, "grad_norm": 1.1018368005752563, "learning_rate": 3.379044353974716e-05, "loss": 0.4527, "step": 1523 }, { "epoch": 0.9775497113534317, "grad_norm": 1.1820083856582642, "learning_rate": 3.377973001928434e-05, "loss": 0.4471, "step": 1524 }, { "epoch": 0.9781911481719051, "grad_norm": 1.1958509683609009, "learning_rate": 3.3769016498821516e-05, "loss": 0.4402, "step": 1525 }, { "epoch": 0.9788325849903784, "grad_norm": 1.122699499130249, "learning_rate": 3.3758302978358694e-05, "loss": 0.4338, "step": 1526 }, { "epoch": 0.9794740218088518, "grad_norm": 1.5076205730438232, "learning_rate": 3.3747589457895866e-05, "loss": 0.6312, "step": 1527 }, { "epoch": 0.9801154586273252, "grad_norm": 0.9571488499641418, "learning_rate": 3.3736875937433044e-05, "loss": 0.3632, "step": 1528 }, { "epoch": 0.9807568954457986, "grad_norm": 1.2851033210754395, "learning_rate": 3.372616241697022e-05, "loss": 0.4328, "step": 1529 }, { "epoch": 0.981398332264272, "grad_norm": 1.155519962310791, "learning_rate": 3.371544889650739e-05, "loss": 0.4426, "step": 1530 }, { "epoch": 0.9820397690827454, "grad_norm": 1.0159082412719727, "learning_rate": 3.370473537604457e-05, "loss": 0.3869, "step": 1531 }, { "epoch": 0.9826812059012188, "grad_norm": 1.4760991334915161, "learning_rate": 3.369402185558175e-05, "loss": 0.4924, "step": 1532 }, { "epoch": 0.9833226427196922, "grad_norm": 1.1233769655227661, "learning_rate": 3.368330833511892e-05, "loss": 0.4065, "step": 1533 }, { "epoch": 0.9839640795381654, "grad_norm": 1.3267955780029297, "learning_rate": 3.36725948146561e-05, "loss": 0.4443, "step": 1534 }, { "epoch": 0.9846055163566388, "grad_norm": 0.913865327835083, "learning_rate": 3.366188129419328e-05, "loss": 0.3457, "step": 1535 }, { "epoch": 0.9852469531751122, "grad_norm": 1.0850825309753418, "learning_rate": 3.365116777373045e-05, "loss": 0.3989, "step": 1536 }, { "epoch": 0.9858883899935856, "grad_norm": 1.7353767156600952, "learning_rate": 3.364045425326763e-05, "loss": 0.4553, "step": 1537 }, { "epoch": 0.986529826812059, "grad_norm": 1.1444684267044067, "learning_rate": 3.36297407328048e-05, "loss": 0.3899, "step": 1538 }, { "epoch": 0.9871712636305324, "grad_norm": 1.3041508197784424, "learning_rate": 3.3619027212341976e-05, "loss": 0.4372, "step": 1539 }, { "epoch": 0.9878127004490058, "grad_norm": 1.1646732091903687, "learning_rate": 3.360831369187915e-05, "loss": 0.4424, "step": 1540 }, { "epoch": 0.9884541372674791, "grad_norm": 1.0738924741744995, "learning_rate": 3.3597600171416326e-05, "loss": 0.3974, "step": 1541 }, { "epoch": 0.9890955740859525, "grad_norm": 1.2688877582550049, "learning_rate": 3.3586886650953504e-05, "loss": 0.4747, "step": 1542 }, { "epoch": 0.9897370109044259, "grad_norm": 1.267547607421875, "learning_rate": 3.357617313049068e-05, "loss": 0.4216, "step": 1543 }, { "epoch": 0.9903784477228993, "grad_norm": 1.2116271257400513, "learning_rate": 3.3565459610027854e-05, "loss": 0.422, "step": 1544 }, { "epoch": 0.9910198845413727, "grad_norm": 1.113978385925293, "learning_rate": 3.355474608956503e-05, "loss": 0.3575, "step": 1545 }, { "epoch": 0.9916613213598461, "grad_norm": 1.2534047365188599, "learning_rate": 3.354403256910221e-05, "loss": 0.4203, "step": 1546 }, { "epoch": 0.9923027581783195, "grad_norm": 1.5745474100112915, "learning_rate": 3.353331904863938e-05, "loss": 0.4666, "step": 1547 }, { "epoch": 0.9929441949967928, "grad_norm": 1.3384294509887695, "learning_rate": 3.352260552817656e-05, "loss": 0.5202, "step": 1548 }, { "epoch": 0.9935856318152662, "grad_norm": 1.2252525091171265, "learning_rate": 3.351189200771374e-05, "loss": 0.5086, "step": 1549 }, { "epoch": 0.9942270686337396, "grad_norm": 1.2064217329025269, "learning_rate": 3.350117848725091e-05, "loss": 0.4524, "step": 1550 }, { "epoch": 0.994868505452213, "grad_norm": 1.2279422283172607, "learning_rate": 3.349046496678809e-05, "loss": 0.4677, "step": 1551 }, { "epoch": 0.9955099422706863, "grad_norm": 0.9597011804580688, "learning_rate": 3.3479751446325265e-05, "loss": 0.3904, "step": 1552 }, { "epoch": 0.9961513790891597, "grad_norm": 1.138494610786438, "learning_rate": 3.3469037925862437e-05, "loss": 0.4424, "step": 1553 }, { "epoch": 0.9967928159076331, "grad_norm": 1.2074379920959473, "learning_rate": 3.3458324405399615e-05, "loss": 0.4963, "step": 1554 }, { "epoch": 0.9974342527261065, "grad_norm": 1.5350494384765625, "learning_rate": 3.344761088493679e-05, "loss": 0.387, "step": 1555 }, { "epoch": 0.9980756895445798, "grad_norm": 0.8643363118171692, "learning_rate": 3.343689736447397e-05, "loss": 0.3416, "step": 1556 }, { "epoch": 0.9987171263630532, "grad_norm": 1.1334692239761353, "learning_rate": 3.342618384401114e-05, "loss": 0.3831, "step": 1557 }, { "epoch": 0.9993585631815266, "grad_norm": 1.0347250699996948, "learning_rate": 3.341547032354832e-05, "loss": 0.3952, "step": 1558 }, { "epoch": 1.0, "grad_norm": 1.1243289709091187, "learning_rate": 3.34047568030855e-05, "loss": 0.4065, "step": 1559 }, { "epoch": 1.0006414368184733, "grad_norm": 0.9909212589263916, "learning_rate": 3.339404328262267e-05, "loss": 0.3526, "step": 1560 }, { "epoch": 1.0012828736369468, "grad_norm": 0.8288801908493042, "learning_rate": 3.338332976215985e-05, "loss": 0.2604, "step": 1561 }, { "epoch": 1.00192431045542, "grad_norm": 1.01459538936615, "learning_rate": 3.3372616241697026e-05, "loss": 0.308, "step": 1562 }, { "epoch": 1.0025657472738936, "grad_norm": 0.8572037220001221, "learning_rate": 3.33619027212342e-05, "loss": 0.3024, "step": 1563 }, { "epoch": 1.0032071840923669, "grad_norm": 0.8682429194450378, "learning_rate": 3.3351189200771376e-05, "loss": 0.2565, "step": 1564 }, { "epoch": 1.0038486209108404, "grad_norm": 0.9437769651412964, "learning_rate": 3.3340475680308554e-05, "loss": 0.3143, "step": 1565 }, { "epoch": 1.0044900577293137, "grad_norm": 1.1304742097854614, "learning_rate": 3.332976215984573e-05, "loss": 0.3104, "step": 1566 }, { "epoch": 1.005131494547787, "grad_norm": 1.1983102560043335, "learning_rate": 3.3319048639382904e-05, "loss": 0.2806, "step": 1567 }, { "epoch": 1.0057729313662604, "grad_norm": 1.2069429159164429, "learning_rate": 3.330833511892008e-05, "loss": 0.2515, "step": 1568 }, { "epoch": 1.0064143681847337, "grad_norm": 0.9942494630813599, "learning_rate": 3.329762159845726e-05, "loss": 0.2348, "step": 1569 }, { "epoch": 1.0070558050032072, "grad_norm": 1.140851616859436, "learning_rate": 3.328690807799443e-05, "loss": 0.2646, "step": 1570 }, { "epoch": 1.0076972418216805, "grad_norm": 1.2082077264785767, "learning_rate": 3.327619455753161e-05, "loss": 0.3106, "step": 1571 }, { "epoch": 1.008338678640154, "grad_norm": 1.165634274482727, "learning_rate": 3.326548103706878e-05, "loss": 0.2728, "step": 1572 }, { "epoch": 1.0089801154586273, "grad_norm": 1.1365700960159302, "learning_rate": 3.325476751660596e-05, "loss": 0.2356, "step": 1573 }, { "epoch": 1.0096215522771006, "grad_norm": 1.21422278881073, "learning_rate": 3.324405399614313e-05, "loss": 0.2686, "step": 1574 }, { "epoch": 1.010262989095574, "grad_norm": 1.2469522953033447, "learning_rate": 3.323334047568031e-05, "loss": 0.2839, "step": 1575 }, { "epoch": 1.0109044259140474, "grad_norm": 1.385231852531433, "learning_rate": 3.3222626955217487e-05, "loss": 0.294, "step": 1576 }, { "epoch": 1.011545862732521, "grad_norm": 1.054368495941162, "learning_rate": 3.321191343475466e-05, "loss": 0.2535, "step": 1577 }, { "epoch": 1.0121872995509942, "grad_norm": 1.3012949228286743, "learning_rate": 3.3201199914291836e-05, "loss": 0.3114, "step": 1578 }, { "epoch": 1.0128287363694677, "grad_norm": 1.0164227485656738, "learning_rate": 3.3190486393829014e-05, "loss": 0.2319, "step": 1579 }, { "epoch": 1.013470173187941, "grad_norm": 1.1687341928482056, "learning_rate": 3.3179772873366186e-05, "loss": 0.2456, "step": 1580 }, { "epoch": 1.0141116100064145, "grad_norm": 1.039780855178833, "learning_rate": 3.3169059352903364e-05, "loss": 0.2352, "step": 1581 }, { "epoch": 1.0147530468248878, "grad_norm": 1.2436814308166504, "learning_rate": 3.315834583244054e-05, "loss": 0.2993, "step": 1582 }, { "epoch": 1.015394483643361, "grad_norm": 1.0947320461273193, "learning_rate": 3.314763231197771e-05, "loss": 0.2447, "step": 1583 }, { "epoch": 1.0160359204618346, "grad_norm": 1.0783756971359253, "learning_rate": 3.313691879151489e-05, "loss": 0.2669, "step": 1584 }, { "epoch": 1.0166773572803078, "grad_norm": 1.3323214054107666, "learning_rate": 3.312620527105207e-05, "loss": 0.2777, "step": 1585 }, { "epoch": 1.0173187940987813, "grad_norm": 1.2719640731811523, "learning_rate": 3.311549175058925e-05, "loss": 0.251, "step": 1586 }, { "epoch": 1.0179602309172546, "grad_norm": 1.19078528881073, "learning_rate": 3.310477823012642e-05, "loss": 0.2755, "step": 1587 }, { "epoch": 1.0186016677357281, "grad_norm": 2.0031490325927734, "learning_rate": 3.30940647096636e-05, "loss": 0.2317, "step": 1588 }, { "epoch": 1.0192431045542014, "grad_norm": 0.9493137001991272, "learning_rate": 3.3083351189200775e-05, "loss": 0.2398, "step": 1589 }, { "epoch": 1.0198845413726747, "grad_norm": 1.2562671899795532, "learning_rate": 3.307263766873795e-05, "loss": 0.2735, "step": 1590 }, { "epoch": 1.0205259781911482, "grad_norm": 1.2003397941589355, "learning_rate": 3.3061924148275125e-05, "loss": 0.2624, "step": 1591 }, { "epoch": 1.0211674150096215, "grad_norm": 1.5910186767578125, "learning_rate": 3.30512106278123e-05, "loss": 0.3891, "step": 1592 }, { "epoch": 1.021808851828095, "grad_norm": 1.1166589260101318, "learning_rate": 3.3040497107349474e-05, "loss": 0.2266, "step": 1593 }, { "epoch": 1.0224502886465683, "grad_norm": 1.2358137369155884, "learning_rate": 3.302978358688665e-05, "loss": 0.2861, "step": 1594 }, { "epoch": 1.0230917254650418, "grad_norm": 1.6489481925964355, "learning_rate": 3.301907006642383e-05, "loss": 0.3094, "step": 1595 }, { "epoch": 1.023733162283515, "grad_norm": 1.304017186164856, "learning_rate": 3.300835654596101e-05, "loss": 0.2666, "step": 1596 }, { "epoch": 1.0243745991019884, "grad_norm": 1.0270297527313232, "learning_rate": 3.299764302549818e-05, "loss": 0.2271, "step": 1597 }, { "epoch": 1.0250160359204619, "grad_norm": 1.2481133937835693, "learning_rate": 3.298692950503536e-05, "loss": 0.3064, "step": 1598 }, { "epoch": 1.0256574727389351, "grad_norm": 1.3509958982467651, "learning_rate": 3.2976215984572537e-05, "loss": 0.2987, "step": 1599 }, { "epoch": 1.0262989095574087, "grad_norm": 1.5971059799194336, "learning_rate": 3.296550246410971e-05, "loss": 0.3603, "step": 1600 }, { "epoch": 1.026940346375882, "grad_norm": 1.3404232263565063, "learning_rate": 3.2954788943646886e-05, "loss": 0.2567, "step": 1601 }, { "epoch": 1.0275817831943554, "grad_norm": 1.197204828262329, "learning_rate": 3.2944075423184064e-05, "loss": 0.2865, "step": 1602 }, { "epoch": 1.0282232200128287, "grad_norm": 1.236595869064331, "learning_rate": 3.2933361902721236e-05, "loss": 0.2904, "step": 1603 }, { "epoch": 1.028864656831302, "grad_norm": 1.2968101501464844, "learning_rate": 3.2922648382258414e-05, "loss": 0.2773, "step": 1604 }, { "epoch": 1.0295060936497755, "grad_norm": 1.0145114660263062, "learning_rate": 3.291193486179559e-05, "loss": 0.2436, "step": 1605 }, { "epoch": 1.0301475304682488, "grad_norm": 1.06461763381958, "learning_rate": 3.290122134133276e-05, "loss": 0.2292, "step": 1606 }, { "epoch": 1.0307889672867223, "grad_norm": 1.0704096555709839, "learning_rate": 3.2890507820869935e-05, "loss": 0.2547, "step": 1607 }, { "epoch": 1.0314304041051956, "grad_norm": 1.081409215927124, "learning_rate": 3.287979430040711e-05, "loss": 0.2197, "step": 1608 }, { "epoch": 1.032071840923669, "grad_norm": 1.284887671470642, "learning_rate": 3.286908077994429e-05, "loss": 0.2587, "step": 1609 }, { "epoch": 1.0327132777421424, "grad_norm": 1.0014721155166626, "learning_rate": 3.285836725948146e-05, "loss": 0.2419, "step": 1610 }, { "epoch": 1.0333547145606157, "grad_norm": 1.189270257949829, "learning_rate": 3.284765373901864e-05, "loss": 0.2632, "step": 1611 }, { "epoch": 1.0339961513790892, "grad_norm": 0.9984138607978821, "learning_rate": 3.283694021855582e-05, "loss": 0.2269, "step": 1612 }, { "epoch": 1.0346375881975625, "grad_norm": 1.4433982372283936, "learning_rate": 3.282622669809299e-05, "loss": 0.2832, "step": 1613 }, { "epoch": 1.035279025016036, "grad_norm": 1.2183836698532104, "learning_rate": 3.281551317763017e-05, "loss": 0.2429, "step": 1614 }, { "epoch": 1.0359204618345093, "grad_norm": 1.3442944288253784, "learning_rate": 3.2804799657167346e-05, "loss": 0.244, "step": 1615 }, { "epoch": 1.0365618986529828, "grad_norm": 1.4222527742385864, "learning_rate": 3.2794086136704524e-05, "loss": 0.2976, "step": 1616 }, { "epoch": 1.037203335471456, "grad_norm": 1.0452775955200195, "learning_rate": 3.2783372616241696e-05, "loss": 0.2454, "step": 1617 }, { "epoch": 1.0378447722899296, "grad_norm": 1.3194725513458252, "learning_rate": 3.2772659095778874e-05, "loss": 0.2944, "step": 1618 }, { "epoch": 1.0384862091084028, "grad_norm": 1.3915255069732666, "learning_rate": 3.276194557531605e-05, "loss": 0.3286, "step": 1619 }, { "epoch": 1.0391276459268761, "grad_norm": 1.1345527172088623, "learning_rate": 3.2751232054853223e-05, "loss": 0.2319, "step": 1620 }, { "epoch": 1.0397690827453496, "grad_norm": 1.199087381362915, "learning_rate": 3.27405185343904e-05, "loss": 0.2337, "step": 1621 }, { "epoch": 1.040410519563823, "grad_norm": 1.1934250593185425, "learning_rate": 3.272980501392758e-05, "loss": 0.2387, "step": 1622 }, { "epoch": 1.0410519563822964, "grad_norm": 1.2609918117523193, "learning_rate": 3.271909149346475e-05, "loss": 0.2855, "step": 1623 }, { "epoch": 1.0416933932007697, "grad_norm": 1.3600924015045166, "learning_rate": 3.270837797300193e-05, "loss": 0.253, "step": 1624 }, { "epoch": 1.0423348300192432, "grad_norm": 1.1617226600646973, "learning_rate": 3.269766445253911e-05, "loss": 0.27, "step": 1625 }, { "epoch": 1.0429762668377165, "grad_norm": 1.153713583946228, "learning_rate": 3.2686950932076286e-05, "loss": 0.2321, "step": 1626 }, { "epoch": 1.0436177036561898, "grad_norm": 1.2336434125900269, "learning_rate": 3.267623741161346e-05, "loss": 0.2621, "step": 1627 }, { "epoch": 1.0442591404746633, "grad_norm": 1.1582156419754028, "learning_rate": 3.2665523891150635e-05, "loss": 0.2461, "step": 1628 }, { "epoch": 1.0449005772931366, "grad_norm": 1.507310390472412, "learning_rate": 3.265481037068781e-05, "loss": 0.3146, "step": 1629 }, { "epoch": 1.04554201411161, "grad_norm": 1.1567755937576294, "learning_rate": 3.2644096850224985e-05, "loss": 0.2629, "step": 1630 }, { "epoch": 1.0461834509300834, "grad_norm": 1.2963244915008545, "learning_rate": 3.263338332976216e-05, "loss": 0.2594, "step": 1631 }, { "epoch": 1.0468248877485569, "grad_norm": 1.2978538274765015, "learning_rate": 3.262266980929934e-05, "loss": 0.3174, "step": 1632 }, { "epoch": 1.0474663245670301, "grad_norm": 1.436224341392517, "learning_rate": 3.261195628883651e-05, "loss": 0.2984, "step": 1633 }, { "epoch": 1.0481077613855034, "grad_norm": 1.2408339977264404, "learning_rate": 3.260124276837369e-05, "loss": 0.2682, "step": 1634 }, { "epoch": 1.048749198203977, "grad_norm": 1.1101409196853638, "learning_rate": 3.259052924791087e-05, "loss": 0.2442, "step": 1635 }, { "epoch": 1.0493906350224502, "grad_norm": 1.031803846359253, "learning_rate": 3.257981572744804e-05, "loss": 0.2622, "step": 1636 }, { "epoch": 1.0500320718409237, "grad_norm": 1.239999771118164, "learning_rate": 3.256910220698522e-05, "loss": 0.2684, "step": 1637 }, { "epoch": 1.050673508659397, "grad_norm": 1.2790424823760986, "learning_rate": 3.2558388686522396e-05, "loss": 0.2823, "step": 1638 }, { "epoch": 1.0513149454778705, "grad_norm": 1.1564692258834839, "learning_rate": 3.2547675166059574e-05, "loss": 0.221, "step": 1639 }, { "epoch": 1.0519563822963438, "grad_norm": 0.9896730780601501, "learning_rate": 3.2536961645596746e-05, "loss": 0.2157, "step": 1640 }, { "epoch": 1.052597819114817, "grad_norm": 1.1452999114990234, "learning_rate": 3.252624812513392e-05, "loss": 0.288, "step": 1641 }, { "epoch": 1.0532392559332906, "grad_norm": 1.108796238899231, "learning_rate": 3.2515534604671095e-05, "loss": 0.2322, "step": 1642 }, { "epoch": 1.0538806927517639, "grad_norm": 0.9585800170898438, "learning_rate": 3.2504821084208273e-05, "loss": 0.226, "step": 1643 }, { "epoch": 1.0545221295702374, "grad_norm": 1.1401550769805908, "learning_rate": 3.2494107563745445e-05, "loss": 0.2478, "step": 1644 }, { "epoch": 1.0551635663887107, "grad_norm": 1.568485975265503, "learning_rate": 3.248339404328262e-05, "loss": 0.3414, "step": 1645 }, { "epoch": 1.0558050032071842, "grad_norm": 1.2202028036117554, "learning_rate": 3.24726805228198e-05, "loss": 0.233, "step": 1646 }, { "epoch": 1.0564464400256575, "grad_norm": 1.0701996088027954, "learning_rate": 3.246196700235697e-05, "loss": 0.214, "step": 1647 }, { "epoch": 1.0570878768441307, "grad_norm": 1.4246587753295898, "learning_rate": 3.245125348189415e-05, "loss": 0.3033, "step": 1648 }, { "epoch": 1.0577293136626043, "grad_norm": 1.3255528211593628, "learning_rate": 3.244053996143133e-05, "loss": 0.2819, "step": 1649 }, { "epoch": 1.0583707504810775, "grad_norm": 1.1544817686080933, "learning_rate": 3.24298264409685e-05, "loss": 0.2565, "step": 1650 }, { "epoch": 1.059012187299551, "grad_norm": 1.1047202348709106, "learning_rate": 3.241911292050568e-05, "loss": 0.2187, "step": 1651 }, { "epoch": 1.0596536241180243, "grad_norm": 1.2759366035461426, "learning_rate": 3.2408399400042856e-05, "loss": 0.2406, "step": 1652 }, { "epoch": 1.0602950609364978, "grad_norm": 1.0989835262298584, "learning_rate": 3.239768587958003e-05, "loss": 0.2406, "step": 1653 }, { "epoch": 1.0609364977549711, "grad_norm": 1.9500775337219238, "learning_rate": 3.2386972359117206e-05, "loss": 0.349, "step": 1654 }, { "epoch": 1.0615779345734446, "grad_norm": 1.2381494045257568, "learning_rate": 3.2376258838654384e-05, "loss": 0.277, "step": 1655 }, { "epoch": 1.062219371391918, "grad_norm": 1.3622938394546509, "learning_rate": 3.236554531819156e-05, "loss": 0.293, "step": 1656 }, { "epoch": 1.0628608082103912, "grad_norm": 1.2148008346557617, "learning_rate": 3.2354831797728734e-05, "loss": 0.2597, "step": 1657 }, { "epoch": 1.0635022450288647, "grad_norm": 1.2621982097625732, "learning_rate": 3.234411827726591e-05, "loss": 0.303, "step": 1658 }, { "epoch": 1.064143681847338, "grad_norm": 1.4254831075668335, "learning_rate": 3.233340475680309e-05, "loss": 0.2799, "step": 1659 }, { "epoch": 1.0647851186658115, "grad_norm": 1.364515781402588, "learning_rate": 3.232269123634026e-05, "loss": 0.275, "step": 1660 }, { "epoch": 1.0654265554842848, "grad_norm": 1.0657709836959839, "learning_rate": 3.231197771587744e-05, "loss": 0.2293, "step": 1661 }, { "epoch": 1.066067992302758, "grad_norm": 1.310761570930481, "learning_rate": 3.230126419541462e-05, "loss": 0.2887, "step": 1662 }, { "epoch": 1.0667094291212316, "grad_norm": 1.235467791557312, "learning_rate": 3.229055067495179e-05, "loss": 0.2982, "step": 1663 }, { "epoch": 1.0673508659397049, "grad_norm": 1.1122605800628662, "learning_rate": 3.227983715448897e-05, "loss": 0.2446, "step": 1664 }, { "epoch": 1.0679923027581784, "grad_norm": 1.3184270858764648, "learning_rate": 3.2269123634026145e-05, "loss": 0.2727, "step": 1665 }, { "epoch": 1.0686337395766516, "grad_norm": 1.2212953567504883, "learning_rate": 3.225841011356332e-05, "loss": 0.2459, "step": 1666 }, { "epoch": 1.0692751763951251, "grad_norm": 0.9483336210250854, "learning_rate": 3.2247696593100495e-05, "loss": 0.2312, "step": 1667 }, { "epoch": 1.0699166132135984, "grad_norm": 1.4409099817276, "learning_rate": 3.223698307263767e-05, "loss": 0.3284, "step": 1668 }, { "epoch": 1.070558050032072, "grad_norm": 1.1478780508041382, "learning_rate": 3.222626955217485e-05, "loss": 0.2786, "step": 1669 }, { "epoch": 1.0711994868505452, "grad_norm": 1.229030728340149, "learning_rate": 3.221555603171202e-05, "loss": 0.2434, "step": 1670 }, { "epoch": 1.0718409236690185, "grad_norm": 1.144455909729004, "learning_rate": 3.22048425112492e-05, "loss": 0.2438, "step": 1671 }, { "epoch": 1.072482360487492, "grad_norm": 1.1226165294647217, "learning_rate": 3.219412899078638e-05, "loss": 0.2782, "step": 1672 }, { "epoch": 1.0731237973059653, "grad_norm": 1.0669699907302856, "learning_rate": 3.218341547032355e-05, "loss": 0.245, "step": 1673 }, { "epoch": 1.0737652341244388, "grad_norm": 1.079188585281372, "learning_rate": 3.217270194986073e-05, "loss": 0.2232, "step": 1674 }, { "epoch": 1.074406670942912, "grad_norm": 1.47645103931427, "learning_rate": 3.21619884293979e-05, "loss": 0.2906, "step": 1675 }, { "epoch": 1.0750481077613856, "grad_norm": 1.5105061531066895, "learning_rate": 3.215127490893508e-05, "loss": 0.3042, "step": 1676 }, { "epoch": 1.0756895445798589, "grad_norm": 1.016976237297058, "learning_rate": 3.214056138847225e-05, "loss": 0.214, "step": 1677 }, { "epoch": 1.0763309813983322, "grad_norm": 1.0850286483764648, "learning_rate": 3.212984786800943e-05, "loss": 0.238, "step": 1678 }, { "epoch": 1.0769724182168057, "grad_norm": 1.1481115818023682, "learning_rate": 3.2119134347546605e-05, "loss": 0.2326, "step": 1679 }, { "epoch": 1.077613855035279, "grad_norm": 1.7233725786209106, "learning_rate": 3.210842082708378e-05, "loss": 0.3099, "step": 1680 }, { "epoch": 1.0782552918537525, "grad_norm": 1.4680899381637573, "learning_rate": 3.2097707306620955e-05, "loss": 0.2937, "step": 1681 }, { "epoch": 1.0788967286722257, "grad_norm": 1.350703477859497, "learning_rate": 3.208699378615813e-05, "loss": 0.2555, "step": 1682 }, { "epoch": 1.0795381654906993, "grad_norm": 1.092850923538208, "learning_rate": 3.2076280265695304e-05, "loss": 0.2282, "step": 1683 }, { "epoch": 1.0801796023091725, "grad_norm": 1.603314995765686, "learning_rate": 3.206556674523248e-05, "loss": 0.2814, "step": 1684 }, { "epoch": 1.0808210391276458, "grad_norm": 1.0002048015594482, "learning_rate": 3.205485322476966e-05, "loss": 0.2308, "step": 1685 }, { "epoch": 1.0814624759461193, "grad_norm": 1.256288766860962, "learning_rate": 3.204413970430684e-05, "loss": 0.2624, "step": 1686 }, { "epoch": 1.0821039127645926, "grad_norm": 1.5281333923339844, "learning_rate": 3.203342618384401e-05, "loss": 0.3035, "step": 1687 }, { "epoch": 1.0827453495830661, "grad_norm": 1.6815483570098877, "learning_rate": 3.202271266338119e-05, "loss": 0.3304, "step": 1688 }, { "epoch": 1.0833867864015394, "grad_norm": 1.6212788820266724, "learning_rate": 3.201199914291837e-05, "loss": 0.3554, "step": 1689 }, { "epoch": 1.084028223220013, "grad_norm": 1.129518747329712, "learning_rate": 3.200128562245554e-05, "loss": 0.2721, "step": 1690 }, { "epoch": 1.0846696600384862, "grad_norm": 0.8708721995353699, "learning_rate": 3.1990572101992716e-05, "loss": 0.227, "step": 1691 }, { "epoch": 1.0853110968569597, "grad_norm": 1.1681898832321167, "learning_rate": 3.1979858581529894e-05, "loss": 0.2559, "step": 1692 }, { "epoch": 1.085952533675433, "grad_norm": 1.3394943475723267, "learning_rate": 3.1969145061067066e-05, "loss": 0.2837, "step": 1693 }, { "epoch": 1.0865939704939063, "grad_norm": 1.4903208017349243, "learning_rate": 3.1958431540604244e-05, "loss": 0.3807, "step": 1694 }, { "epoch": 1.0872354073123798, "grad_norm": 1.1772823333740234, "learning_rate": 3.194771802014142e-05, "loss": 0.263, "step": 1695 }, { "epoch": 1.087876844130853, "grad_norm": 1.3219623565673828, "learning_rate": 3.193700449967859e-05, "loss": 0.3043, "step": 1696 }, { "epoch": 1.0885182809493266, "grad_norm": 1.1636810302734375, "learning_rate": 3.192629097921577e-05, "loss": 0.2545, "step": 1697 }, { "epoch": 1.0891597177677999, "grad_norm": 1.2887579202651978, "learning_rate": 3.191557745875295e-05, "loss": 0.2862, "step": 1698 }, { "epoch": 1.0898011545862731, "grad_norm": 0.9161859750747681, "learning_rate": 3.190486393829013e-05, "loss": 0.236, "step": 1699 }, { "epoch": 1.0904425914047466, "grad_norm": 0.9985333681106567, "learning_rate": 3.18941504178273e-05, "loss": 0.2282, "step": 1700 }, { "epoch": 1.09108402822322, "grad_norm": 1.1627607345581055, "learning_rate": 3.188343689736448e-05, "loss": 0.2604, "step": 1701 }, { "epoch": 1.0917254650416934, "grad_norm": 1.2538411617279053, "learning_rate": 3.1872723376901655e-05, "loss": 0.2577, "step": 1702 }, { "epoch": 1.0923669018601667, "grad_norm": 1.4644441604614258, "learning_rate": 3.186200985643883e-05, "loss": 0.3283, "step": 1703 }, { "epoch": 1.0930083386786402, "grad_norm": 1.0916913747787476, "learning_rate": 3.1851296335976005e-05, "loss": 0.2405, "step": 1704 }, { "epoch": 1.0936497754971135, "grad_norm": 1.506492257118225, "learning_rate": 3.184058281551318e-05, "loss": 0.3184, "step": 1705 }, { "epoch": 1.094291212315587, "grad_norm": 1.3376269340515137, "learning_rate": 3.1829869295050354e-05, "loss": 0.294, "step": 1706 }, { "epoch": 1.0949326491340603, "grad_norm": 1.5132657289505005, "learning_rate": 3.181915577458753e-05, "loss": 0.2964, "step": 1707 }, { "epoch": 1.0955740859525336, "grad_norm": 1.357140064239502, "learning_rate": 3.180844225412471e-05, "loss": 0.2739, "step": 1708 }, { "epoch": 1.096215522771007, "grad_norm": 1.1863962411880493, "learning_rate": 3.179772873366188e-05, "loss": 0.2363, "step": 1709 }, { "epoch": 1.0968569595894804, "grad_norm": 1.1980233192443848, "learning_rate": 3.1787015213199054e-05, "loss": 0.2767, "step": 1710 }, { "epoch": 1.0974983964079539, "grad_norm": 1.0616666078567505, "learning_rate": 3.177630169273623e-05, "loss": 0.2222, "step": 1711 }, { "epoch": 1.0981398332264272, "grad_norm": 1.169255018234253, "learning_rate": 3.176558817227341e-05, "loss": 0.2521, "step": 1712 }, { "epoch": 1.0987812700449007, "grad_norm": 1.2913552522659302, "learning_rate": 3.175487465181058e-05, "loss": 0.2721, "step": 1713 }, { "epoch": 1.099422706863374, "grad_norm": 1.5852444171905518, "learning_rate": 3.174416113134776e-05, "loss": 0.3251, "step": 1714 }, { "epoch": 1.1000641436818472, "grad_norm": 1.2010270357131958, "learning_rate": 3.173344761088494e-05, "loss": 0.2804, "step": 1715 }, { "epoch": 1.1007055805003207, "grad_norm": 1.1181026697158813, "learning_rate": 3.1722734090422116e-05, "loss": 0.2344, "step": 1716 }, { "epoch": 1.101347017318794, "grad_norm": 1.4707783460617065, "learning_rate": 3.171202056995929e-05, "loss": 0.3182, "step": 1717 }, { "epoch": 1.1019884541372675, "grad_norm": 1.328292727470398, "learning_rate": 3.1701307049496465e-05, "loss": 0.2998, "step": 1718 }, { "epoch": 1.1026298909557408, "grad_norm": 1.2194278240203857, "learning_rate": 3.169059352903364e-05, "loss": 0.2443, "step": 1719 }, { "epoch": 1.1032713277742143, "grad_norm": 1.0357102155685425, "learning_rate": 3.1679880008570815e-05, "loss": 0.2453, "step": 1720 }, { "epoch": 1.1039127645926876, "grad_norm": 1.2554672956466675, "learning_rate": 3.166916648810799e-05, "loss": 0.254, "step": 1721 }, { "epoch": 1.104554201411161, "grad_norm": 1.1022683382034302, "learning_rate": 3.165845296764517e-05, "loss": 0.2465, "step": 1722 }, { "epoch": 1.1051956382296344, "grad_norm": 1.2427548170089722, "learning_rate": 3.164773944718234e-05, "loss": 0.2785, "step": 1723 }, { "epoch": 1.1058370750481077, "grad_norm": 1.138269305229187, "learning_rate": 3.163702592671952e-05, "loss": 0.2349, "step": 1724 }, { "epoch": 1.1064785118665812, "grad_norm": 1.2576749324798584, "learning_rate": 3.16263124062567e-05, "loss": 0.276, "step": 1725 }, { "epoch": 1.1071199486850545, "grad_norm": 1.0822328329086304, "learning_rate": 3.161559888579388e-05, "loss": 0.2485, "step": 1726 }, { "epoch": 1.107761385503528, "grad_norm": 1.3351306915283203, "learning_rate": 3.160488536533105e-05, "loss": 0.2739, "step": 1727 }, { "epoch": 1.1084028223220013, "grad_norm": 1.3313041925430298, "learning_rate": 3.1594171844868226e-05, "loss": 0.2944, "step": 1728 }, { "epoch": 1.1090442591404748, "grad_norm": 1.3250852823257446, "learning_rate": 3.1583458324405404e-05, "loss": 0.2532, "step": 1729 }, { "epoch": 1.109685695958948, "grad_norm": 1.4998754262924194, "learning_rate": 3.1572744803942576e-05, "loss": 0.2605, "step": 1730 }, { "epoch": 1.1103271327774213, "grad_norm": 1.395700454711914, "learning_rate": 3.1562031283479754e-05, "loss": 0.314, "step": 1731 }, { "epoch": 1.1109685695958949, "grad_norm": 1.2873538732528687, "learning_rate": 3.155131776301693e-05, "loss": 0.2305, "step": 1732 }, { "epoch": 1.1116100064143681, "grad_norm": 1.1695716381072998, "learning_rate": 3.1540604242554104e-05, "loss": 0.2637, "step": 1733 }, { "epoch": 1.1122514432328416, "grad_norm": 1.685302972793579, "learning_rate": 3.152989072209128e-05, "loss": 0.3827, "step": 1734 }, { "epoch": 1.112892880051315, "grad_norm": 0.9876381754875183, "learning_rate": 3.151917720162846e-05, "loss": 0.2259, "step": 1735 }, { "epoch": 1.1135343168697882, "grad_norm": 1.0688424110412598, "learning_rate": 3.150846368116563e-05, "loss": 0.2467, "step": 1736 }, { "epoch": 1.1141757536882617, "grad_norm": 1.3278477191925049, "learning_rate": 3.149775016070281e-05, "loss": 0.3231, "step": 1737 }, { "epoch": 1.114817190506735, "grad_norm": 1.1092079877853394, "learning_rate": 3.148703664023999e-05, "loss": 0.2574, "step": 1738 }, { "epoch": 1.1154586273252085, "grad_norm": 0.9009851813316345, "learning_rate": 3.1476323119777166e-05, "loss": 0.2298, "step": 1739 }, { "epoch": 1.1161000641436818, "grad_norm": 1.5371452569961548, "learning_rate": 3.146560959931434e-05, "loss": 0.4185, "step": 1740 }, { "epoch": 1.1167415009621553, "grad_norm": 1.3545371294021606, "learning_rate": 3.1454896078851515e-05, "loss": 0.2585, "step": 1741 }, { "epoch": 1.1173829377806286, "grad_norm": 1.2213901281356812, "learning_rate": 3.144418255838869e-05, "loss": 0.2646, "step": 1742 }, { "epoch": 1.118024374599102, "grad_norm": 1.016082763671875, "learning_rate": 3.143346903792586e-05, "loss": 0.2193, "step": 1743 }, { "epoch": 1.1186658114175754, "grad_norm": 1.2312854528427124, "learning_rate": 3.1422755517463036e-05, "loss": 0.2725, "step": 1744 }, { "epoch": 1.1193072482360487, "grad_norm": 1.1475516557693481, "learning_rate": 3.1412041997000214e-05, "loss": 0.2302, "step": 1745 }, { "epoch": 1.1199486850545222, "grad_norm": 1.1982461214065552, "learning_rate": 3.140132847653739e-05, "loss": 0.2943, "step": 1746 }, { "epoch": 1.1205901218729954, "grad_norm": 1.0476558208465576, "learning_rate": 3.1390614956074564e-05, "loss": 0.2501, "step": 1747 }, { "epoch": 1.121231558691469, "grad_norm": 1.312286615371704, "learning_rate": 3.137990143561174e-05, "loss": 0.2704, "step": 1748 }, { "epoch": 1.1218729955099422, "grad_norm": 1.2284574508666992, "learning_rate": 3.136918791514892e-05, "loss": 0.2962, "step": 1749 }, { "epoch": 1.1225144323284157, "grad_norm": 1.2323616743087769, "learning_rate": 3.135847439468609e-05, "loss": 0.2236, "step": 1750 }, { "epoch": 1.123155869146889, "grad_norm": 1.2112178802490234, "learning_rate": 3.134776087422327e-05, "loss": 0.2564, "step": 1751 }, { "epoch": 1.1237973059653623, "grad_norm": 1.1948295831680298, "learning_rate": 3.133704735376045e-05, "loss": 0.2482, "step": 1752 }, { "epoch": 1.1244387427838358, "grad_norm": 1.1523715257644653, "learning_rate": 3.132633383329762e-05, "loss": 0.2509, "step": 1753 }, { "epoch": 1.125080179602309, "grad_norm": 1.9786096811294556, "learning_rate": 3.13156203128348e-05, "loss": 0.3806, "step": 1754 }, { "epoch": 1.1257216164207826, "grad_norm": 1.2104295492172241, "learning_rate": 3.1304906792371975e-05, "loss": 0.239, "step": 1755 }, { "epoch": 1.126363053239256, "grad_norm": 1.212213158607483, "learning_rate": 3.1294193271909153e-05, "loss": 0.2187, "step": 1756 }, { "epoch": 1.1270044900577294, "grad_norm": 0.932525634765625, "learning_rate": 3.1283479751446325e-05, "loss": 0.2281, "step": 1757 }, { "epoch": 1.1276459268762027, "grad_norm": 1.0874391794204712, "learning_rate": 3.12727662309835e-05, "loss": 0.2411, "step": 1758 }, { "epoch": 1.128287363694676, "grad_norm": 1.3953773975372314, "learning_rate": 3.126205271052068e-05, "loss": 0.3184, "step": 1759 }, { "epoch": 1.1289288005131495, "grad_norm": 1.2713137865066528, "learning_rate": 3.125133919005785e-05, "loss": 0.2605, "step": 1760 }, { "epoch": 1.1295702373316228, "grad_norm": 1.262933611869812, "learning_rate": 3.124062566959503e-05, "loss": 0.2344, "step": 1761 }, { "epoch": 1.1302116741500963, "grad_norm": 1.1996710300445557, "learning_rate": 3.122991214913221e-05, "loss": 0.2643, "step": 1762 }, { "epoch": 1.1308531109685696, "grad_norm": 1.248801827430725, "learning_rate": 3.121919862866938e-05, "loss": 0.2742, "step": 1763 }, { "epoch": 1.131494547787043, "grad_norm": 1.390032410621643, "learning_rate": 3.120848510820656e-05, "loss": 0.3228, "step": 1764 }, { "epoch": 1.1321359846055163, "grad_norm": 1.2225689888000488, "learning_rate": 3.1197771587743737e-05, "loss": 0.2384, "step": 1765 }, { "epoch": 1.1327774214239898, "grad_norm": 1.1680916547775269, "learning_rate": 3.118705806728091e-05, "loss": 0.2539, "step": 1766 }, { "epoch": 1.1334188582424631, "grad_norm": 1.0090447664260864, "learning_rate": 3.1176344546818086e-05, "loss": 0.2329, "step": 1767 }, { "epoch": 1.1340602950609364, "grad_norm": 1.3549940586090088, "learning_rate": 3.1165631026355264e-05, "loss": 0.3116, "step": 1768 }, { "epoch": 1.13470173187941, "grad_norm": 1.1987990140914917, "learning_rate": 3.115491750589244e-05, "loss": 0.2472, "step": 1769 }, { "epoch": 1.1353431686978832, "grad_norm": 1.3087477684020996, "learning_rate": 3.1144203985429614e-05, "loss": 0.2774, "step": 1770 }, { "epoch": 1.1359846055163567, "grad_norm": 1.252406358718872, "learning_rate": 3.113349046496679e-05, "loss": 0.262, "step": 1771 }, { "epoch": 1.13662604233483, "grad_norm": 1.167203426361084, "learning_rate": 3.112277694450397e-05, "loss": 0.2719, "step": 1772 }, { "epoch": 1.1372674791533033, "grad_norm": 1.0668740272521973, "learning_rate": 3.111206342404114e-05, "loss": 0.2195, "step": 1773 }, { "epoch": 1.1379089159717768, "grad_norm": 1.5704457759857178, "learning_rate": 3.110134990357832e-05, "loss": 0.3453, "step": 1774 }, { "epoch": 1.13855035279025, "grad_norm": 1.2317373752593994, "learning_rate": 3.10906363831155e-05, "loss": 0.2362, "step": 1775 }, { "epoch": 1.1391917896087236, "grad_norm": 1.3047735691070557, "learning_rate": 3.107992286265267e-05, "loss": 0.2878, "step": 1776 }, { "epoch": 1.1398332264271969, "grad_norm": 1.1734083890914917, "learning_rate": 3.106920934218985e-05, "loss": 0.2311, "step": 1777 }, { "epoch": 1.1404746632456704, "grad_norm": 1.3577936887741089, "learning_rate": 3.105849582172702e-05, "loss": 0.2998, "step": 1778 }, { "epoch": 1.1411161000641437, "grad_norm": 1.0302964448928833, "learning_rate": 3.10477823012642e-05, "loss": 0.2298, "step": 1779 }, { "epoch": 1.1417575368826172, "grad_norm": 1.2786140441894531, "learning_rate": 3.103706878080137e-05, "loss": 0.2409, "step": 1780 }, { "epoch": 1.1423989737010904, "grad_norm": 1.3907922506332397, "learning_rate": 3.1026355260338546e-05, "loss": 0.3, "step": 1781 }, { "epoch": 1.1430404105195637, "grad_norm": 1.307353138923645, "learning_rate": 3.1015641739875724e-05, "loss": 0.2668, "step": 1782 }, { "epoch": 1.1436818473380372, "grad_norm": 1.3313788175582886, "learning_rate": 3.1004928219412896e-05, "loss": 0.2792, "step": 1783 }, { "epoch": 1.1443232841565105, "grad_norm": 1.3196295499801636, "learning_rate": 3.0994214698950074e-05, "loss": 0.2321, "step": 1784 }, { "epoch": 1.144964720974984, "grad_norm": 1.1636484861373901, "learning_rate": 3.098350117848725e-05, "loss": 0.2565, "step": 1785 }, { "epoch": 1.1456061577934573, "grad_norm": 1.3464003801345825, "learning_rate": 3.097278765802443e-05, "loss": 0.285, "step": 1786 }, { "epoch": 1.1462475946119308, "grad_norm": 1.3024969100952148, "learning_rate": 3.09620741375616e-05, "loss": 0.2771, "step": 1787 }, { "epoch": 1.146889031430404, "grad_norm": 1.523419976234436, "learning_rate": 3.095136061709878e-05, "loss": 0.3062, "step": 1788 }, { "epoch": 1.1475304682488774, "grad_norm": 1.4900380373001099, "learning_rate": 3.094064709663596e-05, "loss": 0.3263, "step": 1789 }, { "epoch": 1.148171905067351, "grad_norm": 1.1574034690856934, "learning_rate": 3.092993357617313e-05, "loss": 0.274, "step": 1790 }, { "epoch": 1.1488133418858242, "grad_norm": 1.0678225755691528, "learning_rate": 3.091922005571031e-05, "loss": 0.2184, "step": 1791 }, { "epoch": 1.1494547787042977, "grad_norm": 1.3314555883407593, "learning_rate": 3.0908506535247486e-05, "loss": 0.2769, "step": 1792 }, { "epoch": 1.150096215522771, "grad_norm": 1.435508370399475, "learning_rate": 3.089779301478466e-05, "loss": 0.283, "step": 1793 }, { "epoch": 1.1507376523412445, "grad_norm": 1.5291543006896973, "learning_rate": 3.0887079494321835e-05, "loss": 0.3306, "step": 1794 }, { "epoch": 1.1513790891597178, "grad_norm": 1.3923008441925049, "learning_rate": 3.087636597385901e-05, "loss": 0.3263, "step": 1795 }, { "epoch": 1.152020525978191, "grad_norm": 1.0011439323425293, "learning_rate": 3.0865652453396185e-05, "loss": 0.2317, "step": 1796 }, { "epoch": 1.1526619627966646, "grad_norm": 1.5026017427444458, "learning_rate": 3.085493893293336e-05, "loss": 0.2754, "step": 1797 }, { "epoch": 1.1533033996151378, "grad_norm": 1.1389257907867432, "learning_rate": 3.084422541247054e-05, "loss": 0.2424, "step": 1798 }, { "epoch": 1.1539448364336113, "grad_norm": 1.1351308822631836, "learning_rate": 3.083351189200772e-05, "loss": 0.248, "step": 1799 }, { "epoch": 1.1545862732520846, "grad_norm": 1.035607099533081, "learning_rate": 3.082279837154489e-05, "loss": 0.252, "step": 1800 }, { "epoch": 1.1552277100705581, "grad_norm": 1.4505589008331299, "learning_rate": 3.081208485108207e-05, "loss": 0.2978, "step": 1801 }, { "epoch": 1.1558691468890314, "grad_norm": 1.4244664907455444, "learning_rate": 3.080137133061925e-05, "loss": 0.2435, "step": 1802 }, { "epoch": 1.156510583707505, "grad_norm": 0.8343948125839233, "learning_rate": 3.079065781015642e-05, "loss": 0.2097, "step": 1803 }, { "epoch": 1.1571520205259782, "grad_norm": 1.3261137008666992, "learning_rate": 3.0779944289693596e-05, "loss": 0.2642, "step": 1804 }, { "epoch": 1.1577934573444515, "grad_norm": 1.324812889099121, "learning_rate": 3.0769230769230774e-05, "loss": 0.2613, "step": 1805 }, { "epoch": 1.158434894162925, "grad_norm": 1.249629259109497, "learning_rate": 3.0758517248767946e-05, "loss": 0.277, "step": 1806 }, { "epoch": 1.1590763309813983, "grad_norm": 1.406079888343811, "learning_rate": 3.0747803728305124e-05, "loss": 0.2636, "step": 1807 }, { "epoch": 1.1597177677998718, "grad_norm": 1.4343608617782593, "learning_rate": 3.07370902078423e-05, "loss": 0.2664, "step": 1808 }, { "epoch": 1.160359204618345, "grad_norm": 1.5167814493179321, "learning_rate": 3.0726376687379473e-05, "loss": 0.3003, "step": 1809 }, { "epoch": 1.1610006414368184, "grad_norm": 1.3708610534667969, "learning_rate": 3.071566316691665e-05, "loss": 0.2286, "step": 1810 }, { "epoch": 1.1616420782552919, "grad_norm": 1.500029444694519, "learning_rate": 3.070494964645383e-05, "loss": 0.267, "step": 1811 }, { "epoch": 1.1622835150737651, "grad_norm": 1.3078869581222534, "learning_rate": 3.0694236125991e-05, "loss": 0.2695, "step": 1812 }, { "epoch": 1.1629249518922387, "grad_norm": 1.4466824531555176, "learning_rate": 3.068352260552817e-05, "loss": 0.2574, "step": 1813 }, { "epoch": 1.163566388710712, "grad_norm": 1.721961498260498, "learning_rate": 3.067280908506535e-05, "loss": 0.3235, "step": 1814 }, { "epoch": 1.1642078255291854, "grad_norm": 1.5999109745025635, "learning_rate": 3.066209556460253e-05, "loss": 0.318, "step": 1815 }, { "epoch": 1.1648492623476587, "grad_norm": 1.0447837114334106, "learning_rate": 3.065138204413971e-05, "loss": 0.2347, "step": 1816 }, { "epoch": 1.1654906991661322, "grad_norm": 1.347228765487671, "learning_rate": 3.064066852367688e-05, "loss": 0.2344, "step": 1817 }, { "epoch": 1.1661321359846055, "grad_norm": 1.1254432201385498, "learning_rate": 3.0629955003214056e-05, "loss": 0.2267, "step": 1818 }, { "epoch": 1.1667735728030788, "grad_norm": 0.9930309653282166, "learning_rate": 3.0619241482751235e-05, "loss": 0.2157, "step": 1819 }, { "epoch": 1.1674150096215523, "grad_norm": 1.320136547088623, "learning_rate": 3.0608527962288406e-05, "loss": 0.2835, "step": 1820 }, { "epoch": 1.1680564464400256, "grad_norm": 1.1864675283432007, "learning_rate": 3.0597814441825584e-05, "loss": 0.2446, "step": 1821 }, { "epoch": 1.168697883258499, "grad_norm": 1.1406612396240234, "learning_rate": 3.058710092136276e-05, "loss": 0.2406, "step": 1822 }, { "epoch": 1.1693393200769724, "grad_norm": 1.4162524938583374, "learning_rate": 3.0576387400899934e-05, "loss": 0.2586, "step": 1823 }, { "epoch": 1.169980756895446, "grad_norm": 1.7006795406341553, "learning_rate": 3.056567388043711e-05, "loss": 0.3672, "step": 1824 }, { "epoch": 1.1706221937139192, "grad_norm": 1.144527792930603, "learning_rate": 3.055496035997429e-05, "loss": 0.2355, "step": 1825 }, { "epoch": 1.1712636305323925, "grad_norm": 1.3949517011642456, "learning_rate": 3.054424683951146e-05, "loss": 0.2841, "step": 1826 }, { "epoch": 1.171905067350866, "grad_norm": 1.2356165647506714, "learning_rate": 3.053353331904864e-05, "loss": 0.2183, "step": 1827 }, { "epoch": 1.1725465041693393, "grad_norm": 1.024776816368103, "learning_rate": 3.052281979858582e-05, "loss": 0.2308, "step": 1828 }, { "epoch": 1.1731879409878128, "grad_norm": 1.227466106414795, "learning_rate": 3.0512106278122992e-05, "loss": 0.25, "step": 1829 }, { "epoch": 1.173829377806286, "grad_norm": 1.0365229845046997, "learning_rate": 3.050139275766017e-05, "loss": 0.2231, "step": 1830 }, { "epoch": 1.1744708146247596, "grad_norm": 1.5035008192062378, "learning_rate": 3.0490679237197345e-05, "loss": 0.2694, "step": 1831 }, { "epoch": 1.1751122514432328, "grad_norm": 1.4965932369232178, "learning_rate": 3.047996571673452e-05, "loss": 0.3128, "step": 1832 }, { "epoch": 1.1757536882617061, "grad_norm": 1.176712989807129, "learning_rate": 3.0469252196271698e-05, "loss": 0.2433, "step": 1833 }, { "epoch": 1.1763951250801796, "grad_norm": 1.4075161218643188, "learning_rate": 3.0458538675808873e-05, "loss": 0.3026, "step": 1834 }, { "epoch": 1.177036561898653, "grad_norm": 1.8542299270629883, "learning_rate": 3.0447825155346048e-05, "loss": 0.4017, "step": 1835 }, { "epoch": 1.1776779987171264, "grad_norm": 1.027180552482605, "learning_rate": 3.0437111634883226e-05, "loss": 0.2388, "step": 1836 }, { "epoch": 1.1783194355355997, "grad_norm": 1.2693336009979248, "learning_rate": 3.04263981144204e-05, "loss": 0.265, "step": 1837 }, { "epoch": 1.1789608723540732, "grad_norm": 1.2850329875946045, "learning_rate": 3.041568459395758e-05, "loss": 0.2678, "step": 1838 }, { "epoch": 1.1796023091725465, "grad_norm": 1.4167850017547607, "learning_rate": 3.0404971073494753e-05, "loss": 0.3188, "step": 1839 }, { "epoch": 1.18024374599102, "grad_norm": 1.7082457542419434, "learning_rate": 3.0394257553031928e-05, "loss": 0.3906, "step": 1840 }, { "epoch": 1.1808851828094933, "grad_norm": 1.4058476686477661, "learning_rate": 3.0383544032569106e-05, "loss": 0.2939, "step": 1841 }, { "epoch": 1.1815266196279666, "grad_norm": 1.2596383094787598, "learning_rate": 3.037283051210628e-05, "loss": 0.2502, "step": 1842 }, { "epoch": 1.18216805644644, "grad_norm": 1.0605247020721436, "learning_rate": 3.036211699164346e-05, "loss": 0.2199, "step": 1843 }, { "epoch": 1.1828094932649134, "grad_norm": 1.5231672525405884, "learning_rate": 3.0351403471180634e-05, "loss": 0.2898, "step": 1844 }, { "epoch": 1.1834509300833869, "grad_norm": 1.3335903882980347, "learning_rate": 3.034068995071781e-05, "loss": 0.2941, "step": 1845 }, { "epoch": 1.1840923669018601, "grad_norm": 1.1310360431671143, "learning_rate": 3.032997643025498e-05, "loss": 0.237, "step": 1846 }, { "epoch": 1.1847338037203334, "grad_norm": 1.2280181646347046, "learning_rate": 3.0319262909792155e-05, "loss": 0.2856, "step": 1847 }, { "epoch": 1.185375240538807, "grad_norm": 1.2742186784744263, "learning_rate": 3.0308549389329333e-05, "loss": 0.2744, "step": 1848 }, { "epoch": 1.1860166773572802, "grad_norm": 1.5922244787216187, "learning_rate": 3.0297835868866508e-05, "loss": 0.3563, "step": 1849 }, { "epoch": 1.1866581141757537, "grad_norm": 1.144774317741394, "learning_rate": 3.0287122348403686e-05, "loss": 0.2675, "step": 1850 }, { "epoch": 1.187299550994227, "grad_norm": 1.2558132410049438, "learning_rate": 3.027640882794086e-05, "loss": 0.2627, "step": 1851 }, { "epoch": 1.1879409878127005, "grad_norm": 1.0611011981964111, "learning_rate": 3.0265695307478036e-05, "loss": 0.233, "step": 1852 }, { "epoch": 1.1885824246311738, "grad_norm": 1.1282334327697754, "learning_rate": 3.0254981787015214e-05, "loss": 0.2414, "step": 1853 }, { "epoch": 1.1892238614496473, "grad_norm": 1.0916255712509155, "learning_rate": 3.024426826655239e-05, "loss": 0.2801, "step": 1854 }, { "epoch": 1.1898652982681206, "grad_norm": 1.1136806011199951, "learning_rate": 3.0233554746089567e-05, "loss": 0.2537, "step": 1855 }, { "epoch": 1.1905067350865939, "grad_norm": 1.1340285539627075, "learning_rate": 3.022284122562674e-05, "loss": 0.2567, "step": 1856 }, { "epoch": 1.1911481719050674, "grad_norm": 1.2744938135147095, "learning_rate": 3.0212127705163916e-05, "loss": 0.2884, "step": 1857 }, { "epoch": 1.1917896087235407, "grad_norm": 1.3639729022979736, "learning_rate": 3.0201414184701094e-05, "loss": 0.2935, "step": 1858 }, { "epoch": 1.1924310455420142, "grad_norm": 1.2709541320800781, "learning_rate": 3.019070066423827e-05, "loss": 0.281, "step": 1859 }, { "epoch": 1.1930724823604875, "grad_norm": 1.118071436882019, "learning_rate": 3.0179987143775447e-05, "loss": 0.2551, "step": 1860 }, { "epoch": 1.193713919178961, "grad_norm": 1.204897403717041, "learning_rate": 3.0169273623312622e-05, "loss": 0.2681, "step": 1861 }, { "epoch": 1.1943553559974343, "grad_norm": 1.3830336332321167, "learning_rate": 3.0158560102849797e-05, "loss": 0.3011, "step": 1862 }, { "epoch": 1.1949967928159075, "grad_norm": 1.3286031484603882, "learning_rate": 3.0147846582386975e-05, "loss": 0.2868, "step": 1863 }, { "epoch": 1.195638229634381, "grad_norm": 1.3557132482528687, "learning_rate": 3.013713306192415e-05, "loss": 0.2451, "step": 1864 }, { "epoch": 1.1962796664528543, "grad_norm": 1.0660762786865234, "learning_rate": 3.0126419541461324e-05, "loss": 0.2676, "step": 1865 }, { "epoch": 1.1969211032713278, "grad_norm": 0.998414158821106, "learning_rate": 3.0115706020998503e-05, "loss": 0.2292, "step": 1866 }, { "epoch": 1.1975625400898011, "grad_norm": 1.3690241575241089, "learning_rate": 3.0104992500535677e-05, "loss": 0.3535, "step": 1867 }, { "epoch": 1.1982039769082746, "grad_norm": 1.2076692581176758, "learning_rate": 3.0094278980072855e-05, "loss": 0.2773, "step": 1868 }, { "epoch": 1.198845413726748, "grad_norm": 1.0724468231201172, "learning_rate": 3.008356545961003e-05, "loss": 0.244, "step": 1869 }, { "epoch": 1.1994868505452212, "grad_norm": 1.3701461553573608, "learning_rate": 3.0072851939147205e-05, "loss": 0.3086, "step": 1870 }, { "epoch": 1.2001282873636947, "grad_norm": 1.4121193885803223, "learning_rate": 3.0062138418684383e-05, "loss": 0.2499, "step": 1871 }, { "epoch": 1.200769724182168, "grad_norm": 1.1294076442718506, "learning_rate": 3.0051424898221558e-05, "loss": 0.2404, "step": 1872 }, { "epoch": 1.2014111610006415, "grad_norm": 1.180383563041687, "learning_rate": 3.0040711377758736e-05, "loss": 0.243, "step": 1873 }, { "epoch": 1.2020525978191148, "grad_norm": 1.0873130559921265, "learning_rate": 3.002999785729591e-05, "loss": 0.2365, "step": 1874 }, { "epoch": 1.2026940346375883, "grad_norm": 1.4282220602035522, "learning_rate": 3.0019284336833086e-05, "loss": 0.2544, "step": 1875 }, { "epoch": 1.2033354714560616, "grad_norm": 1.3637213706970215, "learning_rate": 3.0008570816370264e-05, "loss": 0.2709, "step": 1876 }, { "epoch": 1.203976908274535, "grad_norm": 1.4549216032028198, "learning_rate": 2.999785729590744e-05, "loss": 0.2949, "step": 1877 }, { "epoch": 1.2046183450930084, "grad_norm": 1.2456687688827515, "learning_rate": 2.9987143775444617e-05, "loss": 0.2422, "step": 1878 }, { "epoch": 1.2052597819114816, "grad_norm": 1.7739601135253906, "learning_rate": 2.997643025498179e-05, "loss": 0.296, "step": 1879 }, { "epoch": 1.2059012187299551, "grad_norm": 1.1847565174102783, "learning_rate": 2.9965716734518966e-05, "loss": 0.2593, "step": 1880 }, { "epoch": 1.2065426555484284, "grad_norm": 1.2416304349899292, "learning_rate": 2.9955003214056137e-05, "loss": 0.2455, "step": 1881 }, { "epoch": 1.207184092366902, "grad_norm": 1.3328155279159546, "learning_rate": 2.9944289693593312e-05, "loss": 0.3019, "step": 1882 }, { "epoch": 1.2078255291853752, "grad_norm": 1.5038000345230103, "learning_rate": 2.993357617313049e-05, "loss": 0.3242, "step": 1883 }, { "epoch": 1.2084669660038485, "grad_norm": 1.2246983051300049, "learning_rate": 2.9922862652667665e-05, "loss": 0.2754, "step": 1884 }, { "epoch": 1.209108402822322, "grad_norm": 1.2365587949752808, "learning_rate": 2.9912149132204843e-05, "loss": 0.2427, "step": 1885 }, { "epoch": 1.2097498396407953, "grad_norm": 1.2412590980529785, "learning_rate": 2.9901435611742018e-05, "loss": 0.2854, "step": 1886 }, { "epoch": 1.2103912764592688, "grad_norm": 1.2579689025878906, "learning_rate": 2.9890722091279193e-05, "loss": 0.2937, "step": 1887 }, { "epoch": 1.211032713277742, "grad_norm": 1.051466703414917, "learning_rate": 2.988000857081637e-05, "loss": 0.2416, "step": 1888 }, { "epoch": 1.2116741500962156, "grad_norm": 1.4181816577911377, "learning_rate": 2.9869295050353546e-05, "loss": 0.2827, "step": 1889 }, { "epoch": 1.2123155869146889, "grad_norm": 2.131148338317871, "learning_rate": 2.9858581529890724e-05, "loss": 0.299, "step": 1890 }, { "epoch": 1.2129570237331624, "grad_norm": 1.0510796308517456, "learning_rate": 2.98478680094279e-05, "loss": 0.2025, "step": 1891 }, { "epoch": 1.2135984605516357, "grad_norm": 1.1008985042572021, "learning_rate": 2.9837154488965073e-05, "loss": 0.2555, "step": 1892 }, { "epoch": 1.214239897370109, "grad_norm": 1.1011000871658325, "learning_rate": 2.982644096850225e-05, "loss": 0.2146, "step": 1893 }, { "epoch": 1.2148813341885825, "grad_norm": 1.1553045511245728, "learning_rate": 2.9815727448039426e-05, "loss": 0.2647, "step": 1894 }, { "epoch": 1.2155227710070557, "grad_norm": 1.2263069152832031, "learning_rate": 2.98050139275766e-05, "loss": 0.2513, "step": 1895 }, { "epoch": 1.2161642078255293, "grad_norm": 1.1350029706954956, "learning_rate": 2.979430040711378e-05, "loss": 0.2401, "step": 1896 }, { "epoch": 1.2168056446440025, "grad_norm": 1.3641287088394165, "learning_rate": 2.9783586886650954e-05, "loss": 0.3017, "step": 1897 }, { "epoch": 1.2174470814624758, "grad_norm": 1.396684169769287, "learning_rate": 2.9772873366188132e-05, "loss": 0.2714, "step": 1898 }, { "epoch": 1.2180885182809493, "grad_norm": 1.2694017887115479, "learning_rate": 2.9762159845725307e-05, "loss": 0.2581, "step": 1899 }, { "epoch": 1.2187299550994226, "grad_norm": 1.5152878761291504, "learning_rate": 2.975144632526248e-05, "loss": 0.2645, "step": 1900 }, { "epoch": 1.2193713919178961, "grad_norm": 1.4767051935195923, "learning_rate": 2.974073280479966e-05, "loss": 0.2982, "step": 1901 }, { "epoch": 1.2200128287363694, "grad_norm": 1.3525218963623047, "learning_rate": 2.9730019284336835e-05, "loss": 0.261, "step": 1902 }, { "epoch": 1.220654265554843, "grad_norm": 1.2717903852462769, "learning_rate": 2.9719305763874013e-05, "loss": 0.2532, "step": 1903 }, { "epoch": 1.2212957023733162, "grad_norm": 1.3896381855010986, "learning_rate": 2.9708592243411187e-05, "loss": 0.2767, "step": 1904 }, { "epoch": 1.2219371391917897, "grad_norm": 1.5318806171417236, "learning_rate": 2.9697878722948362e-05, "loss": 0.2793, "step": 1905 }, { "epoch": 1.222578576010263, "grad_norm": 1.2687650918960571, "learning_rate": 2.968716520248554e-05, "loss": 0.2575, "step": 1906 }, { "epoch": 1.2232200128287363, "grad_norm": 1.3511168956756592, "learning_rate": 2.9676451682022715e-05, "loss": 0.2609, "step": 1907 }, { "epoch": 1.2238614496472098, "grad_norm": 1.609208106994629, "learning_rate": 2.9665738161559893e-05, "loss": 0.3676, "step": 1908 }, { "epoch": 1.224502886465683, "grad_norm": 1.330182433128357, "learning_rate": 2.9655024641097068e-05, "loss": 0.285, "step": 1909 }, { "epoch": 1.2251443232841566, "grad_norm": 1.248788595199585, "learning_rate": 2.9644311120634243e-05, "loss": 0.2622, "step": 1910 }, { "epoch": 1.2257857601026299, "grad_norm": 1.3777700662612915, "learning_rate": 2.963359760017142e-05, "loss": 0.3098, "step": 1911 }, { "epoch": 1.2264271969211034, "grad_norm": 1.4602515697479248, "learning_rate": 2.9622884079708596e-05, "loss": 0.3521, "step": 1912 }, { "epoch": 1.2270686337395766, "grad_norm": 1.359995722770691, "learning_rate": 2.961217055924577e-05, "loss": 0.2446, "step": 1913 }, { "epoch": 1.2277100705580501, "grad_norm": 1.344071388244629, "learning_rate": 2.960145703878295e-05, "loss": 0.2804, "step": 1914 }, { "epoch": 1.2283515073765234, "grad_norm": 1.2927820682525635, "learning_rate": 2.959074351832012e-05, "loss": 0.2573, "step": 1915 }, { "epoch": 1.2289929441949967, "grad_norm": 0.9473354816436768, "learning_rate": 2.9580029997857295e-05, "loss": 0.235, "step": 1916 }, { "epoch": 1.2296343810134702, "grad_norm": 1.2079333066940308, "learning_rate": 2.956931647739447e-05, "loss": 0.2662, "step": 1917 }, { "epoch": 1.2302758178319435, "grad_norm": 1.0674856901168823, "learning_rate": 2.9558602956931648e-05, "loss": 0.2348, "step": 1918 }, { "epoch": 1.230917254650417, "grad_norm": 1.4427571296691895, "learning_rate": 2.9547889436468822e-05, "loss": 0.3503, "step": 1919 }, { "epoch": 1.2315586914688903, "grad_norm": 1.471297025680542, "learning_rate": 2.9537175916006e-05, "loss": 0.3367, "step": 1920 }, { "epoch": 1.2322001282873636, "grad_norm": 1.2942475080490112, "learning_rate": 2.9526462395543175e-05, "loss": 0.2746, "step": 1921 }, { "epoch": 1.232841565105837, "grad_norm": 1.295088768005371, "learning_rate": 2.951574887508035e-05, "loss": 0.2542, "step": 1922 }, { "epoch": 1.2334830019243104, "grad_norm": 0.8972923755645752, "learning_rate": 2.9505035354617528e-05, "loss": 0.2043, "step": 1923 }, { "epoch": 1.2341244387427839, "grad_norm": 1.2455761432647705, "learning_rate": 2.9494321834154703e-05, "loss": 0.2828, "step": 1924 }, { "epoch": 1.2347658755612572, "grad_norm": 1.1511039733886719, "learning_rate": 2.948360831369188e-05, "loss": 0.2218, "step": 1925 }, { "epoch": 1.2354073123797307, "grad_norm": 1.328898549079895, "learning_rate": 2.9472894793229056e-05, "loss": 0.3332, "step": 1926 }, { "epoch": 1.236048749198204, "grad_norm": 1.114574670791626, "learning_rate": 2.946218127276623e-05, "loss": 0.2568, "step": 1927 }, { "epoch": 1.2366901860166775, "grad_norm": 1.451870083808899, "learning_rate": 2.945146775230341e-05, "loss": 0.3175, "step": 1928 }, { "epoch": 1.2373316228351507, "grad_norm": 1.140001893043518, "learning_rate": 2.9440754231840584e-05, "loss": 0.2371, "step": 1929 }, { "epoch": 1.237973059653624, "grad_norm": 1.5889358520507812, "learning_rate": 2.943004071137776e-05, "loss": 0.4219, "step": 1930 }, { "epoch": 1.2386144964720975, "grad_norm": 1.4305144548416138, "learning_rate": 2.9419327190914936e-05, "loss": 0.3262, "step": 1931 }, { "epoch": 1.2392559332905708, "grad_norm": 1.1153689622879028, "learning_rate": 2.940861367045211e-05, "loss": 0.2329, "step": 1932 }, { "epoch": 1.2398973701090443, "grad_norm": 1.1426358222961426, "learning_rate": 2.939790014998929e-05, "loss": 0.2417, "step": 1933 }, { "epoch": 1.2405388069275176, "grad_norm": 1.2044950723648071, "learning_rate": 2.9387186629526464e-05, "loss": 0.2756, "step": 1934 }, { "epoch": 1.241180243745991, "grad_norm": 1.0273677110671997, "learning_rate": 2.937647310906364e-05, "loss": 0.2364, "step": 1935 }, { "epoch": 1.2418216805644644, "grad_norm": 1.1607811450958252, "learning_rate": 2.9365759588600817e-05, "loss": 0.2629, "step": 1936 }, { "epoch": 1.2424631173829377, "grad_norm": 1.402310848236084, "learning_rate": 2.9355046068137992e-05, "loss": 0.3016, "step": 1937 }, { "epoch": 1.2431045542014112, "grad_norm": 1.3346490859985352, "learning_rate": 2.934433254767517e-05, "loss": 0.2834, "step": 1938 }, { "epoch": 1.2437459910198845, "grad_norm": 1.1933923959732056, "learning_rate": 2.9333619027212345e-05, "loss": 0.2376, "step": 1939 }, { "epoch": 1.244387427838358, "grad_norm": 1.1837691068649292, "learning_rate": 2.932290550674952e-05, "loss": 0.25, "step": 1940 }, { "epoch": 1.2450288646568313, "grad_norm": 1.1687060594558716, "learning_rate": 2.9312191986286698e-05, "loss": 0.2685, "step": 1941 }, { "epoch": 1.2456703014753048, "grad_norm": 0.9374935626983643, "learning_rate": 2.9301478465823872e-05, "loss": 0.2095, "step": 1942 }, { "epoch": 1.246311738293778, "grad_norm": 1.1784930229187012, "learning_rate": 2.929076494536105e-05, "loss": 0.2374, "step": 1943 }, { "epoch": 1.2469531751122513, "grad_norm": 1.4752088785171509, "learning_rate": 2.9280051424898225e-05, "loss": 0.2609, "step": 1944 }, { "epoch": 1.2475946119307249, "grad_norm": 1.3494505882263184, "learning_rate": 2.92693379044354e-05, "loss": 0.2957, "step": 1945 }, { "epoch": 1.2482360487491981, "grad_norm": 0.9712784886360168, "learning_rate": 2.9258624383972578e-05, "loss": 0.2329, "step": 1946 }, { "epoch": 1.2488774855676716, "grad_norm": 1.3111125230789185, "learning_rate": 2.9247910863509753e-05, "loss": 0.2644, "step": 1947 }, { "epoch": 1.249518922386145, "grad_norm": 1.176190733909607, "learning_rate": 2.9237197343046928e-05, "loss": 0.2565, "step": 1948 }, { "epoch": 1.2501603592046182, "grad_norm": 1.226569414138794, "learning_rate": 2.92264838225841e-05, "loss": 0.254, "step": 1949 }, { "epoch": 1.2508017960230917, "grad_norm": 1.4919874668121338, "learning_rate": 2.9215770302121277e-05, "loss": 0.3216, "step": 1950 }, { "epoch": 1.2514432328415652, "grad_norm": 1.8347322940826416, "learning_rate": 2.9205056781658452e-05, "loss": 0.3456, "step": 1951 }, { "epoch": 1.2520846696600385, "grad_norm": 1.0684161186218262, "learning_rate": 2.9194343261195627e-05, "loss": 0.2525, "step": 1952 }, { "epoch": 1.2527261064785118, "grad_norm": 1.1241205930709839, "learning_rate": 2.9183629740732805e-05, "loss": 0.2512, "step": 1953 }, { "epoch": 1.2533675432969853, "grad_norm": 1.6647653579711914, "learning_rate": 2.917291622026998e-05, "loss": 0.3422, "step": 1954 }, { "epoch": 1.2540089801154586, "grad_norm": 1.199062466621399, "learning_rate": 2.9162202699807158e-05, "loss": 0.2683, "step": 1955 }, { "epoch": 1.254650416933932, "grad_norm": 1.4710321426391602, "learning_rate": 2.9151489179344333e-05, "loss": 0.2968, "step": 1956 }, { "epoch": 1.2552918537524054, "grad_norm": 1.1328563690185547, "learning_rate": 2.9140775658881507e-05, "loss": 0.2415, "step": 1957 }, { "epoch": 1.2559332905708787, "grad_norm": 1.194152593612671, "learning_rate": 2.9130062138418685e-05, "loss": 0.2533, "step": 1958 }, { "epoch": 1.2565747273893522, "grad_norm": 1.0462009906768799, "learning_rate": 2.911934861795586e-05, "loss": 0.2335, "step": 1959 }, { "epoch": 1.2572161642078254, "grad_norm": 1.1438086032867432, "learning_rate": 2.9108635097493035e-05, "loss": 0.247, "step": 1960 }, { "epoch": 1.257857601026299, "grad_norm": 1.4787466526031494, "learning_rate": 2.9097921577030213e-05, "loss": 0.3154, "step": 1961 }, { "epoch": 1.2584990378447722, "grad_norm": 1.2500156164169312, "learning_rate": 2.9087208056567388e-05, "loss": 0.2869, "step": 1962 }, { "epoch": 1.2591404746632457, "grad_norm": 1.131058692932129, "learning_rate": 2.9076494536104566e-05, "loss": 0.248, "step": 1963 }, { "epoch": 1.259781911481719, "grad_norm": 1.1576061248779297, "learning_rate": 2.906578101564174e-05, "loss": 0.242, "step": 1964 }, { "epoch": 1.2604233483001925, "grad_norm": 0.9678919911384583, "learning_rate": 2.9055067495178916e-05, "loss": 0.2171, "step": 1965 }, { "epoch": 1.2610647851186658, "grad_norm": 0.9977291822433472, "learning_rate": 2.9044353974716094e-05, "loss": 0.2248, "step": 1966 }, { "epoch": 1.261706221937139, "grad_norm": 1.3479397296905518, "learning_rate": 2.903364045425327e-05, "loss": 0.252, "step": 1967 }, { "epoch": 1.2623476587556126, "grad_norm": 1.3209221363067627, "learning_rate": 2.9022926933790447e-05, "loss": 0.3113, "step": 1968 }, { "epoch": 1.262989095574086, "grad_norm": 1.1806073188781738, "learning_rate": 2.901221341332762e-05, "loss": 0.2407, "step": 1969 }, { "epoch": 1.2636305323925594, "grad_norm": 1.3103184700012207, "learning_rate": 2.9001499892864796e-05, "loss": 0.259, "step": 1970 }, { "epoch": 1.2642719692110327, "grad_norm": 1.6088337898254395, "learning_rate": 2.8990786372401974e-05, "loss": 0.3092, "step": 1971 }, { "epoch": 1.264913406029506, "grad_norm": 1.3020786046981812, "learning_rate": 2.898007285193915e-05, "loss": 0.2536, "step": 1972 }, { "epoch": 1.2655548428479795, "grad_norm": 1.7900121212005615, "learning_rate": 2.8969359331476327e-05, "loss": 0.3443, "step": 1973 }, { "epoch": 1.266196279666453, "grad_norm": 1.3307406902313232, "learning_rate": 2.8958645811013502e-05, "loss": 0.2767, "step": 1974 }, { "epoch": 1.2668377164849263, "grad_norm": 1.3275266885757446, "learning_rate": 2.8947932290550677e-05, "loss": 0.3002, "step": 1975 }, { "epoch": 1.2674791533033996, "grad_norm": 1.137252688407898, "learning_rate": 2.8937218770087855e-05, "loss": 0.2355, "step": 1976 }, { "epoch": 1.268120590121873, "grad_norm": 1.685514211654663, "learning_rate": 2.892650524962503e-05, "loss": 0.3293, "step": 1977 }, { "epoch": 1.2687620269403463, "grad_norm": 1.2127524614334106, "learning_rate": 2.8915791729162204e-05, "loss": 0.2488, "step": 1978 }, { "epoch": 1.2694034637588198, "grad_norm": 1.4672411680221558, "learning_rate": 2.8905078208699383e-05, "loss": 0.2886, "step": 1979 }, { "epoch": 1.2700449005772931, "grad_norm": 1.365289330482483, "learning_rate": 2.8894364688236557e-05, "loss": 0.2543, "step": 1980 }, { "epoch": 1.2706863373957664, "grad_norm": 1.2184007167816162, "learning_rate": 2.8883651167773735e-05, "loss": 0.2529, "step": 1981 }, { "epoch": 1.27132777421424, "grad_norm": 1.4795604944229126, "learning_rate": 2.887293764731091e-05, "loss": 0.2907, "step": 1982 }, { "epoch": 1.2719692110327132, "grad_norm": 1.3182679414749146, "learning_rate": 2.8862224126848085e-05, "loss": 0.254, "step": 1983 }, { "epoch": 1.2726106478511867, "grad_norm": 1.1584763526916504, "learning_rate": 2.8851510606385256e-05, "loss": 0.2614, "step": 1984 }, { "epoch": 1.27325208466966, "grad_norm": 1.2153655290603638, "learning_rate": 2.8840797085922435e-05, "loss": 0.255, "step": 1985 }, { "epoch": 1.2738935214881333, "grad_norm": 1.270003318786621, "learning_rate": 2.883008356545961e-05, "loss": 0.2909, "step": 1986 }, { "epoch": 1.2745349583066068, "grad_norm": 1.0349539518356323, "learning_rate": 2.8819370044996784e-05, "loss": 0.2431, "step": 1987 }, { "epoch": 1.2751763951250803, "grad_norm": 1.298378825187683, "learning_rate": 2.8808656524533962e-05, "loss": 0.2764, "step": 1988 }, { "epoch": 1.2758178319435536, "grad_norm": 1.2657160758972168, "learning_rate": 2.8797943004071137e-05, "loss": 0.2711, "step": 1989 }, { "epoch": 1.2764592687620269, "grad_norm": 1.3734103441238403, "learning_rate": 2.8787229483608312e-05, "loss": 0.3486, "step": 1990 }, { "epoch": 1.2771007055805004, "grad_norm": 1.1151583194732666, "learning_rate": 2.877651596314549e-05, "loss": 0.2434, "step": 1991 }, { "epoch": 1.2777421423989737, "grad_norm": 1.1271939277648926, "learning_rate": 2.8765802442682665e-05, "loss": 0.2565, "step": 1992 }, { "epoch": 1.2783835792174472, "grad_norm": 1.143045425415039, "learning_rate": 2.8755088922219843e-05, "loss": 0.261, "step": 1993 }, { "epoch": 1.2790250160359204, "grad_norm": 1.169278860092163, "learning_rate": 2.8744375401757018e-05, "loss": 0.2651, "step": 1994 }, { "epoch": 1.2796664528543937, "grad_norm": 1.2044354677200317, "learning_rate": 2.8733661881294192e-05, "loss": 0.2669, "step": 1995 }, { "epoch": 1.2803078896728672, "grad_norm": 1.1145687103271484, "learning_rate": 2.872294836083137e-05, "loss": 0.2436, "step": 1996 }, { "epoch": 1.2809493264913405, "grad_norm": 0.9745197892189026, "learning_rate": 2.8712234840368545e-05, "loss": 0.2333, "step": 1997 }, { "epoch": 1.281590763309814, "grad_norm": 1.4281387329101562, "learning_rate": 2.8701521319905723e-05, "loss": 0.2912, "step": 1998 }, { "epoch": 1.2822322001282873, "grad_norm": 1.5647960901260376, "learning_rate": 2.8690807799442898e-05, "loss": 0.3301, "step": 1999 }, { "epoch": 1.2828736369467608, "grad_norm": 1.1194027662277222, "learning_rate": 2.8680094278980073e-05, "loss": 0.2463, "step": 2000 }, { "epoch": 1.283515073765234, "grad_norm": 1.2441272735595703, "learning_rate": 2.866938075851725e-05, "loss": 0.2728, "step": 2001 }, { "epoch": 1.2841565105837076, "grad_norm": 1.2419095039367676, "learning_rate": 2.8658667238054426e-05, "loss": 0.2825, "step": 2002 }, { "epoch": 1.284797947402181, "grad_norm": 1.0596320629119873, "learning_rate": 2.8647953717591604e-05, "loss": 0.2336, "step": 2003 }, { "epoch": 1.2854393842206542, "grad_norm": 0.9265231490135193, "learning_rate": 2.863724019712878e-05, "loss": 0.2102, "step": 2004 }, { "epoch": 1.2860808210391277, "grad_norm": 0.9918829798698425, "learning_rate": 2.8626526676665953e-05, "loss": 0.2285, "step": 2005 }, { "epoch": 1.286722257857601, "grad_norm": 1.0531022548675537, "learning_rate": 2.861581315620313e-05, "loss": 0.2283, "step": 2006 }, { "epoch": 1.2873636946760745, "grad_norm": 1.3989348411560059, "learning_rate": 2.8605099635740306e-05, "loss": 0.3071, "step": 2007 }, { "epoch": 1.2880051314945478, "grad_norm": 1.1128005981445312, "learning_rate": 2.859438611527748e-05, "loss": 0.2469, "step": 2008 }, { "epoch": 1.288646568313021, "grad_norm": 1.2172484397888184, "learning_rate": 2.858367259481466e-05, "loss": 0.2565, "step": 2009 }, { "epoch": 1.2892880051314946, "grad_norm": 1.311637043952942, "learning_rate": 2.8572959074351834e-05, "loss": 0.2456, "step": 2010 }, { "epoch": 1.289929441949968, "grad_norm": 1.0975154638290405, "learning_rate": 2.8562245553889012e-05, "loss": 0.2177, "step": 2011 }, { "epoch": 1.2905708787684413, "grad_norm": 1.387780785560608, "learning_rate": 2.8551532033426187e-05, "loss": 0.3033, "step": 2012 }, { "epoch": 1.2912123155869146, "grad_norm": 1.259158730506897, "learning_rate": 2.8540818512963362e-05, "loss": 0.2569, "step": 2013 }, { "epoch": 1.2918537524053881, "grad_norm": 1.6694756746292114, "learning_rate": 2.853010499250054e-05, "loss": 0.3257, "step": 2014 }, { "epoch": 1.2924951892238614, "grad_norm": 1.552249550819397, "learning_rate": 2.8519391472037715e-05, "loss": 0.3233, "step": 2015 }, { "epoch": 1.293136626042335, "grad_norm": 1.4834365844726562, "learning_rate": 2.8508677951574893e-05, "loss": 0.2971, "step": 2016 }, { "epoch": 1.2937780628608082, "grad_norm": 1.4008957147598267, "learning_rate": 2.8497964431112068e-05, "loss": 0.312, "step": 2017 }, { "epoch": 1.2944194996792815, "grad_norm": 1.4598133563995361, "learning_rate": 2.848725091064924e-05, "loss": 0.2848, "step": 2018 }, { "epoch": 1.295060936497755, "grad_norm": 1.4023903608322144, "learning_rate": 2.8476537390186414e-05, "loss": 0.2635, "step": 2019 }, { "epoch": 1.2957023733162283, "grad_norm": 1.193686604499817, "learning_rate": 2.8465823869723592e-05, "loss": 0.2508, "step": 2020 }, { "epoch": 1.2963438101347018, "grad_norm": 1.8134958744049072, "learning_rate": 2.8455110349260767e-05, "loss": 0.3945, "step": 2021 }, { "epoch": 1.296985246953175, "grad_norm": 1.1769144535064697, "learning_rate": 2.844439682879794e-05, "loss": 0.282, "step": 2022 }, { "epoch": 1.2976266837716484, "grad_norm": 1.049474835395813, "learning_rate": 2.843368330833512e-05, "loss": 0.2372, "step": 2023 }, { "epoch": 1.2982681205901219, "grad_norm": 1.1926385164260864, "learning_rate": 2.8422969787872294e-05, "loss": 0.2428, "step": 2024 }, { "epoch": 1.2989095574085954, "grad_norm": 1.9328062534332275, "learning_rate": 2.841225626740947e-05, "loss": 0.2841, "step": 2025 }, { "epoch": 1.2995509942270687, "grad_norm": 1.1165337562561035, "learning_rate": 2.8401542746946647e-05, "loss": 0.2404, "step": 2026 }, { "epoch": 1.300192431045542, "grad_norm": 1.1751765012741089, "learning_rate": 2.8390829226483822e-05, "loss": 0.2486, "step": 2027 }, { "epoch": 1.3008338678640154, "grad_norm": 1.2551817893981934, "learning_rate": 2.8380115706021e-05, "loss": 0.2713, "step": 2028 }, { "epoch": 1.3014753046824887, "grad_norm": 1.3972327709197998, "learning_rate": 2.8369402185558175e-05, "loss": 0.2834, "step": 2029 }, { "epoch": 1.3021167415009622, "grad_norm": 0.985526978969574, "learning_rate": 2.835868866509535e-05, "loss": 0.2317, "step": 2030 }, { "epoch": 1.3027581783194355, "grad_norm": 0.9984263181686401, "learning_rate": 2.8347975144632528e-05, "loss": 0.2172, "step": 2031 }, { "epoch": 1.3033996151379088, "grad_norm": 1.1873793601989746, "learning_rate": 2.8337261624169702e-05, "loss": 0.2936, "step": 2032 }, { "epoch": 1.3040410519563823, "grad_norm": 1.2561733722686768, "learning_rate": 2.832654810370688e-05, "loss": 0.2689, "step": 2033 }, { "epoch": 1.3046824887748556, "grad_norm": 1.332111120223999, "learning_rate": 2.8315834583244055e-05, "loss": 0.3021, "step": 2034 }, { "epoch": 1.305323925593329, "grad_norm": 1.1644566059112549, "learning_rate": 2.830512106278123e-05, "loss": 0.2253, "step": 2035 }, { "epoch": 1.3059653624118024, "grad_norm": 1.1888439655303955, "learning_rate": 2.8294407542318408e-05, "loss": 0.2513, "step": 2036 }, { "epoch": 1.306606799230276, "grad_norm": 1.3667259216308594, "learning_rate": 2.8283694021855583e-05, "loss": 0.3049, "step": 2037 }, { "epoch": 1.3072482360487492, "grad_norm": 1.047118902206421, "learning_rate": 2.827298050139276e-05, "loss": 0.2315, "step": 2038 }, { "epoch": 1.3078896728672227, "grad_norm": 1.1706105470657349, "learning_rate": 2.8262266980929936e-05, "loss": 0.2367, "step": 2039 }, { "epoch": 1.308531109685696, "grad_norm": 1.2088353633880615, "learning_rate": 2.825155346046711e-05, "loss": 0.2624, "step": 2040 }, { "epoch": 1.3091725465041693, "grad_norm": 1.2967371940612793, "learning_rate": 2.824083994000429e-05, "loss": 0.252, "step": 2041 }, { "epoch": 1.3098139833226428, "grad_norm": 1.6238926649093628, "learning_rate": 2.8230126419541464e-05, "loss": 0.3127, "step": 2042 }, { "epoch": 1.310455420141116, "grad_norm": 1.3755927085876465, "learning_rate": 2.821941289907864e-05, "loss": 0.2968, "step": 2043 }, { "epoch": 1.3110968569595896, "grad_norm": 1.1010665893554688, "learning_rate": 2.8208699378615817e-05, "loss": 0.2537, "step": 2044 }, { "epoch": 1.3117382937780628, "grad_norm": 1.1632906198501587, "learning_rate": 2.819798585815299e-05, "loss": 0.2403, "step": 2045 }, { "epoch": 1.3123797305965361, "grad_norm": 1.1510039567947388, "learning_rate": 2.818727233769017e-05, "loss": 0.2523, "step": 2046 }, { "epoch": 1.3130211674150096, "grad_norm": 1.329156756401062, "learning_rate": 2.8176558817227344e-05, "loss": 0.2587, "step": 2047 }, { "epoch": 1.3136626042334831, "grad_norm": 1.395485758781433, "learning_rate": 2.816584529676452e-05, "loss": 0.3134, "step": 2048 }, { "epoch": 1.3143040410519564, "grad_norm": 1.32795250415802, "learning_rate": 2.8155131776301697e-05, "loss": 0.3025, "step": 2049 }, { "epoch": 1.3149454778704297, "grad_norm": 1.743902564048767, "learning_rate": 2.8144418255838872e-05, "loss": 0.3263, "step": 2050 }, { "epoch": 1.3155869146889032, "grad_norm": 1.5823090076446533, "learning_rate": 2.813370473537605e-05, "loss": 0.3293, "step": 2051 }, { "epoch": 1.3162283515073765, "grad_norm": 1.3630844354629517, "learning_rate": 2.8122991214913218e-05, "loss": 0.3372, "step": 2052 }, { "epoch": 1.31686978832585, "grad_norm": 1.5216853618621826, "learning_rate": 2.8112277694450396e-05, "loss": 0.3784, "step": 2053 }, { "epoch": 1.3175112251443233, "grad_norm": 1.4820153713226318, "learning_rate": 2.810156417398757e-05, "loss": 0.2874, "step": 2054 }, { "epoch": 1.3181526619627966, "grad_norm": 1.4329363107681274, "learning_rate": 2.8090850653524746e-05, "loss": 0.3041, "step": 2055 }, { "epoch": 1.31879409878127, "grad_norm": 1.0986422300338745, "learning_rate": 2.8080137133061924e-05, "loss": 0.1924, "step": 2056 }, { "epoch": 1.3194355355997434, "grad_norm": 1.5137017965316772, "learning_rate": 2.80694236125991e-05, "loss": 0.3166, "step": 2057 }, { "epoch": 1.3200769724182169, "grad_norm": 1.2432185411453247, "learning_rate": 2.8058710092136277e-05, "loss": 0.2816, "step": 2058 }, { "epoch": 1.3207184092366901, "grad_norm": 1.4348214864730835, "learning_rate": 2.804799657167345e-05, "loss": 0.3415, "step": 2059 }, { "epoch": 1.3213598460551634, "grad_norm": 1.0789295434951782, "learning_rate": 2.8037283051210626e-05, "loss": 0.2427, "step": 2060 }, { "epoch": 1.322001282873637, "grad_norm": 1.0122026205062866, "learning_rate": 2.8026569530747804e-05, "loss": 0.2283, "step": 2061 }, { "epoch": 1.3226427196921104, "grad_norm": 1.0209122896194458, "learning_rate": 2.801585601028498e-05, "loss": 0.2447, "step": 2062 }, { "epoch": 1.3232841565105837, "grad_norm": 1.2823017835617065, "learning_rate": 2.8005142489822157e-05, "loss": 0.2882, "step": 2063 }, { "epoch": 1.323925593329057, "grad_norm": 1.2741869688034058, "learning_rate": 2.7994428969359332e-05, "loss": 0.2845, "step": 2064 }, { "epoch": 1.3245670301475305, "grad_norm": 1.6396678686141968, "learning_rate": 2.7983715448896507e-05, "loss": 0.3338, "step": 2065 }, { "epoch": 1.3252084669660038, "grad_norm": 1.34813392162323, "learning_rate": 2.7973001928433685e-05, "loss": 0.2748, "step": 2066 }, { "epoch": 1.3258499037844773, "grad_norm": 1.6009405851364136, "learning_rate": 2.796228840797086e-05, "loss": 0.338, "step": 2067 }, { "epoch": 1.3264913406029506, "grad_norm": 1.1916145086288452, "learning_rate": 2.7951574887508038e-05, "loss": 0.2464, "step": 2068 }, { "epoch": 1.3271327774214239, "grad_norm": 1.3931900262832642, "learning_rate": 2.7940861367045213e-05, "loss": 0.2835, "step": 2069 }, { "epoch": 1.3277742142398974, "grad_norm": 1.2720240354537964, "learning_rate": 2.7930147846582387e-05, "loss": 0.2469, "step": 2070 }, { "epoch": 1.3284156510583707, "grad_norm": 1.287179708480835, "learning_rate": 2.7919434326119566e-05, "loss": 0.2815, "step": 2071 }, { "epoch": 1.3290570878768442, "grad_norm": 1.0632914304733276, "learning_rate": 2.790872080565674e-05, "loss": 0.2342, "step": 2072 }, { "epoch": 1.3296985246953175, "grad_norm": 1.4613178968429565, "learning_rate": 2.7898007285193915e-05, "loss": 0.2787, "step": 2073 }, { "epoch": 1.330339961513791, "grad_norm": 1.6332899332046509, "learning_rate": 2.7887293764731093e-05, "loss": 0.265, "step": 2074 }, { "epoch": 1.3309813983322643, "grad_norm": 1.140023112297058, "learning_rate": 2.7876580244268268e-05, "loss": 0.2385, "step": 2075 }, { "epoch": 1.3316228351507378, "grad_norm": 1.2866066694259644, "learning_rate": 2.7865866723805446e-05, "loss": 0.2824, "step": 2076 }, { "epoch": 1.332264271969211, "grad_norm": 1.3976470232009888, "learning_rate": 2.785515320334262e-05, "loss": 0.2587, "step": 2077 }, { "epoch": 1.3329057087876843, "grad_norm": 1.4907461404800415, "learning_rate": 2.7844439682879796e-05, "loss": 0.284, "step": 2078 }, { "epoch": 1.3335471456061578, "grad_norm": 1.444069504737854, "learning_rate": 2.7833726162416974e-05, "loss": 0.2845, "step": 2079 }, { "epoch": 1.3341885824246311, "grad_norm": 1.2420120239257812, "learning_rate": 2.782301264195415e-05, "loss": 0.2567, "step": 2080 }, { "epoch": 1.3348300192431046, "grad_norm": 1.1035242080688477, "learning_rate": 2.7812299121491327e-05, "loss": 0.2339, "step": 2081 }, { "epoch": 1.335471456061578, "grad_norm": 1.273849606513977, "learning_rate": 2.78015856010285e-05, "loss": 0.2696, "step": 2082 }, { "epoch": 1.3361128928800512, "grad_norm": 1.3664226531982422, "learning_rate": 2.7790872080565676e-05, "loss": 0.3053, "step": 2083 }, { "epoch": 1.3367543296985247, "grad_norm": 1.1128590106964111, "learning_rate": 2.7780158560102854e-05, "loss": 0.2655, "step": 2084 }, { "epoch": 1.3373957665169982, "grad_norm": 1.0860817432403564, "learning_rate": 2.776944503964003e-05, "loss": 0.2416, "step": 2085 }, { "epoch": 1.3380372033354715, "grad_norm": 1.3471357822418213, "learning_rate": 2.7758731519177207e-05, "loss": 0.2846, "step": 2086 }, { "epoch": 1.3386786401539448, "grad_norm": 1.3730173110961914, "learning_rate": 2.7748017998714375e-05, "loss": 0.3304, "step": 2087 }, { "epoch": 1.3393200769724183, "grad_norm": 1.1464006900787354, "learning_rate": 2.7737304478251553e-05, "loss": 0.2515, "step": 2088 }, { "epoch": 1.3399615137908916, "grad_norm": 1.2007405757904053, "learning_rate": 2.7726590957788728e-05, "loss": 0.2474, "step": 2089 }, { "epoch": 1.340602950609365, "grad_norm": 1.1382662057876587, "learning_rate": 2.7715877437325903e-05, "loss": 0.2418, "step": 2090 }, { "epoch": 1.3412443874278384, "grad_norm": 0.9838912487030029, "learning_rate": 2.770516391686308e-05, "loss": 0.2271, "step": 2091 }, { "epoch": 1.3418858242463116, "grad_norm": 1.2778624296188354, "learning_rate": 2.7694450396400256e-05, "loss": 0.3095, "step": 2092 }, { "epoch": 1.3425272610647851, "grad_norm": 1.2449029684066772, "learning_rate": 2.7683736875937434e-05, "loss": 0.2616, "step": 2093 }, { "epoch": 1.3431686978832584, "grad_norm": 1.4095319509506226, "learning_rate": 2.767302335547461e-05, "loss": 0.3211, "step": 2094 }, { "epoch": 1.343810134701732, "grad_norm": 1.2966097593307495, "learning_rate": 2.7662309835011784e-05, "loss": 0.2537, "step": 2095 }, { "epoch": 1.3444515715202052, "grad_norm": 1.3841612339019775, "learning_rate": 2.765159631454896e-05, "loss": 0.2937, "step": 2096 }, { "epoch": 1.3450930083386785, "grad_norm": 1.0903805494308472, "learning_rate": 2.7640882794086136e-05, "loss": 0.2434, "step": 2097 }, { "epoch": 1.345734445157152, "grad_norm": 1.4654673337936401, "learning_rate": 2.7630169273623315e-05, "loss": 0.2823, "step": 2098 }, { "epoch": 1.3463758819756255, "grad_norm": 1.408163070678711, "learning_rate": 2.761945575316049e-05, "loss": 0.2511, "step": 2099 }, { "epoch": 1.3470173187940988, "grad_norm": 1.1438428163528442, "learning_rate": 2.7608742232697664e-05, "loss": 0.2307, "step": 2100 }, { "epoch": 1.347658755612572, "grad_norm": 1.262857437133789, "learning_rate": 2.7598028712234842e-05, "loss": 0.2641, "step": 2101 }, { "epoch": 1.3483001924310456, "grad_norm": 1.0474237203598022, "learning_rate": 2.7587315191772017e-05, "loss": 0.2233, "step": 2102 }, { "epoch": 1.3489416292495189, "grad_norm": 1.5409412384033203, "learning_rate": 2.7576601671309192e-05, "loss": 0.2847, "step": 2103 }, { "epoch": 1.3495830660679924, "grad_norm": 1.3951090574264526, "learning_rate": 2.756588815084637e-05, "loss": 0.3095, "step": 2104 }, { "epoch": 1.3502245028864657, "grad_norm": 1.7073084115982056, "learning_rate": 2.7555174630383545e-05, "loss": 0.248, "step": 2105 }, { "epoch": 1.350865939704939, "grad_norm": 1.4738471508026123, "learning_rate": 2.7544461109920723e-05, "loss": 0.3176, "step": 2106 }, { "epoch": 1.3515073765234125, "grad_norm": 1.2568374872207642, "learning_rate": 2.7533747589457898e-05, "loss": 0.2913, "step": 2107 }, { "epoch": 1.3521488133418857, "grad_norm": 1.3365038633346558, "learning_rate": 2.7523034068995072e-05, "loss": 0.3089, "step": 2108 }, { "epoch": 1.3527902501603593, "grad_norm": 1.1863785982131958, "learning_rate": 2.751232054853225e-05, "loss": 0.223, "step": 2109 }, { "epoch": 1.3534316869788325, "grad_norm": 1.29108464717865, "learning_rate": 2.7501607028069425e-05, "loss": 0.2708, "step": 2110 }, { "epoch": 1.354073123797306, "grad_norm": 1.234071969985962, "learning_rate": 2.7490893507606603e-05, "loss": 0.2506, "step": 2111 }, { "epoch": 1.3547145606157793, "grad_norm": 1.0406149625778198, "learning_rate": 2.7480179987143778e-05, "loss": 0.2345, "step": 2112 }, { "epoch": 1.3553559974342528, "grad_norm": 1.0853224992752075, "learning_rate": 2.7469466466680953e-05, "loss": 0.2459, "step": 2113 }, { "epoch": 1.3559974342527261, "grad_norm": 1.2466799020767212, "learning_rate": 2.745875294621813e-05, "loss": 0.2788, "step": 2114 }, { "epoch": 1.3566388710711994, "grad_norm": 1.3246904611587524, "learning_rate": 2.7448039425755306e-05, "loss": 0.2732, "step": 2115 }, { "epoch": 1.357280307889673, "grad_norm": 1.163664698600769, "learning_rate": 2.7437325905292484e-05, "loss": 0.2473, "step": 2116 }, { "epoch": 1.3579217447081462, "grad_norm": 1.2397650480270386, "learning_rate": 2.742661238482966e-05, "loss": 0.2508, "step": 2117 }, { "epoch": 1.3585631815266197, "grad_norm": 2.0033833980560303, "learning_rate": 2.7415898864366834e-05, "loss": 0.4147, "step": 2118 }, { "epoch": 1.359204618345093, "grad_norm": 1.3681297302246094, "learning_rate": 2.740518534390401e-05, "loss": 0.3181, "step": 2119 }, { "epoch": 1.3598460551635663, "grad_norm": 1.1688487529754639, "learning_rate": 2.7394471823441186e-05, "loss": 0.2382, "step": 2120 }, { "epoch": 1.3604874919820398, "grad_norm": 1.2269103527069092, "learning_rate": 2.7383758302978358e-05, "loss": 0.2541, "step": 2121 }, { "epoch": 1.3611289288005133, "grad_norm": 1.0909174680709839, "learning_rate": 2.7373044782515533e-05, "loss": 0.2329, "step": 2122 }, { "epoch": 1.3617703656189866, "grad_norm": 1.1438307762145996, "learning_rate": 2.736233126205271e-05, "loss": 0.2313, "step": 2123 }, { "epoch": 1.3624118024374599, "grad_norm": 1.6684354543685913, "learning_rate": 2.7351617741589885e-05, "loss": 0.2492, "step": 2124 }, { "epoch": 1.3630532392559334, "grad_norm": 1.5336259603500366, "learning_rate": 2.734090422112706e-05, "loss": 0.308, "step": 2125 }, { "epoch": 1.3636946760744066, "grad_norm": 1.5047674179077148, "learning_rate": 2.733019070066424e-05, "loss": 0.3166, "step": 2126 }, { "epoch": 1.3643361128928801, "grad_norm": 0.9943107962608337, "learning_rate": 2.7319477180201413e-05, "loss": 0.2081, "step": 2127 }, { "epoch": 1.3649775497113534, "grad_norm": 1.3598551750183105, "learning_rate": 2.730876365973859e-05, "loss": 0.2527, "step": 2128 }, { "epoch": 1.3656189865298267, "grad_norm": 1.3564071655273438, "learning_rate": 2.7298050139275766e-05, "loss": 0.263, "step": 2129 }, { "epoch": 1.3662604233483002, "grad_norm": 1.321958303451538, "learning_rate": 2.728733661881294e-05, "loss": 0.2158, "step": 2130 }, { "epoch": 1.3669018601667735, "grad_norm": 1.0040196180343628, "learning_rate": 2.727662309835012e-05, "loss": 0.2253, "step": 2131 }, { "epoch": 1.367543296985247, "grad_norm": 1.1532388925552368, "learning_rate": 2.7265909577887294e-05, "loss": 0.2562, "step": 2132 }, { "epoch": 1.3681847338037203, "grad_norm": 1.0114916563034058, "learning_rate": 2.7255196057424472e-05, "loss": 0.2217, "step": 2133 }, { "epoch": 1.3688261706221936, "grad_norm": 1.1943293809890747, "learning_rate": 2.7244482536961647e-05, "loss": 0.2257, "step": 2134 }, { "epoch": 1.369467607440667, "grad_norm": 1.548795461654663, "learning_rate": 2.723376901649882e-05, "loss": 0.341, "step": 2135 }, { "epoch": 1.3701090442591406, "grad_norm": 1.340909719467163, "learning_rate": 2.7223055496036e-05, "loss": 0.2454, "step": 2136 }, { "epoch": 1.3707504810776139, "grad_norm": 1.335349202156067, "learning_rate": 2.7212341975573174e-05, "loss": 0.2388, "step": 2137 }, { "epoch": 1.3713919178960872, "grad_norm": 1.41139554977417, "learning_rate": 2.720162845511035e-05, "loss": 0.2653, "step": 2138 }, { "epoch": 1.3720333547145607, "grad_norm": 1.5735673904418945, "learning_rate": 2.7190914934647527e-05, "loss": 0.3308, "step": 2139 }, { "epoch": 1.372674791533034, "grad_norm": 1.0361653566360474, "learning_rate": 2.7180201414184702e-05, "loss": 0.2214, "step": 2140 }, { "epoch": 1.3733162283515075, "grad_norm": 1.1458537578582764, "learning_rate": 2.716948789372188e-05, "loss": 0.2337, "step": 2141 }, { "epoch": 1.3739576651699807, "grad_norm": 1.6171391010284424, "learning_rate": 2.7158774373259055e-05, "loss": 0.3157, "step": 2142 }, { "epoch": 1.374599101988454, "grad_norm": 1.1552114486694336, "learning_rate": 2.714806085279623e-05, "loss": 0.2405, "step": 2143 }, { "epoch": 1.3752405388069275, "grad_norm": 1.3437678813934326, "learning_rate": 2.7137347332333408e-05, "loss": 0.2499, "step": 2144 }, { "epoch": 1.3758819756254008, "grad_norm": 1.1920835971832275, "learning_rate": 2.7126633811870583e-05, "loss": 0.2654, "step": 2145 }, { "epoch": 1.3765234124438743, "grad_norm": 1.414363145828247, "learning_rate": 2.711592029140776e-05, "loss": 0.2364, "step": 2146 }, { "epoch": 1.3771648492623476, "grad_norm": 1.0126769542694092, "learning_rate": 2.7105206770944935e-05, "loss": 0.2457, "step": 2147 }, { "epoch": 1.3778062860808211, "grad_norm": 1.4437354803085327, "learning_rate": 2.709449325048211e-05, "loss": 0.3476, "step": 2148 }, { "epoch": 1.3784477228992944, "grad_norm": 1.05747652053833, "learning_rate": 2.708377973001929e-05, "loss": 0.2416, "step": 2149 }, { "epoch": 1.379089159717768, "grad_norm": 1.030856966972351, "learning_rate": 2.7073066209556463e-05, "loss": 0.2522, "step": 2150 }, { "epoch": 1.3797305965362412, "grad_norm": 1.2570937871932983, "learning_rate": 2.706235268909364e-05, "loss": 0.2853, "step": 2151 }, { "epoch": 1.3803720333547145, "grad_norm": 1.1607999801635742, "learning_rate": 2.7051639168630816e-05, "loss": 0.2385, "step": 2152 }, { "epoch": 1.381013470173188, "grad_norm": 1.434055209159851, "learning_rate": 2.704092564816799e-05, "loss": 0.3342, "step": 2153 }, { "epoch": 1.3816549069916613, "grad_norm": 1.2156928777694702, "learning_rate": 2.703021212770517e-05, "loss": 0.2474, "step": 2154 }, { "epoch": 1.3822963438101348, "grad_norm": 1.2412667274475098, "learning_rate": 2.7019498607242337e-05, "loss": 0.2788, "step": 2155 }, { "epoch": 1.382937780628608, "grad_norm": 1.1144551038742065, "learning_rate": 2.7008785086779515e-05, "loss": 0.2568, "step": 2156 }, { "epoch": 1.3835792174470813, "grad_norm": 1.710383653640747, "learning_rate": 2.699807156631669e-05, "loss": 0.3363, "step": 2157 }, { "epoch": 1.3842206542655549, "grad_norm": 1.6924232244491577, "learning_rate": 2.6987358045853868e-05, "loss": 0.4041, "step": 2158 }, { "epoch": 1.3848620910840284, "grad_norm": 1.0568861961364746, "learning_rate": 2.6976644525391043e-05, "loss": 0.2528, "step": 2159 }, { "epoch": 1.3855035279025016, "grad_norm": 1.3128046989440918, "learning_rate": 2.6965931004928218e-05, "loss": 0.273, "step": 2160 }, { "epoch": 1.386144964720975, "grad_norm": 1.1967566013336182, "learning_rate": 2.6955217484465396e-05, "loss": 0.259, "step": 2161 }, { "epoch": 1.3867864015394484, "grad_norm": 1.8642117977142334, "learning_rate": 2.694450396400257e-05, "loss": 0.2575, "step": 2162 }, { "epoch": 1.3874278383579217, "grad_norm": 1.343948483467102, "learning_rate": 2.693379044353975e-05, "loss": 0.2938, "step": 2163 }, { "epoch": 1.3880692751763952, "grad_norm": 1.581681251525879, "learning_rate": 2.6923076923076923e-05, "loss": 0.3557, "step": 2164 }, { "epoch": 1.3887107119948685, "grad_norm": 1.2238922119140625, "learning_rate": 2.6912363402614098e-05, "loss": 0.2553, "step": 2165 }, { "epoch": 1.3893521488133418, "grad_norm": 1.454721450805664, "learning_rate": 2.6901649882151276e-05, "loss": 0.2847, "step": 2166 }, { "epoch": 1.3899935856318153, "grad_norm": 1.4565757513046265, "learning_rate": 2.689093636168845e-05, "loss": 0.2919, "step": 2167 }, { "epoch": 1.3906350224502886, "grad_norm": 1.066788911819458, "learning_rate": 2.6880222841225626e-05, "loss": 0.2357, "step": 2168 }, { "epoch": 1.391276459268762, "grad_norm": 1.2164192199707031, "learning_rate": 2.6869509320762804e-05, "loss": 0.2699, "step": 2169 }, { "epoch": 1.3919178960872354, "grad_norm": 1.3837262392044067, "learning_rate": 2.685879580029998e-05, "loss": 0.2774, "step": 2170 }, { "epoch": 1.3925593329057087, "grad_norm": 1.2348700761795044, "learning_rate": 2.6848082279837157e-05, "loss": 0.2986, "step": 2171 }, { "epoch": 1.3932007697241822, "grad_norm": 1.8257155418395996, "learning_rate": 2.683736875937433e-05, "loss": 0.3453, "step": 2172 }, { "epoch": 1.3938422065426557, "grad_norm": 1.2241830825805664, "learning_rate": 2.6826655238911506e-05, "loss": 0.2823, "step": 2173 }, { "epoch": 1.394483643361129, "grad_norm": 1.0941808223724365, "learning_rate": 2.6815941718448684e-05, "loss": 0.25, "step": 2174 }, { "epoch": 1.3951250801796022, "grad_norm": 1.2786304950714111, "learning_rate": 2.680522819798586e-05, "loss": 0.2684, "step": 2175 }, { "epoch": 1.3957665169980757, "grad_norm": 1.3114291429519653, "learning_rate": 2.6794514677523037e-05, "loss": 0.2698, "step": 2176 }, { "epoch": 1.396407953816549, "grad_norm": 1.1220709085464478, "learning_rate": 2.6783801157060212e-05, "loss": 0.258, "step": 2177 }, { "epoch": 1.3970493906350225, "grad_norm": 1.2551230192184448, "learning_rate": 2.6773087636597387e-05, "loss": 0.2897, "step": 2178 }, { "epoch": 1.3976908274534958, "grad_norm": 1.2013821601867676, "learning_rate": 2.6762374116134565e-05, "loss": 0.2544, "step": 2179 }, { "epoch": 1.398332264271969, "grad_norm": 1.1773312091827393, "learning_rate": 2.675166059567174e-05, "loss": 0.2535, "step": 2180 }, { "epoch": 1.3989737010904426, "grad_norm": 1.6232131719589233, "learning_rate": 2.6740947075208918e-05, "loss": 0.2832, "step": 2181 }, { "epoch": 1.399615137908916, "grad_norm": 0.9510869979858398, "learning_rate": 2.6730233554746093e-05, "loss": 0.2269, "step": 2182 }, { "epoch": 1.4002565747273894, "grad_norm": 1.1934577226638794, "learning_rate": 2.6719520034283267e-05, "loss": 0.2634, "step": 2183 }, { "epoch": 1.4008980115458627, "grad_norm": 1.2063813209533691, "learning_rate": 2.6708806513820446e-05, "loss": 0.2543, "step": 2184 }, { "epoch": 1.4015394483643362, "grad_norm": 1.1976827383041382, "learning_rate": 2.669809299335762e-05, "loss": 0.2647, "step": 2185 }, { "epoch": 1.4021808851828095, "grad_norm": 1.2905714511871338, "learning_rate": 2.6687379472894795e-05, "loss": 0.2601, "step": 2186 }, { "epoch": 1.402822322001283, "grad_norm": 1.179269552230835, "learning_rate": 2.6676665952431973e-05, "loss": 0.2232, "step": 2187 }, { "epoch": 1.4034637588197563, "grad_norm": 1.2705920934677124, "learning_rate": 2.6665952431969148e-05, "loss": 0.2558, "step": 2188 }, { "epoch": 1.4041051956382296, "grad_norm": 1.35444176197052, "learning_rate": 2.6655238911506326e-05, "loss": 0.2722, "step": 2189 }, { "epoch": 1.404746632456703, "grad_norm": 1.3937642574310303, "learning_rate": 2.6644525391043494e-05, "loss": 0.2643, "step": 2190 }, { "epoch": 1.4053880692751763, "grad_norm": 1.0913066864013672, "learning_rate": 2.6633811870580672e-05, "loss": 0.2007, "step": 2191 }, { "epoch": 1.4060295060936499, "grad_norm": 1.3274166584014893, "learning_rate": 2.6623098350117847e-05, "loss": 0.2658, "step": 2192 }, { "epoch": 1.4066709429121231, "grad_norm": 1.1965969800949097, "learning_rate": 2.6612384829655025e-05, "loss": 0.242, "step": 2193 }, { "epoch": 1.4073123797305964, "grad_norm": 1.0742483139038086, "learning_rate": 2.66016713091922e-05, "loss": 0.2264, "step": 2194 }, { "epoch": 1.40795381654907, "grad_norm": 1.304748296737671, "learning_rate": 2.6590957788729375e-05, "loss": 0.2541, "step": 2195 }, { "epoch": 1.4085952533675434, "grad_norm": 1.2498356103897095, "learning_rate": 2.6580244268266553e-05, "loss": 0.2585, "step": 2196 }, { "epoch": 1.4092366901860167, "grad_norm": 1.130019187927246, "learning_rate": 2.6569530747803728e-05, "loss": 0.2481, "step": 2197 }, { "epoch": 1.40987812700449, "grad_norm": 1.417426586151123, "learning_rate": 2.6558817227340906e-05, "loss": 0.2961, "step": 2198 }, { "epoch": 1.4105195638229635, "grad_norm": 1.1174983978271484, "learning_rate": 2.654810370687808e-05, "loss": 0.2586, "step": 2199 }, { "epoch": 1.4111610006414368, "grad_norm": 1.107313632965088, "learning_rate": 2.6537390186415255e-05, "loss": 0.2334, "step": 2200 }, { "epoch": 1.4118024374599103, "grad_norm": 1.2531254291534424, "learning_rate": 2.6526676665952434e-05, "loss": 0.2469, "step": 2201 }, { "epoch": 1.4124438742783836, "grad_norm": 1.4171444177627563, "learning_rate": 2.6515963145489608e-05, "loss": 0.2747, "step": 2202 }, { "epoch": 1.4130853110968569, "grad_norm": 1.123867154121399, "learning_rate": 2.6505249625026783e-05, "loss": 0.2415, "step": 2203 }, { "epoch": 1.4137267479153304, "grad_norm": 1.0770536661148071, "learning_rate": 2.649453610456396e-05, "loss": 0.2283, "step": 2204 }, { "epoch": 1.4143681847338037, "grad_norm": 1.2274585962295532, "learning_rate": 2.6483822584101136e-05, "loss": 0.211, "step": 2205 }, { "epoch": 1.4150096215522772, "grad_norm": 1.352617859840393, "learning_rate": 2.6473109063638314e-05, "loss": 0.2814, "step": 2206 }, { "epoch": 1.4156510583707504, "grad_norm": 1.1846604347229004, "learning_rate": 2.646239554317549e-05, "loss": 0.2442, "step": 2207 }, { "epoch": 1.4162924951892237, "grad_norm": 1.312095046043396, "learning_rate": 2.6451682022712664e-05, "loss": 0.249, "step": 2208 }, { "epoch": 1.4169339320076972, "grad_norm": 1.0819149017333984, "learning_rate": 2.6440968502249842e-05, "loss": 0.2203, "step": 2209 }, { "epoch": 1.4175753688261707, "grad_norm": 1.3528403043746948, "learning_rate": 2.6430254981787017e-05, "loss": 0.2591, "step": 2210 }, { "epoch": 1.418216805644644, "grad_norm": 1.18276047706604, "learning_rate": 2.6419541461324195e-05, "loss": 0.2964, "step": 2211 }, { "epoch": 1.4188582424631173, "grad_norm": 1.4562344551086426, "learning_rate": 2.640882794086137e-05, "loss": 0.3067, "step": 2212 }, { "epoch": 1.4194996792815908, "grad_norm": 1.1619645357131958, "learning_rate": 2.6398114420398544e-05, "loss": 0.2647, "step": 2213 }, { "epoch": 1.420141116100064, "grad_norm": 1.266046404838562, "learning_rate": 2.6387400899935722e-05, "loss": 0.2785, "step": 2214 }, { "epoch": 1.4207825529185376, "grad_norm": 1.242631196975708, "learning_rate": 2.6376687379472897e-05, "loss": 0.2733, "step": 2215 }, { "epoch": 1.421423989737011, "grad_norm": 1.1965094804763794, "learning_rate": 2.6365973859010075e-05, "loss": 0.2603, "step": 2216 }, { "epoch": 1.4220654265554842, "grad_norm": 1.0872071981430054, "learning_rate": 2.635526033854725e-05, "loss": 0.2454, "step": 2217 }, { "epoch": 1.4227068633739577, "grad_norm": 1.3785732984542847, "learning_rate": 2.6344546818084425e-05, "loss": 0.2964, "step": 2218 }, { "epoch": 1.423348300192431, "grad_norm": 1.061580777168274, "learning_rate": 2.6333833297621603e-05, "loss": 0.2312, "step": 2219 }, { "epoch": 1.4239897370109045, "grad_norm": 1.2246729135513306, "learning_rate": 2.6323119777158778e-05, "loss": 0.2883, "step": 2220 }, { "epoch": 1.4246311738293778, "grad_norm": 1.55219304561615, "learning_rate": 2.6312406256695952e-05, "loss": 0.3626, "step": 2221 }, { "epoch": 1.4252726106478513, "grad_norm": 1.3276934623718262, "learning_rate": 2.630169273623313e-05, "loss": 0.2716, "step": 2222 }, { "epoch": 1.4259140474663246, "grad_norm": 1.3365180492401123, "learning_rate": 2.6290979215770305e-05, "loss": 0.3046, "step": 2223 }, { "epoch": 1.426555484284798, "grad_norm": 1.1597124338150024, "learning_rate": 2.6280265695307477e-05, "loss": 0.2484, "step": 2224 }, { "epoch": 1.4271969211032713, "grad_norm": 1.339127540588379, "learning_rate": 2.626955217484465e-05, "loss": 0.2526, "step": 2225 }, { "epoch": 1.4278383579217446, "grad_norm": 1.3153502941131592, "learning_rate": 2.625883865438183e-05, "loss": 0.2632, "step": 2226 }, { "epoch": 1.4284797947402181, "grad_norm": 1.3042664527893066, "learning_rate": 2.6248125133919004e-05, "loss": 0.2638, "step": 2227 }, { "epoch": 1.4291212315586914, "grad_norm": 1.4461424350738525, "learning_rate": 2.6237411613456183e-05, "loss": 0.2883, "step": 2228 }, { "epoch": 1.429762668377165, "grad_norm": 1.4270662069320679, "learning_rate": 2.6226698092993357e-05, "loss": 0.2518, "step": 2229 }, { "epoch": 1.4304041051956382, "grad_norm": 1.479407787322998, "learning_rate": 2.6215984572530532e-05, "loss": 0.2812, "step": 2230 }, { "epoch": 1.4310455420141115, "grad_norm": 1.376577377319336, "learning_rate": 2.620527105206771e-05, "loss": 0.2659, "step": 2231 }, { "epoch": 1.431686978832585, "grad_norm": 1.562712550163269, "learning_rate": 2.6194557531604885e-05, "loss": 0.27, "step": 2232 }, { "epoch": 1.4323284156510585, "grad_norm": 1.4573413133621216, "learning_rate": 2.618384401114206e-05, "loss": 0.2819, "step": 2233 }, { "epoch": 1.4329698524695318, "grad_norm": 1.376889705657959, "learning_rate": 2.6173130490679238e-05, "loss": 0.286, "step": 2234 }, { "epoch": 1.433611289288005, "grad_norm": 1.5691068172454834, "learning_rate": 2.6162416970216413e-05, "loss": 0.2549, "step": 2235 }, { "epoch": 1.4342527261064786, "grad_norm": 1.2224689722061157, "learning_rate": 2.615170344975359e-05, "loss": 0.2154, "step": 2236 }, { "epoch": 1.4348941629249519, "grad_norm": 1.5342118740081787, "learning_rate": 2.6140989929290766e-05, "loss": 0.2938, "step": 2237 }, { "epoch": 1.4355355997434254, "grad_norm": 1.1272268295288086, "learning_rate": 2.613027640882794e-05, "loss": 0.2269, "step": 2238 }, { "epoch": 1.4361770365618987, "grad_norm": 1.4715546369552612, "learning_rate": 2.611956288836512e-05, "loss": 0.2919, "step": 2239 }, { "epoch": 1.436818473380372, "grad_norm": 1.0589851140975952, "learning_rate": 2.6108849367902293e-05, "loss": 0.2759, "step": 2240 }, { "epoch": 1.4374599101988454, "grad_norm": 1.2822717428207397, "learning_rate": 2.609813584743947e-05, "loss": 0.2769, "step": 2241 }, { "epoch": 1.4381013470173187, "grad_norm": 1.2179769277572632, "learning_rate": 2.6087422326976646e-05, "loss": 0.2775, "step": 2242 }, { "epoch": 1.4387427838357922, "grad_norm": 1.3934147357940674, "learning_rate": 2.607670880651382e-05, "loss": 0.2958, "step": 2243 }, { "epoch": 1.4393842206542655, "grad_norm": 1.10690176486969, "learning_rate": 2.6065995286051e-05, "loss": 0.2468, "step": 2244 }, { "epoch": 1.4400256574727388, "grad_norm": 1.1078323125839233, "learning_rate": 2.6055281765588174e-05, "loss": 0.2612, "step": 2245 }, { "epoch": 1.4406670942912123, "grad_norm": 1.0818219184875488, "learning_rate": 2.6044568245125352e-05, "loss": 0.2248, "step": 2246 }, { "epoch": 1.4413085311096858, "grad_norm": 0.899897038936615, "learning_rate": 2.6033854724662527e-05, "loss": 0.2181, "step": 2247 }, { "epoch": 1.441949967928159, "grad_norm": 1.3334537744522095, "learning_rate": 2.60231412041997e-05, "loss": 0.2889, "step": 2248 }, { "epoch": 1.4425914047466324, "grad_norm": 1.1606712341308594, "learning_rate": 2.601242768373688e-05, "loss": 0.2488, "step": 2249 }, { "epoch": 1.443232841565106, "grad_norm": 1.2100658416748047, "learning_rate": 2.6001714163274054e-05, "loss": 0.2262, "step": 2250 }, { "epoch": 1.4438742783835792, "grad_norm": 1.475466012954712, "learning_rate": 2.599100064281123e-05, "loss": 0.3406, "step": 2251 }, { "epoch": 1.4445157152020527, "grad_norm": 1.409298062324524, "learning_rate": 2.5980287122348407e-05, "loss": 0.3086, "step": 2252 }, { "epoch": 1.445157152020526, "grad_norm": 1.2024884223937988, "learning_rate": 2.5969573601885582e-05, "loss": 0.256, "step": 2253 }, { "epoch": 1.4457985888389993, "grad_norm": 1.380705714225769, "learning_rate": 2.595886008142276e-05, "loss": 0.2819, "step": 2254 }, { "epoch": 1.4464400256574728, "grad_norm": 1.3454324007034302, "learning_rate": 2.5948146560959935e-05, "loss": 0.2582, "step": 2255 }, { "epoch": 1.447081462475946, "grad_norm": 1.5278685092926025, "learning_rate": 2.593743304049711e-05, "loss": 0.3167, "step": 2256 }, { "epoch": 1.4477228992944196, "grad_norm": 1.2008321285247803, "learning_rate": 2.5926719520034288e-05, "loss": 0.2144, "step": 2257 }, { "epoch": 1.4483643361128928, "grad_norm": 1.4276386499404907, "learning_rate": 2.591600599957146e-05, "loss": 0.2315, "step": 2258 }, { "epoch": 1.4490057729313663, "grad_norm": 1.1390019655227661, "learning_rate": 2.5905292479108634e-05, "loss": 0.2341, "step": 2259 }, { "epoch": 1.4496472097498396, "grad_norm": 1.112757921218872, "learning_rate": 2.589457895864581e-05, "loss": 0.2209, "step": 2260 }, { "epoch": 1.4502886465683131, "grad_norm": 1.1817731857299805, "learning_rate": 2.5883865438182987e-05, "loss": 0.2469, "step": 2261 }, { "epoch": 1.4509300833867864, "grad_norm": 1.0888516902923584, "learning_rate": 2.587315191772016e-05, "loss": 0.2117, "step": 2262 }, { "epoch": 1.4515715202052597, "grad_norm": 1.4068751335144043, "learning_rate": 2.5862438397257336e-05, "loss": 0.2664, "step": 2263 }, { "epoch": 1.4522129570237332, "grad_norm": 1.1735535860061646, "learning_rate": 2.5851724876794515e-05, "loss": 0.2472, "step": 2264 }, { "epoch": 1.4528543938422065, "grad_norm": 1.4847655296325684, "learning_rate": 2.584101135633169e-05, "loss": 0.3152, "step": 2265 }, { "epoch": 1.45349583066068, "grad_norm": 1.2347310781478882, "learning_rate": 2.5830297835868867e-05, "loss": 0.2475, "step": 2266 }, { "epoch": 1.4541372674791533, "grad_norm": 1.3051111698150635, "learning_rate": 2.5819584315406042e-05, "loss": 0.2393, "step": 2267 }, { "epoch": 1.4547787042976266, "grad_norm": 1.173987627029419, "learning_rate": 2.5808870794943217e-05, "loss": 0.2487, "step": 2268 }, { "epoch": 1.4554201411161, "grad_norm": 1.2636784315109253, "learning_rate": 2.5798157274480395e-05, "loss": 0.2508, "step": 2269 }, { "epoch": 1.4560615779345736, "grad_norm": 1.1800167560577393, "learning_rate": 2.578744375401757e-05, "loss": 0.2617, "step": 2270 }, { "epoch": 1.4567030147530469, "grad_norm": 1.120592713356018, "learning_rate": 2.5776730233554748e-05, "loss": 0.2402, "step": 2271 }, { "epoch": 1.4573444515715201, "grad_norm": 1.3507893085479736, "learning_rate": 2.5766016713091923e-05, "loss": 0.2881, "step": 2272 }, { "epoch": 1.4579858883899937, "grad_norm": 1.201192021369934, "learning_rate": 2.5755303192629098e-05, "loss": 0.2575, "step": 2273 }, { "epoch": 1.458627325208467, "grad_norm": 1.6211949586868286, "learning_rate": 2.5744589672166276e-05, "loss": 0.3047, "step": 2274 }, { "epoch": 1.4592687620269404, "grad_norm": 1.1628997325897217, "learning_rate": 2.573387615170345e-05, "loss": 0.2566, "step": 2275 }, { "epoch": 1.4599101988454137, "grad_norm": 1.2635712623596191, "learning_rate": 2.572316263124063e-05, "loss": 0.2246, "step": 2276 }, { "epoch": 1.460551635663887, "grad_norm": 1.185896396636963, "learning_rate": 2.5712449110777803e-05, "loss": 0.2808, "step": 2277 }, { "epoch": 1.4611930724823605, "grad_norm": 1.3321750164031982, "learning_rate": 2.5701735590314978e-05, "loss": 0.2874, "step": 2278 }, { "epoch": 1.4618345093008338, "grad_norm": 1.0428273677825928, "learning_rate": 2.5691022069852156e-05, "loss": 0.2348, "step": 2279 }, { "epoch": 1.4624759461193073, "grad_norm": 1.1209852695465088, "learning_rate": 2.568030854938933e-05, "loss": 0.2347, "step": 2280 }, { "epoch": 1.4631173829377806, "grad_norm": 1.1730903387069702, "learning_rate": 2.5669595028926506e-05, "loss": 0.262, "step": 2281 }, { "epoch": 1.4637588197562539, "grad_norm": 1.343227505683899, "learning_rate": 2.5658881508463684e-05, "loss": 0.2843, "step": 2282 }, { "epoch": 1.4644002565747274, "grad_norm": 1.3650845289230347, "learning_rate": 2.564816798800086e-05, "loss": 0.2603, "step": 2283 }, { "epoch": 1.465041693393201, "grad_norm": 1.1079601049423218, "learning_rate": 2.5637454467538037e-05, "loss": 0.2308, "step": 2284 }, { "epoch": 1.4656831302116742, "grad_norm": 1.2708293199539185, "learning_rate": 2.562674094707521e-05, "loss": 0.2609, "step": 2285 }, { "epoch": 1.4663245670301475, "grad_norm": 1.2195595502853394, "learning_rate": 2.5616027426612386e-05, "loss": 0.2639, "step": 2286 }, { "epoch": 1.466966003848621, "grad_norm": 1.5114868879318237, "learning_rate": 2.5605313906149565e-05, "loss": 0.2823, "step": 2287 }, { "epoch": 1.4676074406670943, "grad_norm": 1.7246792316436768, "learning_rate": 2.559460038568674e-05, "loss": 0.3228, "step": 2288 }, { "epoch": 1.4682488774855678, "grad_norm": 1.5381743907928467, "learning_rate": 2.5583886865223917e-05, "loss": 0.3565, "step": 2289 }, { "epoch": 1.468890314304041, "grad_norm": 1.2580807209014893, "learning_rate": 2.5573173344761092e-05, "loss": 0.2538, "step": 2290 }, { "epoch": 1.4695317511225143, "grad_norm": 1.021989107131958, "learning_rate": 2.5562459824298267e-05, "loss": 0.2188, "step": 2291 }, { "epoch": 1.4701731879409878, "grad_norm": 1.0690616369247437, "learning_rate": 2.5551746303835445e-05, "loss": 0.2426, "step": 2292 }, { "epoch": 1.4708146247594611, "grad_norm": 1.0571658611297607, "learning_rate": 2.5541032783372617e-05, "loss": 0.2261, "step": 2293 }, { "epoch": 1.4714560615779346, "grad_norm": 1.318561315536499, "learning_rate": 2.553031926290979e-05, "loss": 0.281, "step": 2294 }, { "epoch": 1.472097498396408, "grad_norm": 1.3379822969436646, "learning_rate": 2.5519605742446966e-05, "loss": 0.2768, "step": 2295 }, { "epoch": 1.4727389352148814, "grad_norm": 1.3859676122665405, "learning_rate": 2.5508892221984144e-05, "loss": 0.272, "step": 2296 }, { "epoch": 1.4733803720333547, "grad_norm": 1.216260313987732, "learning_rate": 2.549817870152132e-05, "loss": 0.2559, "step": 2297 }, { "epoch": 1.4740218088518282, "grad_norm": 1.2865630388259888, "learning_rate": 2.5487465181058494e-05, "loss": 0.2812, "step": 2298 }, { "epoch": 1.4746632456703015, "grad_norm": 1.3303779363632202, "learning_rate": 2.5476751660595672e-05, "loss": 0.2779, "step": 2299 }, { "epoch": 1.4753046824887748, "grad_norm": 1.727504014968872, "learning_rate": 2.5466038140132847e-05, "loss": 0.4052, "step": 2300 }, { "epoch": 1.4759461193072483, "grad_norm": 1.0464887619018555, "learning_rate": 2.5455324619670025e-05, "loss": 0.2248, "step": 2301 }, { "epoch": 1.4765875561257216, "grad_norm": 1.2684516906738281, "learning_rate": 2.54446110992072e-05, "loss": 0.2647, "step": 2302 }, { "epoch": 1.477228992944195, "grad_norm": 1.2289830446243286, "learning_rate": 2.5433897578744374e-05, "loss": 0.2558, "step": 2303 }, { "epoch": 1.4778704297626684, "grad_norm": 1.230260968208313, "learning_rate": 2.5423184058281552e-05, "loss": 0.2593, "step": 2304 }, { "epoch": 1.4785118665811416, "grad_norm": 1.4922202825546265, "learning_rate": 2.5412470537818727e-05, "loss": 0.2934, "step": 2305 }, { "epoch": 1.4791533033996151, "grad_norm": 1.287206768989563, "learning_rate": 2.5401757017355905e-05, "loss": 0.2677, "step": 2306 }, { "epoch": 1.4797947402180887, "grad_norm": 1.281671166419983, "learning_rate": 2.539104349689308e-05, "loss": 0.275, "step": 2307 }, { "epoch": 1.480436177036562, "grad_norm": 1.2228806018829346, "learning_rate": 2.5380329976430255e-05, "loss": 0.2788, "step": 2308 }, { "epoch": 1.4810776138550352, "grad_norm": 1.0568877458572388, "learning_rate": 2.5369616455967433e-05, "loss": 0.2414, "step": 2309 }, { "epoch": 1.4817190506735087, "grad_norm": 1.2347508668899536, "learning_rate": 2.5358902935504608e-05, "loss": 0.2553, "step": 2310 }, { "epoch": 1.482360487491982, "grad_norm": 1.266029953956604, "learning_rate": 2.5348189415041786e-05, "loss": 0.2696, "step": 2311 }, { "epoch": 1.4830019243104555, "grad_norm": 1.2287983894348145, "learning_rate": 2.533747589457896e-05, "loss": 0.2682, "step": 2312 }, { "epoch": 1.4836433611289288, "grad_norm": 1.7159864902496338, "learning_rate": 2.5326762374116135e-05, "loss": 0.284, "step": 2313 }, { "epoch": 1.484284797947402, "grad_norm": 1.263382911682129, "learning_rate": 2.5316048853653314e-05, "loss": 0.269, "step": 2314 }, { "epoch": 1.4849262347658756, "grad_norm": 1.392594814300537, "learning_rate": 2.530533533319049e-05, "loss": 0.2959, "step": 2315 }, { "epoch": 1.4855676715843489, "grad_norm": 1.5088093280792236, "learning_rate": 2.5294621812727663e-05, "loss": 0.32, "step": 2316 }, { "epoch": 1.4862091084028224, "grad_norm": 1.3431555032730103, "learning_rate": 2.528390829226484e-05, "loss": 0.2752, "step": 2317 }, { "epoch": 1.4868505452212957, "grad_norm": 1.4995253086090088, "learning_rate": 2.5273194771802016e-05, "loss": 0.3133, "step": 2318 }, { "epoch": 1.487491982039769, "grad_norm": 1.4605430364608765, "learning_rate": 2.5262481251339194e-05, "loss": 0.2743, "step": 2319 }, { "epoch": 1.4881334188582425, "grad_norm": 1.5496649742126465, "learning_rate": 2.525176773087637e-05, "loss": 0.2982, "step": 2320 }, { "epoch": 1.488774855676716, "grad_norm": 1.4858416318893433, "learning_rate": 2.5241054210413544e-05, "loss": 0.3017, "step": 2321 }, { "epoch": 1.4894162924951893, "grad_norm": 1.4986227750778198, "learning_rate": 2.5230340689950722e-05, "loss": 0.2715, "step": 2322 }, { "epoch": 1.4900577293136625, "grad_norm": 1.179303526878357, "learning_rate": 2.5219627169487897e-05, "loss": 0.2548, "step": 2323 }, { "epoch": 1.490699166132136, "grad_norm": 1.1025294065475464, "learning_rate": 2.5208913649025075e-05, "loss": 0.25, "step": 2324 }, { "epoch": 1.4913406029506093, "grad_norm": 1.3322618007659912, "learning_rate": 2.519820012856225e-05, "loss": 0.2519, "step": 2325 }, { "epoch": 1.4919820397690828, "grad_norm": 1.2775802612304688, "learning_rate": 2.5187486608099424e-05, "loss": 0.2424, "step": 2326 }, { "epoch": 1.4926234765875561, "grad_norm": 1.3530452251434326, "learning_rate": 2.5176773087636596e-05, "loss": 0.2691, "step": 2327 }, { "epoch": 1.4932649134060294, "grad_norm": 1.1087766885757446, "learning_rate": 2.516605956717377e-05, "loss": 0.2331, "step": 2328 }, { "epoch": 1.493906350224503, "grad_norm": 1.377044439315796, "learning_rate": 2.515534604671095e-05, "loss": 0.2475, "step": 2329 }, { "epoch": 1.4945477870429762, "grad_norm": 1.1363883018493652, "learning_rate": 2.5144632526248123e-05, "loss": 0.226, "step": 2330 }, { "epoch": 1.4951892238614497, "grad_norm": 1.4305877685546875, "learning_rate": 2.51339190057853e-05, "loss": 0.2698, "step": 2331 }, { "epoch": 1.495830660679923, "grad_norm": 0.9411978125572205, "learning_rate": 2.5123205485322476e-05, "loss": 0.2089, "step": 2332 }, { "epoch": 1.4964720974983963, "grad_norm": 1.1367672681808472, "learning_rate": 2.511249196485965e-05, "loss": 0.2423, "step": 2333 }, { "epoch": 1.4971135343168698, "grad_norm": 1.1346782445907593, "learning_rate": 2.510177844439683e-05, "loss": 0.2242, "step": 2334 }, { "epoch": 1.4977549711353433, "grad_norm": 1.0825114250183105, "learning_rate": 2.5091064923934004e-05, "loss": 0.2399, "step": 2335 }, { "epoch": 1.4983964079538166, "grad_norm": 1.4169056415557861, "learning_rate": 2.5080351403471182e-05, "loss": 0.2642, "step": 2336 }, { "epoch": 1.4990378447722899, "grad_norm": 1.1332653760910034, "learning_rate": 2.5069637883008357e-05, "loss": 0.2438, "step": 2337 }, { "epoch": 1.4996792815907634, "grad_norm": 1.18496835231781, "learning_rate": 2.505892436254553e-05, "loss": 0.2734, "step": 2338 }, { "epoch": 1.5003207184092366, "grad_norm": 1.109883189201355, "learning_rate": 2.504821084208271e-05, "loss": 0.2271, "step": 2339 }, { "epoch": 1.5009621552277101, "grad_norm": 1.0836262702941895, "learning_rate": 2.5037497321619884e-05, "loss": 0.2332, "step": 2340 }, { "epoch": 1.5016035920461834, "grad_norm": 1.4917699098587036, "learning_rate": 2.5026783801157063e-05, "loss": 0.2824, "step": 2341 }, { "epoch": 1.5022450288646567, "grad_norm": 1.202085256576538, "learning_rate": 2.5016070280694237e-05, "loss": 0.2637, "step": 2342 }, { "epoch": 1.5028864656831302, "grad_norm": 1.2200053930282593, "learning_rate": 2.5005356760231412e-05, "loss": 0.2533, "step": 2343 }, { "epoch": 1.5035279025016037, "grad_norm": 1.36581552028656, "learning_rate": 2.499464323976859e-05, "loss": 0.2945, "step": 2344 }, { "epoch": 1.504169339320077, "grad_norm": 1.4156303405761719, "learning_rate": 2.4983929719305765e-05, "loss": 0.2661, "step": 2345 }, { "epoch": 1.5048107761385503, "grad_norm": 1.0732256174087524, "learning_rate": 2.497321619884294e-05, "loss": 0.2551, "step": 2346 }, { "epoch": 1.5054522129570236, "grad_norm": 1.6771773099899292, "learning_rate": 2.4962502678380118e-05, "loss": 0.3666, "step": 2347 }, { "epoch": 1.506093649775497, "grad_norm": 1.2287534475326538, "learning_rate": 2.4951789157917293e-05, "loss": 0.244, "step": 2348 }, { "epoch": 1.5067350865939706, "grad_norm": 1.20772385597229, "learning_rate": 2.494107563745447e-05, "loss": 0.2512, "step": 2349 }, { "epoch": 1.5073765234124439, "grad_norm": 1.1545172929763794, "learning_rate": 2.4930362116991646e-05, "loss": 0.2561, "step": 2350 }, { "epoch": 1.5080179602309172, "grad_norm": 1.0313873291015625, "learning_rate": 2.491964859652882e-05, "loss": 0.2283, "step": 2351 }, { "epoch": 1.5086593970493907, "grad_norm": 1.2794986963272095, "learning_rate": 2.4908935076066e-05, "loss": 0.284, "step": 2352 }, { "epoch": 1.5093008338678642, "grad_norm": 1.3042370080947876, "learning_rate": 2.489822155560317e-05, "loss": 0.2786, "step": 2353 }, { "epoch": 1.5099422706863375, "grad_norm": 1.1597899198532104, "learning_rate": 2.4887508035140348e-05, "loss": 0.2471, "step": 2354 }, { "epoch": 1.5105837075048107, "grad_norm": 1.3391515016555786, "learning_rate": 2.4876794514677523e-05, "loss": 0.2641, "step": 2355 }, { "epoch": 1.511225144323284, "grad_norm": 1.0974575281143188, "learning_rate": 2.48660809942147e-05, "loss": 0.2307, "step": 2356 }, { "epoch": 1.5118665811417575, "grad_norm": 1.311566948890686, "learning_rate": 2.4855367473751876e-05, "loss": 0.2817, "step": 2357 }, { "epoch": 1.512508017960231, "grad_norm": 1.4274152517318726, "learning_rate": 2.484465395328905e-05, "loss": 0.3119, "step": 2358 }, { "epoch": 1.5131494547787043, "grad_norm": 1.364643931388855, "learning_rate": 2.483394043282623e-05, "loss": 0.2712, "step": 2359 }, { "epoch": 1.5137908915971776, "grad_norm": 1.307924747467041, "learning_rate": 2.4823226912363403e-05, "loss": 0.3157, "step": 2360 }, { "epoch": 1.514432328415651, "grad_norm": 1.0566143989562988, "learning_rate": 2.4812513391900578e-05, "loss": 0.2541, "step": 2361 }, { "epoch": 1.5150737652341244, "grad_norm": 1.224439024925232, "learning_rate": 2.4801799871437756e-05, "loss": 0.2717, "step": 2362 }, { "epoch": 1.515715202052598, "grad_norm": 1.3031985759735107, "learning_rate": 2.479108635097493e-05, "loss": 0.2918, "step": 2363 }, { "epoch": 1.5163566388710712, "grad_norm": 0.9985074996948242, "learning_rate": 2.478037283051211e-05, "loss": 0.2244, "step": 2364 }, { "epoch": 1.5169980756895445, "grad_norm": 1.3067665100097656, "learning_rate": 2.4769659310049284e-05, "loss": 0.279, "step": 2365 }, { "epoch": 1.517639512508018, "grad_norm": 1.2711374759674072, "learning_rate": 2.475894578958646e-05, "loss": 0.2921, "step": 2366 }, { "epoch": 1.5182809493264915, "grad_norm": 1.1494756937026978, "learning_rate": 2.4748232269123637e-05, "loss": 0.255, "step": 2367 }, { "epoch": 1.5189223861449648, "grad_norm": 1.1385549306869507, "learning_rate": 2.473751874866081e-05, "loss": 0.2421, "step": 2368 }, { "epoch": 1.519563822963438, "grad_norm": 1.0137079954147339, "learning_rate": 2.472680522819799e-05, "loss": 0.2443, "step": 2369 }, { "epoch": 1.5202052597819113, "grad_norm": 1.415663480758667, "learning_rate": 2.471609170773516e-05, "loss": 0.2871, "step": 2370 }, { "epoch": 1.5208466966003849, "grad_norm": 1.406309723854065, "learning_rate": 2.470537818727234e-05, "loss": 0.3162, "step": 2371 }, { "epoch": 1.5214881334188584, "grad_norm": 1.2531847953796387, "learning_rate": 2.4694664666809514e-05, "loss": 0.2953, "step": 2372 }, { "epoch": 1.5221295702373316, "grad_norm": 1.274299144744873, "learning_rate": 2.468395114634669e-05, "loss": 0.308, "step": 2373 }, { "epoch": 1.522771007055805, "grad_norm": 1.0672626495361328, "learning_rate": 2.4673237625883867e-05, "loss": 0.2287, "step": 2374 }, { "epoch": 1.5234124438742784, "grad_norm": 1.0836455821990967, "learning_rate": 2.4662524105421042e-05, "loss": 0.2309, "step": 2375 }, { "epoch": 1.5240538806927517, "grad_norm": 1.4732357263565063, "learning_rate": 2.4651810584958216e-05, "loss": 0.3181, "step": 2376 }, { "epoch": 1.5246953175112252, "grad_norm": 1.1583256721496582, "learning_rate": 2.4641097064495395e-05, "loss": 0.2375, "step": 2377 }, { "epoch": 1.5253367543296985, "grad_norm": 1.1352401971817017, "learning_rate": 2.463038354403257e-05, "loss": 0.264, "step": 2378 }, { "epoch": 1.5259781911481718, "grad_norm": 1.3394309282302856, "learning_rate": 2.4619670023569748e-05, "loss": 0.2754, "step": 2379 }, { "epoch": 1.5266196279666453, "grad_norm": 1.0392855405807495, "learning_rate": 2.4608956503106922e-05, "loss": 0.234, "step": 2380 }, { "epoch": 1.5272610647851188, "grad_norm": 1.2706998586654663, "learning_rate": 2.4598242982644097e-05, "loss": 0.2562, "step": 2381 }, { "epoch": 1.527902501603592, "grad_norm": 1.3703906536102295, "learning_rate": 2.4587529462181275e-05, "loss": 0.2649, "step": 2382 }, { "epoch": 1.5285439384220654, "grad_norm": 1.2921674251556396, "learning_rate": 2.457681594171845e-05, "loss": 0.2484, "step": 2383 }, { "epoch": 1.5291853752405387, "grad_norm": 1.4006940126419067, "learning_rate": 2.4566102421255628e-05, "loss": 0.2692, "step": 2384 }, { "epoch": 1.5298268120590122, "grad_norm": 1.3575142621994019, "learning_rate": 2.4555388900792803e-05, "loss": 0.2528, "step": 2385 }, { "epoch": 1.5304682488774857, "grad_norm": 1.0644749402999878, "learning_rate": 2.4544675380329978e-05, "loss": 0.2359, "step": 2386 }, { "epoch": 1.531109685695959, "grad_norm": 1.3102880716323853, "learning_rate": 2.4533961859867152e-05, "loss": 0.2948, "step": 2387 }, { "epoch": 1.5317511225144322, "grad_norm": 1.187801480293274, "learning_rate": 2.4523248339404327e-05, "loss": 0.2596, "step": 2388 }, { "epoch": 1.5323925593329057, "grad_norm": 1.460214376449585, "learning_rate": 2.4512534818941505e-05, "loss": 0.3143, "step": 2389 }, { "epoch": 1.5330339961513793, "grad_norm": 1.5014771223068237, "learning_rate": 2.450182129847868e-05, "loss": 0.2695, "step": 2390 }, { "epoch": 1.5336754329698525, "grad_norm": 1.0946794748306274, "learning_rate": 2.4491107778015858e-05, "loss": 0.251, "step": 2391 }, { "epoch": 1.5343168697883258, "grad_norm": 1.1205180883407593, "learning_rate": 2.4480394257553033e-05, "loss": 0.2665, "step": 2392 }, { "epoch": 1.534958306606799, "grad_norm": 1.3233356475830078, "learning_rate": 2.4469680737090208e-05, "loss": 0.2895, "step": 2393 }, { "epoch": 1.5355997434252726, "grad_norm": 0.8933968544006348, "learning_rate": 2.4458967216627386e-05, "loss": 0.2138, "step": 2394 }, { "epoch": 1.5362411802437461, "grad_norm": 1.144021987915039, "learning_rate": 2.444825369616456e-05, "loss": 0.2551, "step": 2395 }, { "epoch": 1.5368826170622194, "grad_norm": 1.3601717948913574, "learning_rate": 2.4437540175701735e-05, "loss": 0.3095, "step": 2396 }, { "epoch": 1.5375240538806927, "grad_norm": 1.2905144691467285, "learning_rate": 2.4426826655238914e-05, "loss": 0.2433, "step": 2397 }, { "epoch": 1.538165490699166, "grad_norm": 1.4520076513290405, "learning_rate": 2.441611313477609e-05, "loss": 0.3221, "step": 2398 }, { "epoch": 1.5388069275176395, "grad_norm": 1.6854298114776611, "learning_rate": 2.4405399614313266e-05, "loss": 0.3662, "step": 2399 }, { "epoch": 1.539448364336113, "grad_norm": 1.0940606594085693, "learning_rate": 2.439468609385044e-05, "loss": 0.2319, "step": 2400 }, { "epoch": 1.5400898011545863, "grad_norm": 1.377801775932312, "learning_rate": 2.4383972573387616e-05, "loss": 0.304, "step": 2401 }, { "epoch": 1.5407312379730596, "grad_norm": 1.30165433883667, "learning_rate": 2.4373259052924794e-05, "loss": 0.2514, "step": 2402 }, { "epoch": 1.541372674791533, "grad_norm": 1.2579777240753174, "learning_rate": 2.436254553246197e-05, "loss": 0.2706, "step": 2403 }, { "epoch": 1.5420141116100066, "grad_norm": 1.1190359592437744, "learning_rate": 2.4351832011999144e-05, "loss": 0.2354, "step": 2404 }, { "epoch": 1.5426555484284799, "grad_norm": 1.5603482723236084, "learning_rate": 2.434111849153632e-05, "loss": 0.3415, "step": 2405 }, { "epoch": 1.5432969852469531, "grad_norm": 1.4472782611846924, "learning_rate": 2.4330404971073497e-05, "loss": 0.2614, "step": 2406 }, { "epoch": 1.5439384220654264, "grad_norm": 1.2711172103881836, "learning_rate": 2.431969145061067e-05, "loss": 0.2448, "step": 2407 }, { "epoch": 1.5445798588839, "grad_norm": 1.2676677703857422, "learning_rate": 2.4308977930147846e-05, "loss": 0.2604, "step": 2408 }, { "epoch": 1.5452212957023734, "grad_norm": 1.4261342287063599, "learning_rate": 2.4298264409685024e-05, "loss": 0.3338, "step": 2409 }, { "epoch": 1.5458627325208467, "grad_norm": 1.1475963592529297, "learning_rate": 2.42875508892222e-05, "loss": 0.2523, "step": 2410 }, { "epoch": 1.54650416933932, "grad_norm": 1.4019837379455566, "learning_rate": 2.4276837368759374e-05, "loss": 0.2931, "step": 2411 }, { "epoch": 1.5471456061577935, "grad_norm": 1.348872423171997, "learning_rate": 2.4266123848296552e-05, "loss": 0.313, "step": 2412 }, { "epoch": 1.5477870429762668, "grad_norm": 1.5809942483901978, "learning_rate": 2.4255410327833727e-05, "loss": 0.3118, "step": 2413 }, { "epoch": 1.5484284797947403, "grad_norm": 1.1729729175567627, "learning_rate": 2.4244696807370905e-05, "loss": 0.2447, "step": 2414 }, { "epoch": 1.5490699166132136, "grad_norm": 1.395987868309021, "learning_rate": 2.423398328690808e-05, "loss": 0.2679, "step": 2415 }, { "epoch": 1.5497113534316869, "grad_norm": 1.3556349277496338, "learning_rate": 2.4223269766445254e-05, "loss": 0.271, "step": 2416 }, { "epoch": 1.5503527902501604, "grad_norm": 1.6018621921539307, "learning_rate": 2.4212556245982432e-05, "loss": 0.3043, "step": 2417 }, { "epoch": 1.5509942270686339, "grad_norm": 1.1808087825775146, "learning_rate": 2.4201842725519607e-05, "loss": 0.2371, "step": 2418 }, { "epoch": 1.5516356638871072, "grad_norm": 1.3515527248382568, "learning_rate": 2.4191129205056785e-05, "loss": 0.2667, "step": 2419 }, { "epoch": 1.5522771007055804, "grad_norm": 1.5394021272659302, "learning_rate": 2.418041568459396e-05, "loss": 0.3007, "step": 2420 }, { "epoch": 1.5529185375240537, "grad_norm": 1.5959302186965942, "learning_rate": 2.4169702164131135e-05, "loss": 0.295, "step": 2421 }, { "epoch": 1.5535599743425272, "grad_norm": 1.2859485149383545, "learning_rate": 2.415898864366831e-05, "loss": 0.2796, "step": 2422 }, { "epoch": 1.5542014111610007, "grad_norm": 2.0094032287597656, "learning_rate": 2.4148275123205484e-05, "loss": 0.4166, "step": 2423 }, { "epoch": 1.554842847979474, "grad_norm": 1.526092767715454, "learning_rate": 2.4137561602742663e-05, "loss": 0.3023, "step": 2424 }, { "epoch": 1.5554842847979473, "grad_norm": 1.7267414331436157, "learning_rate": 2.4126848082279837e-05, "loss": 0.415, "step": 2425 }, { "epoch": 1.5561257216164208, "grad_norm": 1.1671819686889648, "learning_rate": 2.4116134561817012e-05, "loss": 0.2452, "step": 2426 }, { "epoch": 1.5567671584348943, "grad_norm": 1.5673115253448486, "learning_rate": 2.410542104135419e-05, "loss": 0.3196, "step": 2427 }, { "epoch": 1.5574085952533676, "grad_norm": 1.166937232017517, "learning_rate": 2.4094707520891365e-05, "loss": 0.2445, "step": 2428 }, { "epoch": 1.558050032071841, "grad_norm": 1.282053828239441, "learning_rate": 2.4083994000428543e-05, "loss": 0.2806, "step": 2429 }, { "epoch": 1.5586914688903142, "grad_norm": 1.3850072622299194, "learning_rate": 2.4073280479965718e-05, "loss": 0.2743, "step": 2430 }, { "epoch": 1.5593329057087877, "grad_norm": 1.5069936513900757, "learning_rate": 2.4062566959502893e-05, "loss": 0.3733, "step": 2431 }, { "epoch": 1.5599743425272612, "grad_norm": 1.2427464723587036, "learning_rate": 2.405185343904007e-05, "loss": 0.2794, "step": 2432 }, { "epoch": 1.5606157793457345, "grad_norm": 1.3704959154129028, "learning_rate": 2.4041139918577246e-05, "loss": 0.2857, "step": 2433 }, { "epoch": 1.5612572161642078, "grad_norm": 1.3919812440872192, "learning_rate": 2.4030426398114424e-05, "loss": 0.2898, "step": 2434 }, { "epoch": 1.561898652982681, "grad_norm": 0.9056454300880432, "learning_rate": 2.40197128776516e-05, "loss": 0.2106, "step": 2435 }, { "epoch": 1.5625400898011546, "grad_norm": 1.2892332077026367, "learning_rate": 2.4008999357188773e-05, "loss": 0.293, "step": 2436 }, { "epoch": 1.563181526619628, "grad_norm": 1.2682660818099976, "learning_rate": 2.399828583672595e-05, "loss": 0.2767, "step": 2437 }, { "epoch": 1.5638229634381013, "grad_norm": 1.24415922164917, "learning_rate": 2.3987572316263123e-05, "loss": 0.2667, "step": 2438 }, { "epoch": 1.5644644002565746, "grad_norm": 1.3478344678878784, "learning_rate": 2.39768587958003e-05, "loss": 0.2892, "step": 2439 }, { "epoch": 1.5651058370750481, "grad_norm": 1.1209337711334229, "learning_rate": 2.3966145275337476e-05, "loss": 0.2195, "step": 2440 }, { "epoch": 1.5657472738935216, "grad_norm": 0.878271758556366, "learning_rate": 2.395543175487465e-05, "loss": 0.2054, "step": 2441 }, { "epoch": 1.566388710711995, "grad_norm": 1.0877175331115723, "learning_rate": 2.394471823441183e-05, "loss": 0.2246, "step": 2442 }, { "epoch": 1.5670301475304682, "grad_norm": 1.2926744222640991, "learning_rate": 2.3934004713949003e-05, "loss": 0.2469, "step": 2443 }, { "epoch": 1.5676715843489415, "grad_norm": 1.4005615711212158, "learning_rate": 2.392329119348618e-05, "loss": 0.2986, "step": 2444 }, { "epoch": 1.568313021167415, "grad_norm": 1.2751572132110596, "learning_rate": 2.3912577673023356e-05, "loss": 0.2836, "step": 2445 }, { "epoch": 1.5689544579858885, "grad_norm": 1.031894326210022, "learning_rate": 2.390186415256053e-05, "loss": 0.2258, "step": 2446 }, { "epoch": 1.5695958948043618, "grad_norm": 1.525549054145813, "learning_rate": 2.389115063209771e-05, "loss": 0.3441, "step": 2447 }, { "epoch": 1.570237331622835, "grad_norm": 1.2118643522262573, "learning_rate": 2.3880437111634884e-05, "loss": 0.2488, "step": 2448 }, { "epoch": 1.5708787684413086, "grad_norm": 1.198703646659851, "learning_rate": 2.3869723591172062e-05, "loss": 0.242, "step": 2449 }, { "epoch": 1.5715202052597819, "grad_norm": 1.2587227821350098, "learning_rate": 2.3859010070709237e-05, "loss": 0.2435, "step": 2450 }, { "epoch": 1.5721616420782554, "grad_norm": 1.3293280601501465, "learning_rate": 2.384829655024641e-05, "loss": 0.2774, "step": 2451 }, { "epoch": 1.5728030788967287, "grad_norm": 1.1196831464767456, "learning_rate": 2.383758302978359e-05, "loss": 0.2289, "step": 2452 }, { "epoch": 1.573444515715202, "grad_norm": 1.5171927213668823, "learning_rate": 2.3826869509320765e-05, "loss": 0.3182, "step": 2453 }, { "epoch": 1.5740859525336754, "grad_norm": 1.4992424249649048, "learning_rate": 2.3816155988857943e-05, "loss": 0.264, "step": 2454 }, { "epoch": 1.574727389352149, "grad_norm": 1.257247805595398, "learning_rate": 2.3805442468395117e-05, "loss": 0.2306, "step": 2455 }, { "epoch": 1.5753688261706222, "grad_norm": 1.305217981338501, "learning_rate": 2.379472894793229e-05, "loss": 0.2668, "step": 2456 }, { "epoch": 1.5760102629890955, "grad_norm": 1.2631564140319824, "learning_rate": 2.3784015427469467e-05, "loss": 0.2368, "step": 2457 }, { "epoch": 1.5766516998075688, "grad_norm": 1.5415985584259033, "learning_rate": 2.3773301907006642e-05, "loss": 0.2758, "step": 2458 }, { "epoch": 1.5772931366260423, "grad_norm": 1.348375678062439, "learning_rate": 2.376258838654382e-05, "loss": 0.2646, "step": 2459 }, { "epoch": 1.5779345734445158, "grad_norm": 1.1104750633239746, "learning_rate": 2.3751874866080995e-05, "loss": 0.226, "step": 2460 }, { "epoch": 1.578576010262989, "grad_norm": 1.3757448196411133, "learning_rate": 2.374116134561817e-05, "loss": 0.3007, "step": 2461 }, { "epoch": 1.5792174470814624, "grad_norm": 1.4881956577301025, "learning_rate": 2.3730447825155348e-05, "loss": 0.3067, "step": 2462 }, { "epoch": 1.579858883899936, "grad_norm": 1.174572229385376, "learning_rate": 2.3719734304692522e-05, "loss": 0.2452, "step": 2463 }, { "epoch": 1.5805003207184094, "grad_norm": 1.2516145706176758, "learning_rate": 2.37090207842297e-05, "loss": 0.248, "step": 2464 }, { "epoch": 1.5811417575368827, "grad_norm": 1.355280876159668, "learning_rate": 2.3698307263766875e-05, "loss": 0.3138, "step": 2465 }, { "epoch": 1.581783194355356, "grad_norm": 1.3768943548202515, "learning_rate": 2.368759374330405e-05, "loss": 0.2974, "step": 2466 }, { "epoch": 1.5824246311738293, "grad_norm": 1.2008296251296997, "learning_rate": 2.3676880222841228e-05, "loss": 0.2554, "step": 2467 }, { "epoch": 1.5830660679923028, "grad_norm": 1.2071962356567383, "learning_rate": 2.3666166702378403e-05, "loss": 0.2539, "step": 2468 }, { "epoch": 1.5837075048107763, "grad_norm": 1.329349160194397, "learning_rate": 2.365545318191558e-05, "loss": 0.2706, "step": 2469 }, { "epoch": 1.5843489416292496, "grad_norm": 1.2981741428375244, "learning_rate": 2.3644739661452756e-05, "loss": 0.2627, "step": 2470 }, { "epoch": 1.5849903784477228, "grad_norm": 1.0677367448806763, "learning_rate": 2.363402614098993e-05, "loss": 0.2313, "step": 2471 }, { "epoch": 1.5856318152661961, "grad_norm": 1.2531168460845947, "learning_rate": 2.362331262052711e-05, "loss": 0.2822, "step": 2472 }, { "epoch": 1.5862732520846696, "grad_norm": 1.1295192241668701, "learning_rate": 2.361259910006428e-05, "loss": 0.2479, "step": 2473 }, { "epoch": 1.5869146889031431, "grad_norm": 1.180370569229126, "learning_rate": 2.3601885579601458e-05, "loss": 0.2582, "step": 2474 }, { "epoch": 1.5875561257216164, "grad_norm": 1.2340995073318481, "learning_rate": 2.3591172059138633e-05, "loss": 0.2634, "step": 2475 }, { "epoch": 1.5881975625400897, "grad_norm": 0.916256308555603, "learning_rate": 2.3580458538675808e-05, "loss": 0.2165, "step": 2476 }, { "epoch": 1.5888389993585632, "grad_norm": 1.0177812576293945, "learning_rate": 2.3569745018212986e-05, "loss": 0.2357, "step": 2477 }, { "epoch": 1.5894804361770367, "grad_norm": 1.448468565940857, "learning_rate": 2.355903149775016e-05, "loss": 0.3281, "step": 2478 }, { "epoch": 1.59012187299551, "grad_norm": 1.4242156744003296, "learning_rate": 2.354831797728734e-05, "loss": 0.2964, "step": 2479 }, { "epoch": 1.5907633098139833, "grad_norm": 1.100259780883789, "learning_rate": 2.3537604456824514e-05, "loss": 0.2243, "step": 2480 }, { "epoch": 1.5914047466324566, "grad_norm": 1.4607356786727905, "learning_rate": 2.352689093636169e-05, "loss": 0.2917, "step": 2481 }, { "epoch": 1.59204618345093, "grad_norm": 1.3370305299758911, "learning_rate": 2.3516177415898866e-05, "loss": 0.266, "step": 2482 }, { "epoch": 1.5926876202694036, "grad_norm": 1.139918327331543, "learning_rate": 2.350546389543604e-05, "loss": 0.2333, "step": 2483 }, { "epoch": 1.5933290570878769, "grad_norm": 1.3469274044036865, "learning_rate": 2.349475037497322e-05, "loss": 0.293, "step": 2484 }, { "epoch": 1.5939704939063501, "grad_norm": 1.113419532775879, "learning_rate": 2.3484036854510394e-05, "loss": 0.2144, "step": 2485 }, { "epoch": 1.5946119307248237, "grad_norm": 1.535705804824829, "learning_rate": 2.347332333404757e-05, "loss": 0.3297, "step": 2486 }, { "epoch": 1.595253367543297, "grad_norm": 1.1610609292984009, "learning_rate": 2.3462609813584747e-05, "loss": 0.2618, "step": 2487 }, { "epoch": 1.5958948043617704, "grad_norm": 1.142659306526184, "learning_rate": 2.3451896293121922e-05, "loss": 0.2255, "step": 2488 }, { "epoch": 1.5965362411802437, "grad_norm": 1.1576015949249268, "learning_rate": 2.3441182772659097e-05, "loss": 0.2423, "step": 2489 }, { "epoch": 1.597177677998717, "grad_norm": 1.5339754819869995, "learning_rate": 2.343046925219627e-05, "loss": 0.2986, "step": 2490 }, { "epoch": 1.5978191148171905, "grad_norm": 1.1553672552108765, "learning_rate": 2.3419755731733446e-05, "loss": 0.2633, "step": 2491 }, { "epoch": 1.598460551635664, "grad_norm": 1.0889861583709717, "learning_rate": 2.3409042211270624e-05, "loss": 0.2364, "step": 2492 }, { "epoch": 1.5991019884541373, "grad_norm": 1.388624668121338, "learning_rate": 2.33983286908078e-05, "loss": 0.2412, "step": 2493 }, { "epoch": 1.5997434252726106, "grad_norm": 1.2673702239990234, "learning_rate": 2.3387615170344977e-05, "loss": 0.2621, "step": 2494 }, { "epoch": 1.6003848620910839, "grad_norm": 1.0929465293884277, "learning_rate": 2.3376901649882152e-05, "loss": 0.2252, "step": 2495 }, { "epoch": 1.6010262989095574, "grad_norm": 1.0294771194458008, "learning_rate": 2.3366188129419327e-05, "loss": 0.2369, "step": 2496 }, { "epoch": 1.601667735728031, "grad_norm": 1.5844817161560059, "learning_rate": 2.3355474608956505e-05, "loss": 0.2754, "step": 2497 }, { "epoch": 1.6023091725465042, "grad_norm": 1.4331544637680054, "learning_rate": 2.334476108849368e-05, "loss": 0.2644, "step": 2498 }, { "epoch": 1.6029506093649775, "grad_norm": 1.2858463525772095, "learning_rate": 2.3334047568030858e-05, "loss": 0.2913, "step": 2499 }, { "epoch": 1.603592046183451, "grad_norm": 1.5781077146530151, "learning_rate": 2.3323334047568032e-05, "loss": 0.3459, "step": 2500 }, { "epoch": 1.6042334830019245, "grad_norm": 1.3052608966827393, "learning_rate": 2.3312620527105207e-05, "loss": 0.3043, "step": 2501 }, { "epoch": 1.6048749198203978, "grad_norm": 1.118510365486145, "learning_rate": 2.3301907006642385e-05, "loss": 0.2236, "step": 2502 }, { "epoch": 1.605516356638871, "grad_norm": 1.19033944606781, "learning_rate": 2.329119348617956e-05, "loss": 0.2541, "step": 2503 }, { "epoch": 1.6061577934573443, "grad_norm": 1.3869961500167847, "learning_rate": 2.3280479965716738e-05, "loss": 0.2451, "step": 2504 }, { "epoch": 1.6067992302758178, "grad_norm": 1.3834943771362305, "learning_rate": 2.3269766445253913e-05, "loss": 0.3058, "step": 2505 }, { "epoch": 1.6074406670942913, "grad_norm": 1.003682255744934, "learning_rate": 2.3259052924791088e-05, "loss": 0.2327, "step": 2506 }, { "epoch": 1.6080821039127646, "grad_norm": 1.53524649143219, "learning_rate": 2.3248339404328263e-05, "loss": 0.284, "step": 2507 }, { "epoch": 1.608723540731238, "grad_norm": 1.223949670791626, "learning_rate": 2.3237625883865437e-05, "loss": 0.2463, "step": 2508 }, { "epoch": 1.6093649775497112, "grad_norm": 1.2338857650756836, "learning_rate": 2.3226912363402615e-05, "loss": 0.2432, "step": 2509 }, { "epoch": 1.6100064143681847, "grad_norm": 1.5836271047592163, "learning_rate": 2.321619884293979e-05, "loss": 0.3054, "step": 2510 }, { "epoch": 1.6106478511866582, "grad_norm": 1.1855084896087646, "learning_rate": 2.3205485322476965e-05, "loss": 0.245, "step": 2511 }, { "epoch": 1.6112892880051315, "grad_norm": 1.4716438055038452, "learning_rate": 2.3194771802014143e-05, "loss": 0.3042, "step": 2512 }, { "epoch": 1.6119307248236048, "grad_norm": 0.9869175553321838, "learning_rate": 2.3184058281551318e-05, "loss": 0.2256, "step": 2513 }, { "epoch": 1.6125721616420783, "grad_norm": 1.1336106061935425, "learning_rate": 2.3173344761088496e-05, "loss": 0.2405, "step": 2514 }, { "epoch": 1.6132135984605518, "grad_norm": 1.2867814302444458, "learning_rate": 2.316263124062567e-05, "loss": 0.2691, "step": 2515 }, { "epoch": 1.613855035279025, "grad_norm": 1.4247851371765137, "learning_rate": 2.3151917720162846e-05, "loss": 0.3143, "step": 2516 }, { "epoch": 1.6144964720974984, "grad_norm": 1.2593646049499512, "learning_rate": 2.3141204199700024e-05, "loss": 0.2586, "step": 2517 }, { "epoch": 1.6151379089159716, "grad_norm": 1.2690672874450684, "learning_rate": 2.31304906792372e-05, "loss": 0.2568, "step": 2518 }, { "epoch": 1.6157793457344451, "grad_norm": 1.4977666139602661, "learning_rate": 2.3119777158774377e-05, "loss": 0.301, "step": 2519 }, { "epoch": 1.6164207825529187, "grad_norm": 1.3779160976409912, "learning_rate": 2.310906363831155e-05, "loss": 0.2989, "step": 2520 }, { "epoch": 1.617062219371392, "grad_norm": 1.2333025932312012, "learning_rate": 2.3098350117848726e-05, "loss": 0.2817, "step": 2521 }, { "epoch": 1.6177036561898652, "grad_norm": 1.2805213928222656, "learning_rate": 2.3087636597385904e-05, "loss": 0.2533, "step": 2522 }, { "epoch": 1.6183450930083387, "grad_norm": 1.164289116859436, "learning_rate": 2.307692307692308e-05, "loss": 0.2548, "step": 2523 }, { "epoch": 1.618986529826812, "grad_norm": 1.272743582725525, "learning_rate": 2.3066209556460254e-05, "loss": 0.2598, "step": 2524 }, { "epoch": 1.6196279666452855, "grad_norm": 1.4133312702178955, "learning_rate": 2.305549603599743e-05, "loss": 0.2536, "step": 2525 }, { "epoch": 1.6202694034637588, "grad_norm": 1.408881664276123, "learning_rate": 2.3044782515534603e-05, "loss": 0.2679, "step": 2526 }, { "epoch": 1.620910840282232, "grad_norm": 1.102033019065857, "learning_rate": 2.303406899507178e-05, "loss": 0.2498, "step": 2527 }, { "epoch": 1.6215522771007056, "grad_norm": 0.9831531643867493, "learning_rate": 2.3023355474608956e-05, "loss": 0.2126, "step": 2528 }, { "epoch": 1.622193713919179, "grad_norm": 1.345276117324829, "learning_rate": 2.3012641954146134e-05, "loss": 0.3035, "step": 2529 }, { "epoch": 1.6228351507376524, "grad_norm": 1.1808055639266968, "learning_rate": 2.300192843368331e-05, "loss": 0.2717, "step": 2530 }, { "epoch": 1.6234765875561257, "grad_norm": 1.155967116355896, "learning_rate": 2.2991214913220484e-05, "loss": 0.2376, "step": 2531 }, { "epoch": 1.624118024374599, "grad_norm": 1.0476953983306885, "learning_rate": 2.2980501392757662e-05, "loss": 0.2145, "step": 2532 }, { "epoch": 1.6247594611930725, "grad_norm": 1.5565588474273682, "learning_rate": 2.2969787872294837e-05, "loss": 0.2917, "step": 2533 }, { "epoch": 1.625400898011546, "grad_norm": 1.177314043045044, "learning_rate": 2.2959074351832015e-05, "loss": 0.2279, "step": 2534 }, { "epoch": 1.6260423348300193, "grad_norm": 1.3117473125457764, "learning_rate": 2.294836083136919e-05, "loss": 0.2954, "step": 2535 }, { "epoch": 1.6266837716484925, "grad_norm": 1.1560354232788086, "learning_rate": 2.2937647310906365e-05, "loss": 0.2524, "step": 2536 }, { "epoch": 1.627325208466966, "grad_norm": 0.9520967602729797, "learning_rate": 2.2926933790443543e-05, "loss": 0.2108, "step": 2537 }, { "epoch": 1.6279666452854393, "grad_norm": 1.423876166343689, "learning_rate": 2.2916220269980717e-05, "loss": 0.3112, "step": 2538 }, { "epoch": 1.6286080821039128, "grad_norm": 1.3226604461669922, "learning_rate": 2.2905506749517892e-05, "loss": 0.2707, "step": 2539 }, { "epoch": 1.6292495189223861, "grad_norm": 1.4093023538589478, "learning_rate": 2.289479322905507e-05, "loss": 0.2859, "step": 2540 }, { "epoch": 1.6298909557408594, "grad_norm": 1.3509314060211182, "learning_rate": 2.2884079708592242e-05, "loss": 0.2722, "step": 2541 }, { "epoch": 1.630532392559333, "grad_norm": 1.4042487144470215, "learning_rate": 2.287336618812942e-05, "loss": 0.2897, "step": 2542 }, { "epoch": 1.6311738293778064, "grad_norm": 1.2699946165084839, "learning_rate": 2.2862652667666595e-05, "loss": 0.248, "step": 2543 }, { "epoch": 1.6318152661962797, "grad_norm": 1.1097908020019531, "learning_rate": 2.2851939147203773e-05, "loss": 0.2229, "step": 2544 }, { "epoch": 1.632456703014753, "grad_norm": 1.0013399124145508, "learning_rate": 2.2841225626740948e-05, "loss": 0.2264, "step": 2545 }, { "epoch": 1.6330981398332263, "grad_norm": 1.3209227323532104, "learning_rate": 2.2830512106278122e-05, "loss": 0.3071, "step": 2546 }, { "epoch": 1.6337395766516998, "grad_norm": 1.2631863355636597, "learning_rate": 2.28197985858153e-05, "loss": 0.2621, "step": 2547 }, { "epoch": 1.6343810134701733, "grad_norm": 1.2319633960723877, "learning_rate": 2.2809085065352475e-05, "loss": 0.2769, "step": 2548 }, { "epoch": 1.6350224502886466, "grad_norm": 1.566948652267456, "learning_rate": 2.2798371544889653e-05, "loss": 0.3532, "step": 2549 }, { "epoch": 1.6356638871071199, "grad_norm": 1.271995186805725, "learning_rate": 2.2787658024426828e-05, "loss": 0.2804, "step": 2550 }, { "epoch": 1.6363053239255934, "grad_norm": 1.406042218208313, "learning_rate": 2.2776944503964003e-05, "loss": 0.2984, "step": 2551 }, { "epoch": 1.6369467607440669, "grad_norm": 1.5563223361968994, "learning_rate": 2.276623098350118e-05, "loss": 0.3411, "step": 2552 }, { "epoch": 1.6375881975625401, "grad_norm": 1.1209940910339355, "learning_rate": 2.2755517463038356e-05, "loss": 0.238, "step": 2553 }, { "epoch": 1.6382296343810134, "grad_norm": 1.2614922523498535, "learning_rate": 2.274480394257553e-05, "loss": 0.2617, "step": 2554 }, { "epoch": 1.6388710711994867, "grad_norm": 0.9150387644767761, "learning_rate": 2.273409042211271e-05, "loss": 0.216, "step": 2555 }, { "epoch": 1.6395125080179602, "grad_norm": 1.0650540590286255, "learning_rate": 2.2723376901649883e-05, "loss": 0.2326, "step": 2556 }, { "epoch": 1.6401539448364337, "grad_norm": 1.2188881635665894, "learning_rate": 2.271266338118706e-05, "loss": 0.2723, "step": 2557 }, { "epoch": 1.640795381654907, "grad_norm": 1.554589867591858, "learning_rate": 2.2701949860724233e-05, "loss": 0.3233, "step": 2558 }, { "epoch": 1.6414368184733803, "grad_norm": 1.27902352809906, "learning_rate": 2.269123634026141e-05, "loss": 0.2483, "step": 2559 }, { "epoch": 1.6420782552918538, "grad_norm": 1.0888116359710693, "learning_rate": 2.2680522819798586e-05, "loss": 0.2201, "step": 2560 }, { "epoch": 1.642719692110327, "grad_norm": 1.3450220823287964, "learning_rate": 2.266980929933576e-05, "loss": 0.2905, "step": 2561 }, { "epoch": 1.6433611289288006, "grad_norm": 1.4311708211898804, "learning_rate": 2.265909577887294e-05, "loss": 0.2881, "step": 2562 }, { "epoch": 1.6440025657472739, "grad_norm": 1.1611278057098389, "learning_rate": 2.2648382258410114e-05, "loss": 0.2348, "step": 2563 }, { "epoch": 1.6446440025657472, "grad_norm": 1.4181629419326782, "learning_rate": 2.263766873794729e-05, "loss": 0.3078, "step": 2564 }, { "epoch": 1.6452854393842207, "grad_norm": 1.2469449043273926, "learning_rate": 2.2626955217484466e-05, "loss": 0.2417, "step": 2565 }, { "epoch": 1.6459268762026942, "grad_norm": 1.5264776945114136, "learning_rate": 2.261624169702164e-05, "loss": 0.3048, "step": 2566 }, { "epoch": 1.6465683130211675, "grad_norm": 0.9686952233314514, "learning_rate": 2.260552817655882e-05, "loss": 0.2359, "step": 2567 }, { "epoch": 1.6472097498396407, "grad_norm": 1.0187535285949707, "learning_rate": 2.2594814656095994e-05, "loss": 0.263, "step": 2568 }, { "epoch": 1.647851186658114, "grad_norm": 1.1646547317504883, "learning_rate": 2.258410113563317e-05, "loss": 0.2311, "step": 2569 }, { "epoch": 1.6484926234765875, "grad_norm": 1.051867961883545, "learning_rate": 2.2573387615170347e-05, "loss": 0.2312, "step": 2570 }, { "epoch": 1.649134060295061, "grad_norm": 1.450859546661377, "learning_rate": 2.2562674094707522e-05, "loss": 0.2585, "step": 2571 }, { "epoch": 1.6497754971135343, "grad_norm": 1.08970046043396, "learning_rate": 2.25519605742447e-05, "loss": 0.2344, "step": 2572 }, { "epoch": 1.6504169339320076, "grad_norm": 1.103904366493225, "learning_rate": 2.2541247053781875e-05, "loss": 0.2314, "step": 2573 }, { "epoch": 1.6510583707504811, "grad_norm": 1.2255008220672607, "learning_rate": 2.253053353331905e-05, "loss": 0.26, "step": 2574 }, { "epoch": 1.6516998075689544, "grad_norm": 1.5432219505310059, "learning_rate": 2.2519820012856228e-05, "loss": 0.3317, "step": 2575 }, { "epoch": 1.652341244387428, "grad_norm": 1.0306991338729858, "learning_rate": 2.25091064923934e-05, "loss": 0.2359, "step": 2576 }, { "epoch": 1.6529826812059012, "grad_norm": 1.573811650276184, "learning_rate": 2.2498392971930577e-05, "loss": 0.3235, "step": 2577 }, { "epoch": 1.6536241180243745, "grad_norm": 1.159751296043396, "learning_rate": 2.2487679451467752e-05, "loss": 0.2363, "step": 2578 }, { "epoch": 1.654265554842848, "grad_norm": 1.1524370908737183, "learning_rate": 2.247696593100493e-05, "loss": 0.2373, "step": 2579 }, { "epoch": 1.6549069916613215, "grad_norm": 1.3138492107391357, "learning_rate": 2.2466252410542105e-05, "loss": 0.2745, "step": 2580 }, { "epoch": 1.6555484284797948, "grad_norm": 1.1715203523635864, "learning_rate": 2.245553889007928e-05, "loss": 0.2378, "step": 2581 }, { "epoch": 1.656189865298268, "grad_norm": 1.1913843154907227, "learning_rate": 2.2444825369616458e-05, "loss": 0.2398, "step": 2582 }, { "epoch": 1.6568313021167413, "grad_norm": 1.6138267517089844, "learning_rate": 2.2434111849153632e-05, "loss": 0.3429, "step": 2583 }, { "epoch": 1.6574727389352149, "grad_norm": 1.510577917098999, "learning_rate": 2.242339832869081e-05, "loss": 0.2742, "step": 2584 }, { "epoch": 1.6581141757536884, "grad_norm": 1.2102668285369873, "learning_rate": 2.2412684808227985e-05, "loss": 0.248, "step": 2585 }, { "epoch": 1.6587556125721616, "grad_norm": 1.348606824874878, "learning_rate": 2.240197128776516e-05, "loss": 0.2924, "step": 2586 }, { "epoch": 1.659397049390635, "grad_norm": 1.3920377492904663, "learning_rate": 2.2391257767302338e-05, "loss": 0.2639, "step": 2587 }, { "epoch": 1.6600384862091084, "grad_norm": 1.4148494005203247, "learning_rate": 2.2380544246839513e-05, "loss": 0.2829, "step": 2588 }, { "epoch": 1.660679923027582, "grad_norm": 1.2239763736724854, "learning_rate": 2.2369830726376688e-05, "loss": 0.2613, "step": 2589 }, { "epoch": 1.6613213598460552, "grad_norm": 1.3981868028640747, "learning_rate": 2.2359117205913866e-05, "loss": 0.2741, "step": 2590 }, { "epoch": 1.6619627966645285, "grad_norm": 1.3694418668746948, "learning_rate": 2.234840368545104e-05, "loss": 0.3275, "step": 2591 }, { "epoch": 1.6626042334830018, "grad_norm": 1.2420824766159058, "learning_rate": 2.233769016498822e-05, "loss": 0.2755, "step": 2592 }, { "epoch": 1.6632456703014753, "grad_norm": 0.8297078609466553, "learning_rate": 2.232697664452539e-05, "loss": 0.1947, "step": 2593 }, { "epoch": 1.6638871071199488, "grad_norm": 1.2183603048324585, "learning_rate": 2.231626312406257e-05, "loss": 0.2619, "step": 2594 }, { "epoch": 1.664528543938422, "grad_norm": 1.7389893531799316, "learning_rate": 2.2305549603599743e-05, "loss": 0.4061, "step": 2595 }, { "epoch": 1.6651699807568954, "grad_norm": 1.4621541500091553, "learning_rate": 2.2294836083136918e-05, "loss": 0.2399, "step": 2596 }, { "epoch": 1.6658114175753689, "grad_norm": 0.9070848226547241, "learning_rate": 2.2284122562674096e-05, "loss": 0.223, "step": 2597 }, { "epoch": 1.6664528543938422, "grad_norm": 1.275272250175476, "learning_rate": 2.227340904221127e-05, "loss": 0.2539, "step": 2598 }, { "epoch": 1.6670942912123157, "grad_norm": 1.4920843839645386, "learning_rate": 2.226269552174845e-05, "loss": 0.2743, "step": 2599 }, { "epoch": 1.667735728030789, "grad_norm": 1.186514139175415, "learning_rate": 2.2251982001285624e-05, "loss": 0.2669, "step": 2600 }, { "epoch": 1.6683771648492622, "grad_norm": 1.6155591011047363, "learning_rate": 2.22412684808228e-05, "loss": 0.3593, "step": 2601 }, { "epoch": 1.6690186016677357, "grad_norm": 1.4562735557556152, "learning_rate": 2.2230554960359977e-05, "loss": 0.3209, "step": 2602 }, { "epoch": 1.6696600384862093, "grad_norm": 1.2269911766052246, "learning_rate": 2.221984143989715e-05, "loss": 0.2622, "step": 2603 }, { "epoch": 1.6703014753046825, "grad_norm": 1.8096710443496704, "learning_rate": 2.2209127919434326e-05, "loss": 0.4046, "step": 2604 }, { "epoch": 1.6709429121231558, "grad_norm": 1.413831114768982, "learning_rate": 2.2198414398971504e-05, "loss": 0.3249, "step": 2605 }, { "epoch": 1.671584348941629, "grad_norm": 1.1951617002487183, "learning_rate": 2.218770087850868e-05, "loss": 0.2645, "step": 2606 }, { "epoch": 1.6722257857601026, "grad_norm": 1.155356526374817, "learning_rate": 2.2176987358045857e-05, "loss": 0.2596, "step": 2607 }, { "epoch": 1.6728672225785761, "grad_norm": 1.3950101137161255, "learning_rate": 2.2166273837583032e-05, "loss": 0.2928, "step": 2608 }, { "epoch": 1.6735086593970494, "grad_norm": 1.0557172298431396, "learning_rate": 2.2155560317120207e-05, "loss": 0.2484, "step": 2609 }, { "epoch": 1.6741500962155227, "grad_norm": 1.1926902532577515, "learning_rate": 2.214484679665738e-05, "loss": 0.2408, "step": 2610 }, { "epoch": 1.6747915330339962, "grad_norm": 1.333120346069336, "learning_rate": 2.2134133276194556e-05, "loss": 0.2511, "step": 2611 }, { "epoch": 1.6754329698524695, "grad_norm": 1.1379388570785522, "learning_rate": 2.2123419755731734e-05, "loss": 0.287, "step": 2612 }, { "epoch": 1.676074406670943, "grad_norm": 1.000163197517395, "learning_rate": 2.211270623526891e-05, "loss": 0.2164, "step": 2613 }, { "epoch": 1.6767158434894163, "grad_norm": 1.210903525352478, "learning_rate": 2.2101992714806087e-05, "loss": 0.2799, "step": 2614 }, { "epoch": 1.6773572803078896, "grad_norm": 1.3158146142959595, "learning_rate": 2.2091279194343262e-05, "loss": 0.3156, "step": 2615 }, { "epoch": 1.677998717126363, "grad_norm": 1.1509959697723389, "learning_rate": 2.2080565673880437e-05, "loss": 0.2523, "step": 2616 }, { "epoch": 1.6786401539448366, "grad_norm": 1.369177222251892, "learning_rate": 2.2069852153417615e-05, "loss": 0.3021, "step": 2617 }, { "epoch": 1.6792815907633099, "grad_norm": 1.250043272972107, "learning_rate": 2.205913863295479e-05, "loss": 0.2717, "step": 2618 }, { "epoch": 1.6799230275817831, "grad_norm": 1.3203184604644775, "learning_rate": 2.2048425112491964e-05, "loss": 0.2652, "step": 2619 }, { "epoch": 1.6805644644002564, "grad_norm": 1.6785963773727417, "learning_rate": 2.2037711592029143e-05, "loss": 0.403, "step": 2620 }, { "epoch": 1.68120590121873, "grad_norm": 1.1493375301361084, "learning_rate": 2.2026998071566317e-05, "loss": 0.2328, "step": 2621 }, { "epoch": 1.6818473380372034, "grad_norm": 1.073328971862793, "learning_rate": 2.2016284551103496e-05, "loss": 0.226, "step": 2622 }, { "epoch": 1.6824887748556767, "grad_norm": 1.3211771249771118, "learning_rate": 2.200557103064067e-05, "loss": 0.2735, "step": 2623 }, { "epoch": 1.68313021167415, "grad_norm": 1.2854560613632202, "learning_rate": 2.1994857510177845e-05, "loss": 0.306, "step": 2624 }, { "epoch": 1.6837716484926235, "grad_norm": 1.336601734161377, "learning_rate": 2.1984143989715023e-05, "loss": 0.2571, "step": 2625 }, { "epoch": 1.684413085311097, "grad_norm": 1.1005326509475708, "learning_rate": 2.1973430469252198e-05, "loss": 0.2507, "step": 2626 }, { "epoch": 1.6850545221295703, "grad_norm": 1.0871680974960327, "learning_rate": 2.1962716948789373e-05, "loss": 0.2326, "step": 2627 }, { "epoch": 1.6856959589480436, "grad_norm": 1.3405693769454956, "learning_rate": 2.1952003428326548e-05, "loss": 0.2498, "step": 2628 }, { "epoch": 1.6863373957665169, "grad_norm": 1.396431565284729, "learning_rate": 2.1941289907863726e-05, "loss": 0.2965, "step": 2629 }, { "epoch": 1.6869788325849904, "grad_norm": 1.2182618379592896, "learning_rate": 2.19305763874009e-05, "loss": 0.2657, "step": 2630 }, { "epoch": 1.6876202694034639, "grad_norm": 1.1579170227050781, "learning_rate": 2.1919862866938075e-05, "loss": 0.2218, "step": 2631 }, { "epoch": 1.6882617062219372, "grad_norm": 1.260891079902649, "learning_rate": 2.1909149346475253e-05, "loss": 0.2452, "step": 2632 }, { "epoch": 1.6889031430404104, "grad_norm": 1.165810465812683, "learning_rate": 2.1898435826012428e-05, "loss": 0.2563, "step": 2633 }, { "epoch": 1.689544579858884, "grad_norm": 1.1848852634429932, "learning_rate": 2.1887722305549603e-05, "loss": 0.2264, "step": 2634 }, { "epoch": 1.6901860166773572, "grad_norm": 1.7113007307052612, "learning_rate": 2.187700878508678e-05, "loss": 0.3219, "step": 2635 }, { "epoch": 1.6908274534958307, "grad_norm": 1.4598886966705322, "learning_rate": 2.1866295264623956e-05, "loss": 0.3103, "step": 2636 }, { "epoch": 1.691468890314304, "grad_norm": 1.0090537071228027, "learning_rate": 2.1855581744161134e-05, "loss": 0.2334, "step": 2637 }, { "epoch": 1.6921103271327773, "grad_norm": 1.1713320016860962, "learning_rate": 2.184486822369831e-05, "loss": 0.23, "step": 2638 }, { "epoch": 1.6927517639512508, "grad_norm": 1.4038723707199097, "learning_rate": 2.1834154703235483e-05, "loss": 0.2966, "step": 2639 }, { "epoch": 1.6933932007697243, "grad_norm": 1.0576155185699463, "learning_rate": 2.182344118277266e-05, "loss": 0.2153, "step": 2640 }, { "epoch": 1.6940346375881976, "grad_norm": 1.2520153522491455, "learning_rate": 2.1812727662309836e-05, "loss": 0.2476, "step": 2641 }, { "epoch": 1.694676074406671, "grad_norm": 1.1846497058868408, "learning_rate": 2.1802014141847014e-05, "loss": 0.2363, "step": 2642 }, { "epoch": 1.6953175112251442, "grad_norm": 1.168346881866455, "learning_rate": 2.179130062138419e-05, "loss": 0.2756, "step": 2643 }, { "epoch": 1.6959589480436177, "grad_norm": 1.0846805572509766, "learning_rate": 2.1780587100921364e-05, "loss": 0.2371, "step": 2644 }, { "epoch": 1.6966003848620912, "grad_norm": 1.338950276374817, "learning_rate": 2.176987358045854e-05, "loss": 0.279, "step": 2645 }, { "epoch": 1.6972418216805645, "grad_norm": 1.3954709768295288, "learning_rate": 2.1759160059995714e-05, "loss": 0.2822, "step": 2646 }, { "epoch": 1.6978832584990378, "grad_norm": 1.1996917724609375, "learning_rate": 2.174844653953289e-05, "loss": 0.2428, "step": 2647 }, { "epoch": 1.6985246953175113, "grad_norm": 1.3838646411895752, "learning_rate": 2.1737733019070066e-05, "loss": 0.2684, "step": 2648 }, { "epoch": 1.6991661321359846, "grad_norm": 1.2029083967208862, "learning_rate": 2.172701949860724e-05, "loss": 0.2583, "step": 2649 }, { "epoch": 1.699807568954458, "grad_norm": 1.4111701250076294, "learning_rate": 2.171630597814442e-05, "loss": 0.3237, "step": 2650 }, { "epoch": 1.7004490057729313, "grad_norm": 1.1536976099014282, "learning_rate": 2.1705592457681594e-05, "loss": 0.2483, "step": 2651 }, { "epoch": 1.7010904425914046, "grad_norm": 1.3982810974121094, "learning_rate": 2.1694878937218772e-05, "loss": 0.2668, "step": 2652 }, { "epoch": 1.7017318794098781, "grad_norm": 1.1855055093765259, "learning_rate": 2.1684165416755947e-05, "loss": 0.2586, "step": 2653 }, { "epoch": 1.7023733162283516, "grad_norm": 1.5172826051712036, "learning_rate": 2.1673451896293122e-05, "loss": 0.3027, "step": 2654 }, { "epoch": 1.703014753046825, "grad_norm": 1.3282562494277954, "learning_rate": 2.16627383758303e-05, "loss": 0.2752, "step": 2655 }, { "epoch": 1.7036561898652982, "grad_norm": 1.4892120361328125, "learning_rate": 2.1652024855367475e-05, "loss": 0.3246, "step": 2656 }, { "epoch": 1.7042976266837715, "grad_norm": 1.2611465454101562, "learning_rate": 2.1641311334904653e-05, "loss": 0.2785, "step": 2657 }, { "epoch": 1.704939063502245, "grad_norm": 1.3381062746047974, "learning_rate": 2.1630597814441828e-05, "loss": 0.2892, "step": 2658 }, { "epoch": 1.7055805003207185, "grad_norm": 1.306594729423523, "learning_rate": 2.1619884293979002e-05, "loss": 0.272, "step": 2659 }, { "epoch": 1.7062219371391918, "grad_norm": 1.0957601070404053, "learning_rate": 2.160917077351618e-05, "loss": 0.2455, "step": 2660 }, { "epoch": 1.706863373957665, "grad_norm": 1.207631230354309, "learning_rate": 2.1598457253053352e-05, "loss": 0.2446, "step": 2661 }, { "epoch": 1.7075048107761386, "grad_norm": 1.3723704814910889, "learning_rate": 2.158774373259053e-05, "loss": 0.3011, "step": 2662 }, { "epoch": 1.708146247594612, "grad_norm": 1.0982426404953003, "learning_rate": 2.1577030212127705e-05, "loss": 0.2611, "step": 2663 }, { "epoch": 1.7087876844130854, "grad_norm": 1.153885006904602, "learning_rate": 2.156631669166488e-05, "loss": 0.2401, "step": 2664 }, { "epoch": 1.7094291212315587, "grad_norm": 1.050711989402771, "learning_rate": 2.1555603171202058e-05, "loss": 0.228, "step": 2665 }, { "epoch": 1.710070558050032, "grad_norm": 1.1260461807250977, "learning_rate": 2.1544889650739232e-05, "loss": 0.24, "step": 2666 }, { "epoch": 1.7107119948685054, "grad_norm": 1.2347536087036133, "learning_rate": 2.153417613027641e-05, "loss": 0.2697, "step": 2667 }, { "epoch": 1.711353431686979, "grad_norm": 1.3119966983795166, "learning_rate": 2.1523462609813585e-05, "loss": 0.274, "step": 2668 }, { "epoch": 1.7119948685054522, "grad_norm": 1.112560510635376, "learning_rate": 2.151274908935076e-05, "loss": 0.2403, "step": 2669 }, { "epoch": 1.7126363053239255, "grad_norm": 1.1508169174194336, "learning_rate": 2.1502035568887938e-05, "loss": 0.2376, "step": 2670 }, { "epoch": 1.7132777421423988, "grad_norm": 1.070061206817627, "learning_rate": 2.1491322048425113e-05, "loss": 0.2276, "step": 2671 }, { "epoch": 1.7139191789608723, "grad_norm": 1.476781964302063, "learning_rate": 2.148060852796229e-05, "loss": 0.2792, "step": 2672 }, { "epoch": 1.7145606157793458, "grad_norm": 1.3917826414108276, "learning_rate": 2.1469895007499466e-05, "loss": 0.2679, "step": 2673 }, { "epoch": 1.715202052597819, "grad_norm": 1.5577335357666016, "learning_rate": 2.145918148703664e-05, "loss": 0.3379, "step": 2674 }, { "epoch": 1.7158434894162924, "grad_norm": 1.100588321685791, "learning_rate": 2.144846796657382e-05, "loss": 0.229, "step": 2675 }, { "epoch": 1.716484926234766, "grad_norm": 1.6730234622955322, "learning_rate": 2.1437754446110994e-05, "loss": 0.3223, "step": 2676 }, { "epoch": 1.7171263630532394, "grad_norm": 1.3936184644699097, "learning_rate": 2.1427040925648172e-05, "loss": 0.2619, "step": 2677 }, { "epoch": 1.7177677998717127, "grad_norm": 1.3107537031173706, "learning_rate": 2.1416327405185347e-05, "loss": 0.301, "step": 2678 }, { "epoch": 1.718409236690186, "grad_norm": 1.0450202226638794, "learning_rate": 2.140561388472252e-05, "loss": 0.2373, "step": 2679 }, { "epoch": 1.7190506735086593, "grad_norm": 1.160505771636963, "learning_rate": 2.1394900364259696e-05, "loss": 0.2578, "step": 2680 }, { "epoch": 1.7196921103271328, "grad_norm": 0.9901423454284668, "learning_rate": 2.138418684379687e-05, "loss": 0.2169, "step": 2681 }, { "epoch": 1.7203335471456063, "grad_norm": 1.4335567951202393, "learning_rate": 2.137347332333405e-05, "loss": 0.2919, "step": 2682 }, { "epoch": 1.7209749839640796, "grad_norm": 1.5262788534164429, "learning_rate": 2.1362759802871224e-05, "loss": 0.3452, "step": 2683 }, { "epoch": 1.7216164207825528, "grad_norm": 1.1407880783081055, "learning_rate": 2.13520462824084e-05, "loss": 0.2512, "step": 2684 }, { "epoch": 1.7222578576010263, "grad_norm": 1.4225070476531982, "learning_rate": 2.1341332761945577e-05, "loss": 0.3123, "step": 2685 }, { "epoch": 1.7228992944194996, "grad_norm": 1.308515191078186, "learning_rate": 2.133061924148275e-05, "loss": 0.2766, "step": 2686 }, { "epoch": 1.7235407312379731, "grad_norm": 1.213986873626709, "learning_rate": 2.131990572101993e-05, "loss": 0.2635, "step": 2687 }, { "epoch": 1.7241821680564464, "grad_norm": 0.9968778491020203, "learning_rate": 2.1309192200557104e-05, "loss": 0.2275, "step": 2688 }, { "epoch": 1.7248236048749197, "grad_norm": 1.5000431537628174, "learning_rate": 2.129847868009428e-05, "loss": 0.3627, "step": 2689 }, { "epoch": 1.7254650416933932, "grad_norm": 1.1023242473602295, "learning_rate": 2.1287765159631457e-05, "loss": 0.2499, "step": 2690 }, { "epoch": 1.7261064785118667, "grad_norm": 1.1008588075637817, "learning_rate": 2.1277051639168632e-05, "loss": 0.2318, "step": 2691 }, { "epoch": 1.72674791533034, "grad_norm": 1.247532844543457, "learning_rate": 2.126633811870581e-05, "loss": 0.2514, "step": 2692 }, { "epoch": 1.7273893521488133, "grad_norm": 1.3334197998046875, "learning_rate": 2.1255624598242985e-05, "loss": 0.2797, "step": 2693 }, { "epoch": 1.7280307889672866, "grad_norm": 1.2312544584274292, "learning_rate": 2.124491107778016e-05, "loss": 0.2835, "step": 2694 }, { "epoch": 1.72867222578576, "grad_norm": 1.5088086128234863, "learning_rate": 2.1234197557317338e-05, "loss": 0.3146, "step": 2695 }, { "epoch": 1.7293136626042336, "grad_norm": 1.2806707620620728, "learning_rate": 2.122348403685451e-05, "loss": 0.2679, "step": 2696 }, { "epoch": 1.7299550994227069, "grad_norm": 1.2671971321105957, "learning_rate": 2.1212770516391687e-05, "loss": 0.2745, "step": 2697 }, { "epoch": 1.7305965362411802, "grad_norm": 1.0791115760803223, "learning_rate": 2.1202056995928862e-05, "loss": 0.2369, "step": 2698 }, { "epoch": 1.7312379730596537, "grad_norm": 1.0598589181900024, "learning_rate": 2.1191343475466037e-05, "loss": 0.2451, "step": 2699 }, { "epoch": 1.7318794098781272, "grad_norm": 1.5523974895477295, "learning_rate": 2.1180629955003215e-05, "loss": 0.3539, "step": 2700 }, { "epoch": 1.7325208466966004, "grad_norm": 1.3896251916885376, "learning_rate": 2.116991643454039e-05, "loss": 0.3006, "step": 2701 }, { "epoch": 1.7331622835150737, "grad_norm": 1.2640163898468018, "learning_rate": 2.1159202914077568e-05, "loss": 0.2398, "step": 2702 }, { "epoch": 1.733803720333547, "grad_norm": 1.1062692403793335, "learning_rate": 2.1148489393614743e-05, "loss": 0.2425, "step": 2703 }, { "epoch": 1.7344451571520205, "grad_norm": 1.2044503688812256, "learning_rate": 2.1137775873151917e-05, "loss": 0.2507, "step": 2704 }, { "epoch": 1.735086593970494, "grad_norm": 1.3720227479934692, "learning_rate": 2.1127062352689096e-05, "loss": 0.2801, "step": 2705 }, { "epoch": 1.7357280307889673, "grad_norm": 1.2129145860671997, "learning_rate": 2.111634883222627e-05, "loss": 0.2513, "step": 2706 }, { "epoch": 1.7363694676074406, "grad_norm": 1.5071665048599243, "learning_rate": 2.110563531176345e-05, "loss": 0.3047, "step": 2707 }, { "epoch": 1.7370109044259139, "grad_norm": 1.6378893852233887, "learning_rate": 2.1094921791300623e-05, "loss": 0.3946, "step": 2708 }, { "epoch": 1.7376523412443874, "grad_norm": 1.2279380559921265, "learning_rate": 2.1084208270837798e-05, "loss": 0.27, "step": 2709 }, { "epoch": 1.738293778062861, "grad_norm": 1.3955539464950562, "learning_rate": 2.1073494750374976e-05, "loss": 0.3007, "step": 2710 }, { "epoch": 1.7389352148813342, "grad_norm": 1.127461552619934, "learning_rate": 2.106278122991215e-05, "loss": 0.2498, "step": 2711 }, { "epoch": 1.7395766516998075, "grad_norm": 1.01076340675354, "learning_rate": 2.105206770944933e-05, "loss": 0.2125, "step": 2712 }, { "epoch": 1.740218088518281, "grad_norm": 1.1997649669647217, "learning_rate": 2.10413541889865e-05, "loss": 0.2417, "step": 2713 }, { "epoch": 1.7408595253367545, "grad_norm": 1.1727302074432373, "learning_rate": 2.1030640668523675e-05, "loss": 0.2348, "step": 2714 }, { "epoch": 1.7415009621552278, "grad_norm": 1.8896719217300415, "learning_rate": 2.1019927148060853e-05, "loss": 0.3717, "step": 2715 }, { "epoch": 1.742142398973701, "grad_norm": 1.4710800647735596, "learning_rate": 2.1009213627598028e-05, "loss": 0.3087, "step": 2716 }, { "epoch": 1.7427838357921743, "grad_norm": 1.3986849784851074, "learning_rate": 2.0998500107135206e-05, "loss": 0.2491, "step": 2717 }, { "epoch": 1.7434252726106478, "grad_norm": 1.2677555084228516, "learning_rate": 2.098778658667238e-05, "loss": 0.2524, "step": 2718 }, { "epoch": 1.7440667094291213, "grad_norm": 1.1690477132797241, "learning_rate": 2.0977073066209556e-05, "loss": 0.2503, "step": 2719 }, { "epoch": 1.7447081462475946, "grad_norm": 1.2197935581207275, "learning_rate": 2.0966359545746734e-05, "loss": 0.2598, "step": 2720 }, { "epoch": 1.745349583066068, "grad_norm": 1.2560619115829468, "learning_rate": 2.095564602528391e-05, "loss": 0.2526, "step": 2721 }, { "epoch": 1.7459910198845414, "grad_norm": 1.2235976457595825, "learning_rate": 2.0944932504821087e-05, "loss": 0.2228, "step": 2722 }, { "epoch": 1.7466324567030147, "grad_norm": 1.3775168657302856, "learning_rate": 2.093421898435826e-05, "loss": 0.2731, "step": 2723 }, { "epoch": 1.7472738935214882, "grad_norm": 1.2389029264450073, "learning_rate": 2.0923505463895436e-05, "loss": 0.2318, "step": 2724 }, { "epoch": 1.7479153303399615, "grad_norm": 2.0067336559295654, "learning_rate": 2.0912791943432614e-05, "loss": 0.4012, "step": 2725 }, { "epoch": 1.7485567671584348, "grad_norm": 1.1510703563690186, "learning_rate": 2.090207842296979e-05, "loss": 0.2456, "step": 2726 }, { "epoch": 1.7491982039769083, "grad_norm": 1.0420030355453491, "learning_rate": 2.0891364902506967e-05, "loss": 0.2081, "step": 2727 }, { "epoch": 1.7498396407953818, "grad_norm": 1.2421971559524536, "learning_rate": 2.0880651382044142e-05, "loss": 0.2641, "step": 2728 }, { "epoch": 1.750481077613855, "grad_norm": 1.4386134147644043, "learning_rate": 2.0869937861581317e-05, "loss": 0.2937, "step": 2729 }, { "epoch": 1.7511225144323284, "grad_norm": 1.3484541177749634, "learning_rate": 2.085922434111849e-05, "loss": 0.2665, "step": 2730 }, { "epoch": 1.7517639512508016, "grad_norm": 1.1611508131027222, "learning_rate": 2.0848510820655666e-05, "loss": 0.2334, "step": 2731 }, { "epoch": 1.7524053880692751, "grad_norm": 1.4637690782546997, "learning_rate": 2.0837797300192845e-05, "loss": 0.3169, "step": 2732 }, { "epoch": 1.7530468248877487, "grad_norm": 1.2211723327636719, "learning_rate": 2.082708377973002e-05, "loss": 0.2463, "step": 2733 }, { "epoch": 1.753688261706222, "grad_norm": 1.0200849771499634, "learning_rate": 2.0816370259267194e-05, "loss": 0.2154, "step": 2734 }, { "epoch": 1.7543296985246952, "grad_norm": 1.4305129051208496, "learning_rate": 2.0805656738804372e-05, "loss": 0.2729, "step": 2735 }, { "epoch": 1.7549711353431687, "grad_norm": 1.3326185941696167, "learning_rate": 2.0794943218341547e-05, "loss": 0.2584, "step": 2736 }, { "epoch": 1.7556125721616422, "grad_norm": 1.28689444065094, "learning_rate": 2.0784229697878725e-05, "loss": 0.258, "step": 2737 }, { "epoch": 1.7562540089801155, "grad_norm": 1.4711740016937256, "learning_rate": 2.07735161774159e-05, "loss": 0.3038, "step": 2738 }, { "epoch": 1.7568954457985888, "grad_norm": 1.2607252597808838, "learning_rate": 2.0762802656953075e-05, "loss": 0.3058, "step": 2739 }, { "epoch": 1.757536882617062, "grad_norm": 1.4360955953598022, "learning_rate": 2.0752089136490253e-05, "loss": 0.2616, "step": 2740 }, { "epoch": 1.7581783194355356, "grad_norm": 1.2605819702148438, "learning_rate": 2.0741375616027428e-05, "loss": 0.2594, "step": 2741 }, { "epoch": 1.758819756254009, "grad_norm": 1.326772928237915, "learning_rate": 2.0730662095564606e-05, "loss": 0.3335, "step": 2742 }, { "epoch": 1.7594611930724824, "grad_norm": 1.387067437171936, "learning_rate": 2.071994857510178e-05, "loss": 0.2651, "step": 2743 }, { "epoch": 1.7601026298909557, "grad_norm": 1.206567406654358, "learning_rate": 2.0709235054638955e-05, "loss": 0.2459, "step": 2744 }, { "epoch": 1.760744066709429, "grad_norm": 1.3183711767196655, "learning_rate": 2.0698521534176133e-05, "loss": 0.2749, "step": 2745 }, { "epoch": 1.7613855035279025, "grad_norm": 1.244006872177124, "learning_rate": 2.0687808013713308e-05, "loss": 0.2806, "step": 2746 }, { "epoch": 1.762026940346376, "grad_norm": 1.0326732397079468, "learning_rate": 2.0677094493250483e-05, "loss": 0.2338, "step": 2747 }, { "epoch": 1.7626683771648493, "grad_norm": 1.207788348197937, "learning_rate": 2.0666380972787658e-05, "loss": 0.2317, "step": 2748 }, { "epoch": 1.7633098139833225, "grad_norm": 1.4955546855926514, "learning_rate": 2.0655667452324832e-05, "loss": 0.317, "step": 2749 }, { "epoch": 1.763951250801796, "grad_norm": 1.1819249391555786, "learning_rate": 2.064495393186201e-05, "loss": 0.2599, "step": 2750 }, { "epoch": 1.7645926876202696, "grad_norm": 1.5045245885849, "learning_rate": 2.0634240411399185e-05, "loss": 0.2543, "step": 2751 }, { "epoch": 1.7652341244387428, "grad_norm": 1.5355250835418701, "learning_rate": 2.0623526890936363e-05, "loss": 0.2417, "step": 2752 }, { "epoch": 1.7658755612572161, "grad_norm": 1.1712907552719116, "learning_rate": 2.0612813370473538e-05, "loss": 0.2306, "step": 2753 }, { "epoch": 1.7665169980756894, "grad_norm": 1.606549859046936, "learning_rate": 2.0602099850010713e-05, "loss": 0.339, "step": 2754 }, { "epoch": 1.767158434894163, "grad_norm": 0.9466663599014282, "learning_rate": 2.059138632954789e-05, "loss": 0.2182, "step": 2755 }, { "epoch": 1.7677998717126364, "grad_norm": 1.2670832872390747, "learning_rate": 2.0580672809085066e-05, "loss": 0.2334, "step": 2756 }, { "epoch": 1.7684413085311097, "grad_norm": 1.4511452913284302, "learning_rate": 2.0569959288622244e-05, "loss": 0.2633, "step": 2757 }, { "epoch": 1.769082745349583, "grad_norm": 1.0251394510269165, "learning_rate": 2.055924576815942e-05, "loss": 0.2273, "step": 2758 }, { "epoch": 1.7697241821680565, "grad_norm": 1.3764419555664062, "learning_rate": 2.0548532247696594e-05, "loss": 0.2858, "step": 2759 }, { "epoch": 1.7703656189865298, "grad_norm": 1.3452818393707275, "learning_rate": 2.0537818727233772e-05, "loss": 0.2899, "step": 2760 }, { "epoch": 1.7710070558050033, "grad_norm": 1.2491614818572998, "learning_rate": 2.0527105206770947e-05, "loss": 0.2318, "step": 2761 }, { "epoch": 1.7716484926234766, "grad_norm": 1.22304105758667, "learning_rate": 2.051639168630812e-05, "loss": 0.2511, "step": 2762 }, { "epoch": 1.7722899294419499, "grad_norm": 1.4042552709579468, "learning_rate": 2.05056781658453e-05, "loss": 0.2582, "step": 2763 }, { "epoch": 1.7729313662604234, "grad_norm": 1.24922776222229, "learning_rate": 2.049496464538247e-05, "loss": 0.241, "step": 2764 }, { "epoch": 1.7735728030788969, "grad_norm": 1.178894281387329, "learning_rate": 2.048425112491965e-05, "loss": 0.2438, "step": 2765 }, { "epoch": 1.7742142398973701, "grad_norm": 1.2425727844238281, "learning_rate": 2.0473537604456824e-05, "loss": 0.2661, "step": 2766 }, { "epoch": 1.7748556767158434, "grad_norm": 1.243976354598999, "learning_rate": 2.0462824083994002e-05, "loss": 0.256, "step": 2767 }, { "epoch": 1.7754971135343167, "grad_norm": 1.606789469718933, "learning_rate": 2.0452110563531177e-05, "loss": 0.2949, "step": 2768 }, { "epoch": 1.7761385503527902, "grad_norm": 1.0709271430969238, "learning_rate": 2.044139704306835e-05, "loss": 0.2382, "step": 2769 }, { "epoch": 1.7767799871712637, "grad_norm": 1.4935693740844727, "learning_rate": 2.043068352260553e-05, "loss": 0.2998, "step": 2770 }, { "epoch": 1.777421423989737, "grad_norm": 1.6563351154327393, "learning_rate": 2.0419970002142704e-05, "loss": 0.3788, "step": 2771 }, { "epoch": 1.7780628608082103, "grad_norm": 1.3790647983551025, "learning_rate": 2.0409256481679882e-05, "loss": 0.2901, "step": 2772 }, { "epoch": 1.7787042976266838, "grad_norm": 1.1183958053588867, "learning_rate": 2.0398542961217057e-05, "loss": 0.2444, "step": 2773 }, { "epoch": 1.7793457344451573, "grad_norm": 0.9315316677093506, "learning_rate": 2.0387829440754232e-05, "loss": 0.2056, "step": 2774 }, { "epoch": 1.7799871712636306, "grad_norm": 1.0240895748138428, "learning_rate": 2.037711592029141e-05, "loss": 0.2407, "step": 2775 }, { "epoch": 1.7806286080821039, "grad_norm": 1.277368426322937, "learning_rate": 2.0366402399828585e-05, "loss": 0.292, "step": 2776 }, { "epoch": 1.7812700449005772, "grad_norm": 1.2551982402801514, "learning_rate": 2.035568887936576e-05, "loss": 0.3048, "step": 2777 }, { "epoch": 1.7819114817190507, "grad_norm": 1.0179857015609741, "learning_rate": 2.0344975358902938e-05, "loss": 0.2453, "step": 2778 }, { "epoch": 1.7825529185375242, "grad_norm": 1.5433696508407593, "learning_rate": 2.0334261838440113e-05, "loss": 0.3181, "step": 2779 }, { "epoch": 1.7831943553559975, "grad_norm": 1.2114312648773193, "learning_rate": 2.032354831797729e-05, "loss": 0.2512, "step": 2780 }, { "epoch": 1.7838357921744707, "grad_norm": 1.234658122062683, "learning_rate": 2.0312834797514465e-05, "loss": 0.2468, "step": 2781 }, { "epoch": 1.784477228992944, "grad_norm": 1.2843525409698486, "learning_rate": 2.030212127705164e-05, "loss": 0.3009, "step": 2782 }, { "epoch": 1.7851186658114175, "grad_norm": 1.096819519996643, "learning_rate": 2.0291407756588815e-05, "loss": 0.2501, "step": 2783 }, { "epoch": 1.785760102629891, "grad_norm": 1.4081729650497437, "learning_rate": 2.028069423612599e-05, "loss": 0.2769, "step": 2784 }, { "epoch": 1.7864015394483643, "grad_norm": 1.147238850593567, "learning_rate": 2.0269980715663168e-05, "loss": 0.2467, "step": 2785 }, { "epoch": 1.7870429762668376, "grad_norm": 1.2554503679275513, "learning_rate": 2.0259267195200343e-05, "loss": 0.2341, "step": 2786 }, { "epoch": 1.7876844130853111, "grad_norm": 0.9783976078033447, "learning_rate": 2.024855367473752e-05, "loss": 0.2235, "step": 2787 }, { "epoch": 1.7883258499037846, "grad_norm": 1.1732383966445923, "learning_rate": 2.0237840154274696e-05, "loss": 0.2296, "step": 2788 }, { "epoch": 1.788967286722258, "grad_norm": 1.2995705604553223, "learning_rate": 2.022712663381187e-05, "loss": 0.2571, "step": 2789 }, { "epoch": 1.7896087235407312, "grad_norm": 1.3621442317962646, "learning_rate": 2.021641311334905e-05, "loss": 0.3016, "step": 2790 }, { "epoch": 1.7902501603592045, "grad_norm": 1.3003153800964355, "learning_rate": 2.0205699592886223e-05, "loss": 0.2519, "step": 2791 }, { "epoch": 1.790891597177678, "grad_norm": 1.317916989326477, "learning_rate": 2.01949860724234e-05, "loss": 0.2855, "step": 2792 }, { "epoch": 1.7915330339961515, "grad_norm": 1.434015154838562, "learning_rate": 2.0184272551960576e-05, "loss": 0.2832, "step": 2793 }, { "epoch": 1.7921744708146248, "grad_norm": 1.034784197807312, "learning_rate": 2.017355903149775e-05, "loss": 0.2199, "step": 2794 }, { "epoch": 1.792815907633098, "grad_norm": 1.7033127546310425, "learning_rate": 2.016284551103493e-05, "loss": 0.3239, "step": 2795 }, { "epoch": 1.7934573444515716, "grad_norm": 1.524983286857605, "learning_rate": 2.0152131990572104e-05, "loss": 0.3287, "step": 2796 }, { "epoch": 1.7940987812700449, "grad_norm": 1.392500638961792, "learning_rate": 2.014141847010928e-05, "loss": 0.2704, "step": 2797 }, { "epoch": 1.7947402180885184, "grad_norm": 1.3919243812561035, "learning_rate": 2.0130704949646457e-05, "loss": 0.2798, "step": 2798 }, { "epoch": 1.7953816549069916, "grad_norm": 1.4920817613601685, "learning_rate": 2.0119991429183628e-05, "loss": 0.3023, "step": 2799 }, { "epoch": 1.796023091725465, "grad_norm": 1.1224652528762817, "learning_rate": 2.0109277908720806e-05, "loss": 0.2496, "step": 2800 }, { "epoch": 1.7966645285439384, "grad_norm": 1.3960014581680298, "learning_rate": 2.009856438825798e-05, "loss": 0.2849, "step": 2801 }, { "epoch": 1.797305965362412, "grad_norm": 1.349721074104309, "learning_rate": 2.008785086779516e-05, "loss": 0.3198, "step": 2802 }, { "epoch": 1.7979474021808852, "grad_norm": 1.2132986783981323, "learning_rate": 2.0077137347332334e-05, "loss": 0.249, "step": 2803 }, { "epoch": 1.7985888389993585, "grad_norm": 1.5639011859893799, "learning_rate": 2.006642382686951e-05, "loss": 0.307, "step": 2804 }, { "epoch": 1.7992302758178318, "grad_norm": 1.6647379398345947, "learning_rate": 2.0055710306406687e-05, "loss": 0.4069, "step": 2805 }, { "epoch": 1.7998717126363053, "grad_norm": 1.3528355360031128, "learning_rate": 2.004499678594386e-05, "loss": 0.3264, "step": 2806 }, { "epoch": 1.8005131494547788, "grad_norm": 1.1267056465148926, "learning_rate": 2.003428326548104e-05, "loss": 0.2277, "step": 2807 }, { "epoch": 1.801154586273252, "grad_norm": 1.1838964223861694, "learning_rate": 2.0023569745018214e-05, "loss": 0.2496, "step": 2808 }, { "epoch": 1.8017960230917254, "grad_norm": 1.3582359552383423, "learning_rate": 2.001285622455539e-05, "loss": 0.2632, "step": 2809 }, { "epoch": 1.8024374599101989, "grad_norm": 1.4015820026397705, "learning_rate": 2.0002142704092567e-05, "loss": 0.2823, "step": 2810 }, { "epoch": 1.8030788967286724, "grad_norm": 1.1399987936019897, "learning_rate": 1.9991429183629742e-05, "loss": 0.2507, "step": 2811 }, { "epoch": 1.8037203335471457, "grad_norm": 1.0502655506134033, "learning_rate": 1.9980715663166917e-05, "loss": 0.2175, "step": 2812 }, { "epoch": 1.804361770365619, "grad_norm": 1.3273249864578247, "learning_rate": 1.9970002142704095e-05, "loss": 0.2775, "step": 2813 }, { "epoch": 1.8050032071840922, "grad_norm": 1.3836205005645752, "learning_rate": 1.995928862224127e-05, "loss": 0.3065, "step": 2814 }, { "epoch": 1.8056446440025657, "grad_norm": 1.2486295700073242, "learning_rate": 1.9948575101778448e-05, "loss": 0.2572, "step": 2815 }, { "epoch": 1.8062860808210393, "grad_norm": 0.9108333587646484, "learning_rate": 1.993786158131562e-05, "loss": 0.2212, "step": 2816 }, { "epoch": 1.8069275176395125, "grad_norm": 1.5446604490280151, "learning_rate": 1.9927148060852797e-05, "loss": 0.3536, "step": 2817 }, { "epoch": 1.8075689544579858, "grad_norm": 1.1347814798355103, "learning_rate": 1.9916434540389972e-05, "loss": 0.2521, "step": 2818 }, { "epoch": 1.808210391276459, "grad_norm": 1.2188585996627808, "learning_rate": 1.9905721019927147e-05, "loss": 0.2567, "step": 2819 }, { "epoch": 1.8088518280949326, "grad_norm": 1.2597768306732178, "learning_rate": 1.9895007499464325e-05, "loss": 0.2942, "step": 2820 }, { "epoch": 1.8094932649134061, "grad_norm": 1.3777542114257812, "learning_rate": 1.98842939790015e-05, "loss": 0.3025, "step": 2821 }, { "epoch": 1.8101347017318794, "grad_norm": 1.2105672359466553, "learning_rate": 1.9873580458538678e-05, "loss": 0.2489, "step": 2822 }, { "epoch": 1.8107761385503527, "grad_norm": 1.2563953399658203, "learning_rate": 1.9862866938075853e-05, "loss": 0.2787, "step": 2823 }, { "epoch": 1.8114175753688262, "grad_norm": 1.293502688407898, "learning_rate": 1.9852153417613028e-05, "loss": 0.2781, "step": 2824 }, { "epoch": 1.8120590121872997, "grad_norm": 1.2933735847473145, "learning_rate": 1.9841439897150206e-05, "loss": 0.2805, "step": 2825 }, { "epoch": 1.812700449005773, "grad_norm": 1.3499809503555298, "learning_rate": 1.983072637668738e-05, "loss": 0.2803, "step": 2826 }, { "epoch": 1.8133418858242463, "grad_norm": 1.115799903869629, "learning_rate": 1.9820012856224555e-05, "loss": 0.2397, "step": 2827 }, { "epoch": 1.8139833226427196, "grad_norm": 1.1294201612472534, "learning_rate": 1.9809299335761733e-05, "loss": 0.2465, "step": 2828 }, { "epoch": 1.814624759461193, "grad_norm": 1.2189154624938965, "learning_rate": 1.9798585815298908e-05, "loss": 0.2834, "step": 2829 }, { "epoch": 1.8152661962796666, "grad_norm": 1.166540503501892, "learning_rate": 1.9787872294836086e-05, "loss": 0.222, "step": 2830 }, { "epoch": 1.8159076330981399, "grad_norm": 1.2637147903442383, "learning_rate": 1.977715877437326e-05, "loss": 0.2498, "step": 2831 }, { "epoch": 1.8165490699166131, "grad_norm": 1.2110779285430908, "learning_rate": 1.9766445253910436e-05, "loss": 0.2257, "step": 2832 }, { "epoch": 1.8171905067350866, "grad_norm": 1.4091566801071167, "learning_rate": 1.975573173344761e-05, "loss": 0.2686, "step": 2833 }, { "epoch": 1.81783194355356, "grad_norm": 1.3463033437728882, "learning_rate": 1.9745018212984785e-05, "loss": 0.2798, "step": 2834 }, { "epoch": 1.8184733803720334, "grad_norm": 1.2072908878326416, "learning_rate": 1.9734304692521963e-05, "loss": 0.2331, "step": 2835 }, { "epoch": 1.8191148171905067, "grad_norm": 1.1746430397033691, "learning_rate": 1.9723591172059138e-05, "loss": 0.2368, "step": 2836 }, { "epoch": 1.81975625400898, "grad_norm": 1.251786470413208, "learning_rate": 1.9712877651596316e-05, "loss": 0.2683, "step": 2837 }, { "epoch": 1.8203976908274535, "grad_norm": 1.1972320079803467, "learning_rate": 1.970216413113349e-05, "loss": 0.2422, "step": 2838 }, { "epoch": 1.821039127645927, "grad_norm": 1.717531681060791, "learning_rate": 1.9691450610670666e-05, "loss": 0.2576, "step": 2839 }, { "epoch": 1.8216805644644003, "grad_norm": 1.4372601509094238, "learning_rate": 1.9680737090207844e-05, "loss": 0.2984, "step": 2840 }, { "epoch": 1.8223220012828736, "grad_norm": 1.459831953048706, "learning_rate": 1.967002356974502e-05, "loss": 0.2664, "step": 2841 }, { "epoch": 1.8229634381013469, "grad_norm": 1.3613909482955933, "learning_rate": 1.9659310049282194e-05, "loss": 0.275, "step": 2842 }, { "epoch": 1.8236048749198204, "grad_norm": 1.4182071685791016, "learning_rate": 1.9648596528819372e-05, "loss": 0.2751, "step": 2843 }, { "epoch": 1.8242463117382939, "grad_norm": 1.508071780204773, "learning_rate": 1.9637883008356546e-05, "loss": 0.3103, "step": 2844 }, { "epoch": 1.8248877485567672, "grad_norm": 1.5216937065124512, "learning_rate": 1.9627169487893725e-05, "loss": 0.2814, "step": 2845 }, { "epoch": 1.8255291853752404, "grad_norm": 1.3136008977890015, "learning_rate": 1.96164559674309e-05, "loss": 0.2785, "step": 2846 }, { "epoch": 1.826170622193714, "grad_norm": 1.6238435506820679, "learning_rate": 1.9605742446968074e-05, "loss": 0.347, "step": 2847 }, { "epoch": 1.8268120590121875, "grad_norm": 1.0347977876663208, "learning_rate": 1.9595028926505252e-05, "loss": 0.238, "step": 2848 }, { "epoch": 1.8274534958306607, "grad_norm": 1.209008812904358, "learning_rate": 1.9584315406042427e-05, "loss": 0.2546, "step": 2849 }, { "epoch": 1.828094932649134, "grad_norm": 1.140206217765808, "learning_rate": 1.9573601885579602e-05, "loss": 0.2466, "step": 2850 }, { "epoch": 1.8287363694676073, "grad_norm": 1.2714757919311523, "learning_rate": 1.9562888365116777e-05, "loss": 0.2546, "step": 2851 }, { "epoch": 1.8293778062860808, "grad_norm": 1.2258527278900146, "learning_rate": 1.9552174844653955e-05, "loss": 0.2458, "step": 2852 }, { "epoch": 1.8300192431045543, "grad_norm": 1.2672629356384277, "learning_rate": 1.954146132419113e-05, "loss": 0.2505, "step": 2853 }, { "epoch": 1.8306606799230276, "grad_norm": 1.0925387144088745, "learning_rate": 1.9530747803728304e-05, "loss": 0.2258, "step": 2854 }, { "epoch": 1.831302116741501, "grad_norm": 1.4490184783935547, "learning_rate": 1.9520034283265482e-05, "loss": 0.2949, "step": 2855 }, { "epoch": 1.8319435535599742, "grad_norm": 1.1759793758392334, "learning_rate": 1.9509320762802657e-05, "loss": 0.2434, "step": 2856 }, { "epoch": 1.8325849903784477, "grad_norm": 1.3226706981658936, "learning_rate": 1.9498607242339832e-05, "loss": 0.2642, "step": 2857 }, { "epoch": 1.8332264271969212, "grad_norm": 1.3042848110198975, "learning_rate": 1.948789372187701e-05, "loss": 0.273, "step": 2858 }, { "epoch": 1.8338678640153945, "grad_norm": 1.2411062717437744, "learning_rate": 1.9477180201414185e-05, "loss": 0.2486, "step": 2859 }, { "epoch": 1.8345093008338678, "grad_norm": 1.3512247800827026, "learning_rate": 1.9466466680951363e-05, "loss": 0.3024, "step": 2860 }, { "epoch": 1.8351507376523413, "grad_norm": 1.568995475769043, "learning_rate": 1.9455753160488538e-05, "loss": 0.3832, "step": 2861 }, { "epoch": 1.8357921744708148, "grad_norm": 1.3570785522460938, "learning_rate": 1.9445039640025713e-05, "loss": 0.3047, "step": 2862 }, { "epoch": 1.836433611289288, "grad_norm": 1.2090399265289307, "learning_rate": 1.943432611956289e-05, "loss": 0.2345, "step": 2863 }, { "epoch": 1.8370750481077613, "grad_norm": 1.0689505338668823, "learning_rate": 1.9423612599100065e-05, "loss": 0.2527, "step": 2864 }, { "epoch": 1.8377164849262346, "grad_norm": 0.9879593849182129, "learning_rate": 1.9412899078637244e-05, "loss": 0.2258, "step": 2865 }, { "epoch": 1.8383579217447081, "grad_norm": 1.1120315790176392, "learning_rate": 1.940218555817442e-05, "loss": 0.2343, "step": 2866 }, { "epoch": 1.8389993585631816, "grad_norm": 1.4128142595291138, "learning_rate": 1.9391472037711593e-05, "loss": 0.2797, "step": 2867 }, { "epoch": 1.839640795381655, "grad_norm": 1.130672574043274, "learning_rate": 1.9380758517248768e-05, "loss": 0.2541, "step": 2868 }, { "epoch": 1.8402822322001282, "grad_norm": 1.0854462385177612, "learning_rate": 1.9370044996785943e-05, "loss": 0.2233, "step": 2869 }, { "epoch": 1.8409236690186017, "grad_norm": 1.4498248100280762, "learning_rate": 1.935933147632312e-05, "loss": 0.3047, "step": 2870 }, { "epoch": 1.841565105837075, "grad_norm": 1.249896764755249, "learning_rate": 1.9348617955860296e-05, "loss": 0.2404, "step": 2871 }, { "epoch": 1.8422065426555485, "grad_norm": 1.1441004276275635, "learning_rate": 1.9337904435397474e-05, "loss": 0.2469, "step": 2872 }, { "epoch": 1.8428479794740218, "grad_norm": 1.3151154518127441, "learning_rate": 1.932719091493465e-05, "loss": 0.2732, "step": 2873 }, { "epoch": 1.843489416292495, "grad_norm": 1.35737144947052, "learning_rate": 1.9316477394471823e-05, "loss": 0.2681, "step": 2874 }, { "epoch": 1.8441308531109686, "grad_norm": 1.2066998481750488, "learning_rate": 1.9305763874009e-05, "loss": 0.2309, "step": 2875 }, { "epoch": 1.844772289929442, "grad_norm": 1.232856273651123, "learning_rate": 1.9295050353546176e-05, "loss": 0.2469, "step": 2876 }, { "epoch": 1.8454137267479154, "grad_norm": 1.0799235105514526, "learning_rate": 1.928433683308335e-05, "loss": 0.2292, "step": 2877 }, { "epoch": 1.8460551635663887, "grad_norm": 1.2436803579330444, "learning_rate": 1.927362331262053e-05, "loss": 0.2641, "step": 2878 }, { "epoch": 1.846696600384862, "grad_norm": 1.4357938766479492, "learning_rate": 1.9262909792157704e-05, "loss": 0.2762, "step": 2879 }, { "epoch": 1.8473380372033354, "grad_norm": 1.4985015392303467, "learning_rate": 1.9252196271694882e-05, "loss": 0.2718, "step": 2880 }, { "epoch": 1.847979474021809, "grad_norm": 1.43358314037323, "learning_rate": 1.9241482751232057e-05, "loss": 0.2538, "step": 2881 }, { "epoch": 1.8486209108402822, "grad_norm": 1.2886337041854858, "learning_rate": 1.923076923076923e-05, "loss": 0.2527, "step": 2882 }, { "epoch": 1.8492623476587555, "grad_norm": 1.67549729347229, "learning_rate": 1.922005571030641e-05, "loss": 0.3101, "step": 2883 }, { "epoch": 1.849903784477229, "grad_norm": 1.6565495729446411, "learning_rate": 1.920934218984358e-05, "loss": 0.3343, "step": 2884 }, { "epoch": 1.8505452212957025, "grad_norm": 1.1547706127166748, "learning_rate": 1.919862866938076e-05, "loss": 0.2434, "step": 2885 }, { "epoch": 1.8511866581141758, "grad_norm": 1.3563439846038818, "learning_rate": 1.9187915148917934e-05, "loss": 0.2851, "step": 2886 }, { "epoch": 1.851828094932649, "grad_norm": 1.0979413986206055, "learning_rate": 1.9177201628455112e-05, "loss": 0.2121, "step": 2887 }, { "epoch": 1.8524695317511224, "grad_norm": 1.2406891584396362, "learning_rate": 1.9166488107992287e-05, "loss": 0.2791, "step": 2888 }, { "epoch": 1.853110968569596, "grad_norm": 1.3393582105636597, "learning_rate": 1.915577458752946e-05, "loss": 0.2454, "step": 2889 }, { "epoch": 1.8537524053880694, "grad_norm": 1.2373778820037842, "learning_rate": 1.914506106706664e-05, "loss": 0.2824, "step": 2890 }, { "epoch": 1.8543938422065427, "grad_norm": 1.6845827102661133, "learning_rate": 1.9134347546603814e-05, "loss": 0.3498, "step": 2891 }, { "epoch": 1.855035279025016, "grad_norm": 1.1811977624893188, "learning_rate": 1.912363402614099e-05, "loss": 0.2461, "step": 2892 }, { "epoch": 1.8556767158434893, "grad_norm": 1.7543511390686035, "learning_rate": 1.9112920505678167e-05, "loss": 0.361, "step": 2893 }, { "epoch": 1.8563181526619628, "grad_norm": 1.0958515405654907, "learning_rate": 1.9102206985215342e-05, "loss": 0.2417, "step": 2894 }, { "epoch": 1.8569595894804363, "grad_norm": 1.2737470865249634, "learning_rate": 1.909149346475252e-05, "loss": 0.2443, "step": 2895 }, { "epoch": 1.8576010262989096, "grad_norm": 1.5078356266021729, "learning_rate": 1.9080779944289695e-05, "loss": 0.2781, "step": 2896 }, { "epoch": 1.8582424631173828, "grad_norm": 1.3866959810256958, "learning_rate": 1.907006642382687e-05, "loss": 0.2873, "step": 2897 }, { "epoch": 1.8588838999358563, "grad_norm": 1.3449560403823853, "learning_rate": 1.9059352903364048e-05, "loss": 0.2914, "step": 2898 }, { "epoch": 1.8595253367543298, "grad_norm": 1.64473295211792, "learning_rate": 1.9048639382901223e-05, "loss": 0.2959, "step": 2899 }, { "epoch": 1.8601667735728031, "grad_norm": 1.2009084224700928, "learning_rate": 1.90379258624384e-05, "loss": 0.2361, "step": 2900 }, { "epoch": 1.8608082103912764, "grad_norm": 1.332340121269226, "learning_rate": 1.9027212341975576e-05, "loss": 0.2504, "step": 2901 }, { "epoch": 1.8614496472097497, "grad_norm": 1.141594409942627, "learning_rate": 1.901649882151275e-05, "loss": 0.2304, "step": 2902 }, { "epoch": 1.8620910840282232, "grad_norm": 1.147766351699829, "learning_rate": 1.9005785301049925e-05, "loss": 0.2452, "step": 2903 }, { "epoch": 1.8627325208466967, "grad_norm": 1.030208706855774, "learning_rate": 1.89950717805871e-05, "loss": 0.2283, "step": 2904 }, { "epoch": 1.86337395766517, "grad_norm": 1.055985689163208, "learning_rate": 1.8984358260124278e-05, "loss": 0.2225, "step": 2905 }, { "epoch": 1.8640153944836433, "grad_norm": 1.3118748664855957, "learning_rate": 1.8973644739661453e-05, "loss": 0.33, "step": 2906 }, { "epoch": 1.8646568313021168, "grad_norm": 1.2922016382217407, "learning_rate": 1.8962931219198628e-05, "loss": 0.2875, "step": 2907 }, { "epoch": 1.86529826812059, "grad_norm": 1.3969027996063232, "learning_rate": 1.8952217698735806e-05, "loss": 0.2802, "step": 2908 }, { "epoch": 1.8659397049390636, "grad_norm": 1.1715315580368042, "learning_rate": 1.894150417827298e-05, "loss": 0.2334, "step": 2909 }, { "epoch": 1.8665811417575369, "grad_norm": 1.422223687171936, "learning_rate": 1.893079065781016e-05, "loss": 0.3207, "step": 2910 }, { "epoch": 1.8672225785760102, "grad_norm": 1.0969573259353638, "learning_rate": 1.8920077137347333e-05, "loss": 0.243, "step": 2911 }, { "epoch": 1.8678640153944837, "grad_norm": 1.1657963991165161, "learning_rate": 1.8909363616884508e-05, "loss": 0.2397, "step": 2912 }, { "epoch": 1.8685054522129572, "grad_norm": 1.1590511798858643, "learning_rate": 1.8898650096421686e-05, "loss": 0.2281, "step": 2913 }, { "epoch": 1.8691468890314304, "grad_norm": 1.126690149307251, "learning_rate": 1.888793657595886e-05, "loss": 0.2403, "step": 2914 }, { "epoch": 1.8697883258499037, "grad_norm": 1.4560147523880005, "learning_rate": 1.887722305549604e-05, "loss": 0.2981, "step": 2915 }, { "epoch": 1.870429762668377, "grad_norm": 1.3078945875167847, "learning_rate": 1.8866509535033214e-05, "loss": 0.2537, "step": 2916 }, { "epoch": 1.8710711994868505, "grad_norm": 1.2244306802749634, "learning_rate": 1.885579601457039e-05, "loss": 0.2507, "step": 2917 }, { "epoch": 1.871712636305324, "grad_norm": 1.2883838415145874, "learning_rate": 1.8845082494107567e-05, "loss": 0.2641, "step": 2918 }, { "epoch": 1.8723540731237973, "grad_norm": 1.2628684043884277, "learning_rate": 1.8834368973644738e-05, "loss": 0.2663, "step": 2919 }, { "epoch": 1.8729955099422706, "grad_norm": 1.320689082145691, "learning_rate": 1.8823655453181916e-05, "loss": 0.275, "step": 2920 }, { "epoch": 1.873636946760744, "grad_norm": 1.14690101146698, "learning_rate": 1.881294193271909e-05, "loss": 0.2517, "step": 2921 }, { "epoch": 1.8742783835792176, "grad_norm": 1.2523187398910522, "learning_rate": 1.8802228412256266e-05, "loss": 0.2801, "step": 2922 }, { "epoch": 1.874919820397691, "grad_norm": 1.2537097930908203, "learning_rate": 1.8791514891793444e-05, "loss": 0.2524, "step": 2923 }, { "epoch": 1.8755612572161642, "grad_norm": 1.3243439197540283, "learning_rate": 1.878080137133062e-05, "loss": 0.3097, "step": 2924 }, { "epoch": 1.8762026940346375, "grad_norm": 1.2899843454360962, "learning_rate": 1.8770087850867797e-05, "loss": 0.2667, "step": 2925 }, { "epoch": 1.876844130853111, "grad_norm": 1.2986458539962769, "learning_rate": 1.8759374330404972e-05, "loss": 0.2794, "step": 2926 }, { "epoch": 1.8774855676715845, "grad_norm": 0.9228143692016602, "learning_rate": 1.8748660809942146e-05, "loss": 0.2242, "step": 2927 }, { "epoch": 1.8781270044900578, "grad_norm": 1.0716543197631836, "learning_rate": 1.8737947289479325e-05, "loss": 0.2493, "step": 2928 }, { "epoch": 1.878768441308531, "grad_norm": 1.1595958471298218, "learning_rate": 1.87272337690165e-05, "loss": 0.2638, "step": 2929 }, { "epoch": 1.8794098781270043, "grad_norm": 1.5315141677856445, "learning_rate": 1.8716520248553678e-05, "loss": 0.3146, "step": 2930 }, { "epoch": 1.8800513149454778, "grad_norm": 1.0838031768798828, "learning_rate": 1.8705806728090852e-05, "loss": 0.2602, "step": 2931 }, { "epoch": 1.8806927517639513, "grad_norm": 1.235734462738037, "learning_rate": 1.8695093207628027e-05, "loss": 0.2607, "step": 2932 }, { "epoch": 1.8813341885824246, "grad_norm": 1.2216901779174805, "learning_rate": 1.8684379687165205e-05, "loss": 0.2676, "step": 2933 }, { "epoch": 1.881975625400898, "grad_norm": 1.3639174699783325, "learning_rate": 1.867366616670238e-05, "loss": 0.2875, "step": 2934 }, { "epoch": 1.8826170622193714, "grad_norm": 1.3871283531188965, "learning_rate": 1.8662952646239558e-05, "loss": 0.2334, "step": 2935 }, { "epoch": 1.883258499037845, "grad_norm": 1.4074182510375977, "learning_rate": 1.865223912577673e-05, "loss": 0.306, "step": 2936 }, { "epoch": 1.8838999358563182, "grad_norm": 1.4780516624450684, "learning_rate": 1.8641525605313904e-05, "loss": 0.2704, "step": 2937 }, { "epoch": 1.8845413726747915, "grad_norm": 1.651982307434082, "learning_rate": 1.8630812084851082e-05, "loss": 0.3453, "step": 2938 }, { "epoch": 1.8851828094932648, "grad_norm": 1.3747361898422241, "learning_rate": 1.8620098564388257e-05, "loss": 0.2836, "step": 2939 }, { "epoch": 1.8858242463117383, "grad_norm": 1.0580253601074219, "learning_rate": 1.8609385043925435e-05, "loss": 0.2413, "step": 2940 }, { "epoch": 1.8864656831302118, "grad_norm": 1.5023243427276611, "learning_rate": 1.859867152346261e-05, "loss": 0.2834, "step": 2941 }, { "epoch": 1.887107119948685, "grad_norm": 1.2132879495620728, "learning_rate": 1.8587958002999785e-05, "loss": 0.2481, "step": 2942 }, { "epoch": 1.8877485567671584, "grad_norm": 1.4404101371765137, "learning_rate": 1.8577244482536963e-05, "loss": 0.2962, "step": 2943 }, { "epoch": 1.8883899935856319, "grad_norm": 1.3221436738967896, "learning_rate": 1.8566530962074138e-05, "loss": 0.2978, "step": 2944 }, { "epoch": 1.8890314304041051, "grad_norm": 1.3946473598480225, "learning_rate": 1.8555817441611316e-05, "loss": 0.3062, "step": 2945 }, { "epoch": 1.8896728672225787, "grad_norm": 1.1614596843719482, "learning_rate": 1.854510392114849e-05, "loss": 0.2844, "step": 2946 }, { "epoch": 1.890314304041052, "grad_norm": 1.0883163213729858, "learning_rate": 1.8534390400685665e-05, "loss": 0.2322, "step": 2947 }, { "epoch": 1.8909557408595252, "grad_norm": 1.003301978111267, "learning_rate": 1.8523676880222844e-05, "loss": 0.2235, "step": 2948 }, { "epoch": 1.8915971776779987, "grad_norm": 1.2631417512893677, "learning_rate": 1.851296335976002e-05, "loss": 0.2428, "step": 2949 }, { "epoch": 1.8922386144964722, "grad_norm": 1.2674521207809448, "learning_rate": 1.8502249839297196e-05, "loss": 0.2343, "step": 2950 }, { "epoch": 1.8928800513149455, "grad_norm": 1.495101809501648, "learning_rate": 1.849153631883437e-05, "loss": 0.3023, "step": 2951 }, { "epoch": 1.8935214881334188, "grad_norm": 1.6031147241592407, "learning_rate": 1.8480822798371546e-05, "loss": 0.3404, "step": 2952 }, { "epoch": 1.894162924951892, "grad_norm": 1.8350602388381958, "learning_rate": 1.847010927790872e-05, "loss": 0.387, "step": 2953 }, { "epoch": 1.8948043617703656, "grad_norm": 1.0959059000015259, "learning_rate": 1.8459395757445895e-05, "loss": 0.2621, "step": 2954 }, { "epoch": 1.895445798588839, "grad_norm": 1.3804062604904175, "learning_rate": 1.8448682236983074e-05, "loss": 0.2601, "step": 2955 }, { "epoch": 1.8960872354073124, "grad_norm": 1.3562461137771606, "learning_rate": 1.843796871652025e-05, "loss": 0.2899, "step": 2956 }, { "epoch": 1.8967286722257857, "grad_norm": 1.3094033002853394, "learning_rate": 1.8427255196057423e-05, "loss": 0.2743, "step": 2957 }, { "epoch": 1.8973701090442592, "grad_norm": 1.1560701131820679, "learning_rate": 1.84165416755946e-05, "loss": 0.2399, "step": 2958 }, { "epoch": 1.8980115458627327, "grad_norm": 1.1370182037353516, "learning_rate": 1.8405828155131776e-05, "loss": 0.2322, "step": 2959 }, { "epoch": 1.898652982681206, "grad_norm": 1.5980805158615112, "learning_rate": 1.8395114634668954e-05, "loss": 0.3315, "step": 2960 }, { "epoch": 1.8992944194996793, "grad_norm": 1.1557945013046265, "learning_rate": 1.838440111420613e-05, "loss": 0.2405, "step": 2961 }, { "epoch": 1.8999358563181525, "grad_norm": 1.3413949012756348, "learning_rate": 1.8373687593743304e-05, "loss": 0.2438, "step": 2962 }, { "epoch": 1.900577293136626, "grad_norm": 1.2336015701293945, "learning_rate": 1.8362974073280482e-05, "loss": 0.2856, "step": 2963 }, { "epoch": 1.9012187299550996, "grad_norm": 1.0245451927185059, "learning_rate": 1.8352260552817657e-05, "loss": 0.2285, "step": 2964 }, { "epoch": 1.9018601667735728, "grad_norm": 1.1723086833953857, "learning_rate": 1.8341547032354835e-05, "loss": 0.2356, "step": 2965 }, { "epoch": 1.9025016035920461, "grad_norm": 1.3550771474838257, "learning_rate": 1.833083351189201e-05, "loss": 0.2736, "step": 2966 }, { "epoch": 1.9031430404105194, "grad_norm": 1.450952410697937, "learning_rate": 1.8320119991429184e-05, "loss": 0.2774, "step": 2967 }, { "epoch": 1.903784477228993, "grad_norm": 0.9889509081840515, "learning_rate": 1.8309406470966362e-05, "loss": 0.2184, "step": 2968 }, { "epoch": 1.9044259140474664, "grad_norm": 1.2749128341674805, "learning_rate": 1.8298692950503537e-05, "loss": 0.2746, "step": 2969 }, { "epoch": 1.9050673508659397, "grad_norm": 1.4219648838043213, "learning_rate": 1.8287979430040712e-05, "loss": 0.3507, "step": 2970 }, { "epoch": 1.905708787684413, "grad_norm": 0.9860979318618774, "learning_rate": 1.8277265909577887e-05, "loss": 0.2408, "step": 2971 }, { "epoch": 1.9063502245028865, "grad_norm": 1.1361294984817505, "learning_rate": 1.826655238911506e-05, "loss": 0.2244, "step": 2972 }, { "epoch": 1.90699166132136, "grad_norm": 1.4251682758331299, "learning_rate": 1.825583886865224e-05, "loss": 0.2655, "step": 2973 }, { "epoch": 1.9076330981398333, "grad_norm": 1.105365514755249, "learning_rate": 1.8245125348189414e-05, "loss": 0.2383, "step": 2974 }, { "epoch": 1.9082745349583066, "grad_norm": 1.2831215858459473, "learning_rate": 1.8234411827726593e-05, "loss": 0.2657, "step": 2975 }, { "epoch": 1.9089159717767799, "grad_norm": 1.1376351118087769, "learning_rate": 1.8223698307263767e-05, "loss": 0.2457, "step": 2976 }, { "epoch": 1.9095574085952534, "grad_norm": 1.1115778684616089, "learning_rate": 1.8212984786800942e-05, "loss": 0.2313, "step": 2977 }, { "epoch": 1.9101988454137269, "grad_norm": 1.1635206937789917, "learning_rate": 1.820227126633812e-05, "loss": 0.2389, "step": 2978 }, { "epoch": 1.9108402822322001, "grad_norm": 1.3364347219467163, "learning_rate": 1.8191557745875295e-05, "loss": 0.2633, "step": 2979 }, { "epoch": 1.9114817190506734, "grad_norm": 1.3800411224365234, "learning_rate": 1.8180844225412473e-05, "loss": 0.2606, "step": 2980 }, { "epoch": 1.912123155869147, "grad_norm": 1.1631747484207153, "learning_rate": 1.8170130704949648e-05, "loss": 0.2277, "step": 2981 }, { "epoch": 1.9127645926876202, "grad_norm": 1.427559733390808, "learning_rate": 1.8159417184486823e-05, "loss": 0.2294, "step": 2982 }, { "epoch": 1.9134060295060937, "grad_norm": 1.3161547183990479, "learning_rate": 1.8148703664024e-05, "loss": 0.2954, "step": 2983 }, { "epoch": 1.914047466324567, "grad_norm": 1.3244918584823608, "learning_rate": 1.8137990143561176e-05, "loss": 0.2856, "step": 2984 }, { "epoch": 1.9146889031430403, "grad_norm": 1.1136788129806519, "learning_rate": 1.8127276623098354e-05, "loss": 0.2275, "step": 2985 }, { "epoch": 1.9153303399615138, "grad_norm": 1.2512469291687012, "learning_rate": 1.811656310263553e-05, "loss": 0.274, "step": 2986 }, { "epoch": 1.9159717767799873, "grad_norm": 1.3768032789230347, "learning_rate": 1.81058495821727e-05, "loss": 0.292, "step": 2987 }, { "epoch": 1.9166132135984606, "grad_norm": 1.1369327306747437, "learning_rate": 1.8095136061709878e-05, "loss": 0.2114, "step": 2988 }, { "epoch": 1.9172546504169339, "grad_norm": 1.0021921396255493, "learning_rate": 1.8084422541247053e-05, "loss": 0.2367, "step": 2989 }, { "epoch": 1.9178960872354072, "grad_norm": 1.5238621234893799, "learning_rate": 1.807370902078423e-05, "loss": 0.3498, "step": 2990 }, { "epoch": 1.9185375240538807, "grad_norm": 1.4855083227157593, "learning_rate": 1.8062995500321406e-05, "loss": 0.3182, "step": 2991 }, { "epoch": 1.9191789608723542, "grad_norm": 1.1731650829315186, "learning_rate": 1.805228197985858e-05, "loss": 0.2492, "step": 2992 }, { "epoch": 1.9198203976908275, "grad_norm": 1.615875005722046, "learning_rate": 1.804156845939576e-05, "loss": 0.2507, "step": 2993 }, { "epoch": 1.9204618345093007, "grad_norm": 1.0483556985855103, "learning_rate": 1.8030854938932933e-05, "loss": 0.2393, "step": 2994 }, { "epoch": 1.9211032713277743, "grad_norm": 1.029128909111023, "learning_rate": 1.802014141847011e-05, "loss": 0.2126, "step": 2995 }, { "epoch": 1.9217447081462478, "grad_norm": 1.244898796081543, "learning_rate": 1.8009427898007286e-05, "loss": 0.2735, "step": 2996 }, { "epoch": 1.922386144964721, "grad_norm": 1.36542809009552, "learning_rate": 1.799871437754446e-05, "loss": 0.294, "step": 2997 }, { "epoch": 1.9230275817831943, "grad_norm": 1.3502304553985596, "learning_rate": 1.798800085708164e-05, "loss": 0.2554, "step": 2998 }, { "epoch": 1.9236690186016676, "grad_norm": 1.0062932968139648, "learning_rate": 1.7977287336618814e-05, "loss": 0.2063, "step": 2999 }, { "epoch": 1.9243104554201411, "grad_norm": 1.5326350927352905, "learning_rate": 1.7966573816155992e-05, "loss": 0.3303, "step": 3000 }, { "epoch": 1.9249518922386146, "grad_norm": 1.2404776811599731, "learning_rate": 1.7955860295693167e-05, "loss": 0.2673, "step": 3001 }, { "epoch": 1.925593329057088, "grad_norm": 1.2313414812088013, "learning_rate": 1.794514677523034e-05, "loss": 0.2567, "step": 3002 }, { "epoch": 1.9262347658755612, "grad_norm": 1.2884515523910522, "learning_rate": 1.793443325476752e-05, "loss": 0.2705, "step": 3003 }, { "epoch": 1.9268762026940345, "grad_norm": 1.1908575296401978, "learning_rate": 1.7923719734304695e-05, "loss": 0.248, "step": 3004 }, { "epoch": 1.927517639512508, "grad_norm": 1.1131994724273682, "learning_rate": 1.791300621384187e-05, "loss": 0.2221, "step": 3005 }, { "epoch": 1.9281590763309815, "grad_norm": 1.4695026874542236, "learning_rate": 1.7902292693379044e-05, "loss": 0.2692, "step": 3006 }, { "epoch": 1.9288005131494548, "grad_norm": 1.5634608268737793, "learning_rate": 1.789157917291622e-05, "loss": 0.313, "step": 3007 }, { "epoch": 1.929441949967928, "grad_norm": 1.443362832069397, "learning_rate": 1.7880865652453397e-05, "loss": 0.2891, "step": 3008 }, { "epoch": 1.9300833867864016, "grad_norm": 1.0011996030807495, "learning_rate": 1.7870152131990572e-05, "loss": 0.2133, "step": 3009 }, { "epoch": 1.930724823604875, "grad_norm": 1.4160627126693726, "learning_rate": 1.785943861152775e-05, "loss": 0.2998, "step": 3010 }, { "epoch": 1.9313662604233484, "grad_norm": 1.2188184261322021, "learning_rate": 1.7848725091064925e-05, "loss": 0.2401, "step": 3011 }, { "epoch": 1.9320076972418216, "grad_norm": 1.5028479099273682, "learning_rate": 1.78380115706021e-05, "loss": 0.3056, "step": 3012 }, { "epoch": 1.932649134060295, "grad_norm": 1.0644205808639526, "learning_rate": 1.7827298050139278e-05, "loss": 0.2452, "step": 3013 }, { "epoch": 1.9332905708787684, "grad_norm": 1.4070881605148315, "learning_rate": 1.7816584529676452e-05, "loss": 0.3015, "step": 3014 }, { "epoch": 1.933932007697242, "grad_norm": 1.3393936157226562, "learning_rate": 1.780587100921363e-05, "loss": 0.2936, "step": 3015 }, { "epoch": 1.9345734445157152, "grad_norm": 1.0629676580429077, "learning_rate": 1.7795157488750805e-05, "loss": 0.2075, "step": 3016 }, { "epoch": 1.9352148813341885, "grad_norm": 1.2843185663223267, "learning_rate": 1.778444396828798e-05, "loss": 0.2686, "step": 3017 }, { "epoch": 1.935856318152662, "grad_norm": 1.2578312158584595, "learning_rate": 1.7773730447825158e-05, "loss": 0.3017, "step": 3018 }, { "epoch": 1.9364977549711353, "grad_norm": 1.2323354482650757, "learning_rate": 1.7763016927362333e-05, "loss": 0.248, "step": 3019 }, { "epoch": 1.9371391917896088, "grad_norm": 1.3464654684066772, "learning_rate": 1.7752303406899508e-05, "loss": 0.2793, "step": 3020 }, { "epoch": 1.937780628608082, "grad_norm": 1.3061271905899048, "learning_rate": 1.7741589886436686e-05, "loss": 0.2979, "step": 3021 }, { "epoch": 1.9384220654265554, "grad_norm": 1.2221277952194214, "learning_rate": 1.7730876365973857e-05, "loss": 0.2503, "step": 3022 }, { "epoch": 1.9390635022450289, "grad_norm": 1.5747045278549194, "learning_rate": 1.7720162845511035e-05, "loss": 0.3567, "step": 3023 }, { "epoch": 1.9397049390635024, "grad_norm": 1.1148639917373657, "learning_rate": 1.770944932504821e-05, "loss": 0.2456, "step": 3024 }, { "epoch": 1.9403463758819757, "grad_norm": 1.173774242401123, "learning_rate": 1.7698735804585388e-05, "loss": 0.2661, "step": 3025 }, { "epoch": 1.940987812700449, "grad_norm": 1.3056652545928955, "learning_rate": 1.7688022284122563e-05, "loss": 0.2632, "step": 3026 }, { "epoch": 1.9416292495189222, "grad_norm": 1.4924780130386353, "learning_rate": 1.7677308763659738e-05, "loss": 0.2739, "step": 3027 }, { "epoch": 1.9422706863373957, "grad_norm": 1.0539811849594116, "learning_rate": 1.7666595243196916e-05, "loss": 0.2261, "step": 3028 }, { "epoch": 1.9429121231558693, "grad_norm": 1.3523048162460327, "learning_rate": 1.765588172273409e-05, "loss": 0.3097, "step": 3029 }, { "epoch": 1.9435535599743425, "grad_norm": 1.2333487272262573, "learning_rate": 1.764516820227127e-05, "loss": 0.2523, "step": 3030 }, { "epoch": 1.9441949967928158, "grad_norm": 1.4489803314208984, "learning_rate": 1.7634454681808444e-05, "loss": 0.2831, "step": 3031 }, { "epoch": 1.9448364336112893, "grad_norm": 1.2186720371246338, "learning_rate": 1.7623741161345618e-05, "loss": 0.255, "step": 3032 }, { "epoch": 1.9454778704297628, "grad_norm": 1.368416666984558, "learning_rate": 1.7613027640882796e-05, "loss": 0.2921, "step": 3033 }, { "epoch": 1.9461193072482361, "grad_norm": 1.514708399772644, "learning_rate": 1.760231412041997e-05, "loss": 0.3287, "step": 3034 }, { "epoch": 1.9467607440667094, "grad_norm": 1.237584114074707, "learning_rate": 1.7591600599957146e-05, "loss": 0.235, "step": 3035 }, { "epoch": 1.9474021808851827, "grad_norm": 1.3979055881500244, "learning_rate": 1.7580887079494324e-05, "loss": 0.2743, "step": 3036 }, { "epoch": 1.9480436177036562, "grad_norm": 1.4024184942245483, "learning_rate": 1.75701735590315e-05, "loss": 0.2621, "step": 3037 }, { "epoch": 1.9486850545221297, "grad_norm": 1.0158419609069824, "learning_rate": 1.7559460038568677e-05, "loss": 0.203, "step": 3038 }, { "epoch": 1.949326491340603, "grad_norm": 1.0215867757797241, "learning_rate": 1.754874651810585e-05, "loss": 0.2411, "step": 3039 }, { "epoch": 1.9499679281590763, "grad_norm": 1.2646123170852661, "learning_rate": 1.7538032997643027e-05, "loss": 0.2796, "step": 3040 }, { "epoch": 1.9506093649775496, "grad_norm": 1.5651026964187622, "learning_rate": 1.75273194771802e-05, "loss": 0.2731, "step": 3041 }, { "epoch": 1.951250801796023, "grad_norm": 1.0898442268371582, "learning_rate": 1.7516605956717376e-05, "loss": 0.2286, "step": 3042 }, { "epoch": 1.9518922386144966, "grad_norm": 1.4611003398895264, "learning_rate": 1.7505892436254554e-05, "loss": 0.2825, "step": 3043 }, { "epoch": 1.9525336754329699, "grad_norm": 1.2309980392456055, "learning_rate": 1.749517891579173e-05, "loss": 0.2762, "step": 3044 }, { "epoch": 1.9531751122514431, "grad_norm": 1.12349271774292, "learning_rate": 1.7484465395328907e-05, "loss": 0.2059, "step": 3045 }, { "epoch": 1.9538165490699166, "grad_norm": 1.2761781215667725, "learning_rate": 1.7473751874866082e-05, "loss": 0.2485, "step": 3046 }, { "epoch": 1.9544579858883901, "grad_norm": 1.2490167617797852, "learning_rate": 1.7463038354403257e-05, "loss": 0.2716, "step": 3047 }, { "epoch": 1.9550994227068634, "grad_norm": 1.4777674674987793, "learning_rate": 1.7452324833940435e-05, "loss": 0.325, "step": 3048 }, { "epoch": 1.9557408595253367, "grad_norm": 1.043175220489502, "learning_rate": 1.744161131347761e-05, "loss": 0.2348, "step": 3049 }, { "epoch": 1.95638229634381, "grad_norm": 1.1208642721176147, "learning_rate": 1.7430897793014784e-05, "loss": 0.2451, "step": 3050 }, { "epoch": 1.9570237331622835, "grad_norm": 1.3023686408996582, "learning_rate": 1.7420184272551962e-05, "loss": 0.2745, "step": 3051 }, { "epoch": 1.957665169980757, "grad_norm": 1.096551537513733, "learning_rate": 1.7409470752089137e-05, "loss": 0.221, "step": 3052 }, { "epoch": 1.9583066067992303, "grad_norm": 1.2288082838058472, "learning_rate": 1.7398757231626315e-05, "loss": 0.246, "step": 3053 }, { "epoch": 1.9589480436177036, "grad_norm": 1.2888870239257812, "learning_rate": 1.738804371116349e-05, "loss": 0.2343, "step": 3054 }, { "epoch": 1.959589480436177, "grad_norm": 1.7487210035324097, "learning_rate": 1.7377330190700665e-05, "loss": 0.3475, "step": 3055 }, { "epoch": 1.9602309172546504, "grad_norm": 1.04288911819458, "learning_rate": 1.736661667023784e-05, "loss": 0.2253, "step": 3056 }, { "epoch": 1.9608723540731239, "grad_norm": 0.9953528046607971, "learning_rate": 1.7355903149775014e-05, "loss": 0.2307, "step": 3057 }, { "epoch": 1.9615137908915972, "grad_norm": 1.0966219902038574, "learning_rate": 1.7345189629312193e-05, "loss": 0.2446, "step": 3058 }, { "epoch": 1.9621552277100704, "grad_norm": 1.342126488685608, "learning_rate": 1.7334476108849367e-05, "loss": 0.2617, "step": 3059 }, { "epoch": 1.962796664528544, "grad_norm": 1.9366670846939087, "learning_rate": 1.7323762588386545e-05, "loss": 0.4101, "step": 3060 }, { "epoch": 1.9634381013470175, "grad_norm": 1.3125391006469727, "learning_rate": 1.731304906792372e-05, "loss": 0.2691, "step": 3061 }, { "epoch": 1.9640795381654907, "grad_norm": 1.2606667280197144, "learning_rate": 1.7302335547460895e-05, "loss": 0.2432, "step": 3062 }, { "epoch": 1.964720974983964, "grad_norm": 1.1786777973175049, "learning_rate": 1.7291622026998073e-05, "loss": 0.239, "step": 3063 }, { "epoch": 1.9653624118024373, "grad_norm": 1.1185567378997803, "learning_rate": 1.7280908506535248e-05, "loss": 0.2427, "step": 3064 }, { "epoch": 1.9660038486209108, "grad_norm": 0.9320878386497498, "learning_rate": 1.7270194986072426e-05, "loss": 0.201, "step": 3065 }, { "epoch": 1.9666452854393843, "grad_norm": 1.0495415925979614, "learning_rate": 1.72594814656096e-05, "loss": 0.2237, "step": 3066 }, { "epoch": 1.9672867222578576, "grad_norm": 1.17137610912323, "learning_rate": 1.7248767945146776e-05, "loss": 0.2422, "step": 3067 }, { "epoch": 1.967928159076331, "grad_norm": 1.2419120073318481, "learning_rate": 1.7238054424683954e-05, "loss": 0.2719, "step": 3068 }, { "epoch": 1.9685695958948044, "grad_norm": 1.5268306732177734, "learning_rate": 1.722734090422113e-05, "loss": 0.3515, "step": 3069 }, { "epoch": 1.9692110327132777, "grad_norm": 1.2866042852401733, "learning_rate": 1.7216627383758303e-05, "loss": 0.2554, "step": 3070 }, { "epoch": 1.9698524695317512, "grad_norm": 1.0857415199279785, "learning_rate": 1.720591386329548e-05, "loss": 0.2352, "step": 3071 }, { "epoch": 1.9704939063502245, "grad_norm": 1.2610812187194824, "learning_rate": 1.7195200342832656e-05, "loss": 0.2398, "step": 3072 }, { "epoch": 1.9711353431686978, "grad_norm": 1.3375837802886963, "learning_rate": 1.718448682236983e-05, "loss": 0.2496, "step": 3073 }, { "epoch": 1.9717767799871713, "grad_norm": 1.4237446784973145, "learning_rate": 1.7173773301907006e-05, "loss": 0.2648, "step": 3074 }, { "epoch": 1.9724182168056448, "grad_norm": 1.1972532272338867, "learning_rate": 1.7163059781444184e-05, "loss": 0.236, "step": 3075 }, { "epoch": 1.973059653624118, "grad_norm": 1.3224481344223022, "learning_rate": 1.715234626098136e-05, "loss": 0.271, "step": 3076 }, { "epoch": 1.9737010904425913, "grad_norm": 1.4129737615585327, "learning_rate": 1.7141632740518533e-05, "loss": 0.2922, "step": 3077 }, { "epoch": 1.9743425272610646, "grad_norm": 1.6794908046722412, "learning_rate": 1.713091922005571e-05, "loss": 0.3592, "step": 3078 }, { "epoch": 1.9749839640795381, "grad_norm": 1.2249882221221924, "learning_rate": 1.7120205699592886e-05, "loss": 0.255, "step": 3079 }, { "epoch": 1.9756254008980116, "grad_norm": 1.2134740352630615, "learning_rate": 1.7109492179130064e-05, "loss": 0.2589, "step": 3080 }, { "epoch": 1.976266837716485, "grad_norm": 1.67762291431427, "learning_rate": 1.709877865866724e-05, "loss": 0.2688, "step": 3081 }, { "epoch": 1.9769082745349582, "grad_norm": 1.2949259281158447, "learning_rate": 1.7088065138204414e-05, "loss": 0.2646, "step": 3082 }, { "epoch": 1.9775497113534317, "grad_norm": 1.3241339921951294, "learning_rate": 1.7077351617741592e-05, "loss": 0.2758, "step": 3083 }, { "epoch": 1.9781911481719052, "grad_norm": 1.0809659957885742, "learning_rate": 1.7066638097278767e-05, "loss": 0.24, "step": 3084 }, { "epoch": 1.9788325849903785, "grad_norm": 1.4088443517684937, "learning_rate": 1.705592457681594e-05, "loss": 0.2417, "step": 3085 }, { "epoch": 1.9794740218088518, "grad_norm": 1.5836800336837769, "learning_rate": 1.704521105635312e-05, "loss": 0.2972, "step": 3086 }, { "epoch": 1.980115458627325, "grad_norm": 0.9479689598083496, "learning_rate": 1.7034497535890294e-05, "loss": 0.2037, "step": 3087 }, { "epoch": 1.9807568954457986, "grad_norm": 1.246972680091858, "learning_rate": 1.7023784015427473e-05, "loss": 0.2974, "step": 3088 }, { "epoch": 1.981398332264272, "grad_norm": 1.1176539659500122, "learning_rate": 1.7013070494964647e-05, "loss": 0.2418, "step": 3089 }, { "epoch": 1.9820397690827454, "grad_norm": 1.1841623783111572, "learning_rate": 1.7002356974501822e-05, "loss": 0.2575, "step": 3090 }, { "epoch": 1.9826812059012187, "grad_norm": 0.9692655205726624, "learning_rate": 1.6991643454038997e-05, "loss": 0.2251, "step": 3091 }, { "epoch": 1.9833226427196922, "grad_norm": 1.2622815370559692, "learning_rate": 1.698092993357617e-05, "loss": 0.2623, "step": 3092 }, { "epoch": 1.9839640795381654, "grad_norm": 1.0218009948730469, "learning_rate": 1.697021641311335e-05, "loss": 0.2357, "step": 3093 }, { "epoch": 1.984605516356639, "grad_norm": 1.4036316871643066, "learning_rate": 1.6959502892650525e-05, "loss": 0.2982, "step": 3094 }, { "epoch": 1.9852469531751122, "grad_norm": 0.9704031348228455, "learning_rate": 1.6948789372187703e-05, "loss": 0.2243, "step": 3095 }, { "epoch": 1.9858883899935855, "grad_norm": 1.1666396856307983, "learning_rate": 1.6938075851724878e-05, "loss": 0.2443, "step": 3096 }, { "epoch": 1.986529826812059, "grad_norm": 1.2713260650634766, "learning_rate": 1.6927362331262052e-05, "loss": 0.2427, "step": 3097 }, { "epoch": 1.9871712636305325, "grad_norm": 1.2753945589065552, "learning_rate": 1.691664881079923e-05, "loss": 0.2524, "step": 3098 }, { "epoch": 1.9878127004490058, "grad_norm": 1.0323067903518677, "learning_rate": 1.6905935290336405e-05, "loss": 0.2322, "step": 3099 }, { "epoch": 1.988454137267479, "grad_norm": 1.3404896259307861, "learning_rate": 1.689522176987358e-05, "loss": 0.2469, "step": 3100 }, { "epoch": 1.9890955740859524, "grad_norm": 1.1781694889068604, "learning_rate": 1.6884508249410758e-05, "loss": 0.2571, "step": 3101 }, { "epoch": 1.989737010904426, "grad_norm": 1.273999571800232, "learning_rate": 1.6873794728947933e-05, "loss": 0.284, "step": 3102 }, { "epoch": 1.9903784477228994, "grad_norm": 1.4203190803527832, "learning_rate": 1.686308120848511e-05, "loss": 0.2538, "step": 3103 }, { "epoch": 1.9910198845413727, "grad_norm": 1.41256844997406, "learning_rate": 1.6852367688022286e-05, "loss": 0.2919, "step": 3104 }, { "epoch": 1.991661321359846, "grad_norm": 1.4024899005889893, "learning_rate": 1.684165416755946e-05, "loss": 0.2658, "step": 3105 }, { "epoch": 1.9923027581783195, "grad_norm": 1.6255589723587036, "learning_rate": 1.683094064709664e-05, "loss": 0.3173, "step": 3106 }, { "epoch": 1.9929441949967928, "grad_norm": 1.3591653108596802, "learning_rate": 1.6820227126633813e-05, "loss": 0.2752, "step": 3107 }, { "epoch": 1.9935856318152663, "grad_norm": 1.3019657135009766, "learning_rate": 1.6809513606170988e-05, "loss": 0.2292, "step": 3108 }, { "epoch": 1.9942270686337396, "grad_norm": 1.1553230285644531, "learning_rate": 1.6798800085708163e-05, "loss": 0.2403, "step": 3109 }, { "epoch": 1.9948685054522128, "grad_norm": 1.0582219362258911, "learning_rate": 1.678808656524534e-05, "loss": 0.2225, "step": 3110 }, { "epoch": 1.9955099422706863, "grad_norm": 1.1778861284255981, "learning_rate": 1.6777373044782516e-05, "loss": 0.2383, "step": 3111 }, { "epoch": 1.9961513790891598, "grad_norm": 1.4483152627944946, "learning_rate": 1.676665952431969e-05, "loss": 0.2764, "step": 3112 }, { "epoch": 1.9967928159076331, "grad_norm": 1.2137330770492554, "learning_rate": 1.675594600385687e-05, "loss": 0.2621, "step": 3113 }, { "epoch": 1.9974342527261064, "grad_norm": 1.3402173519134521, "learning_rate": 1.6745232483394044e-05, "loss": 0.2739, "step": 3114 }, { "epoch": 1.9980756895445797, "grad_norm": 1.3634238243103027, "learning_rate": 1.6734518962931218e-05, "loss": 0.3009, "step": 3115 }, { "epoch": 1.9987171263630532, "grad_norm": 1.183568000793457, "learning_rate": 1.6723805442468396e-05, "loss": 0.2791, "step": 3116 }, { "epoch": 1.9993585631815267, "grad_norm": 1.2508084774017334, "learning_rate": 1.671309192200557e-05, "loss": 0.2929, "step": 3117 }, { "epoch": 2.0, "grad_norm": 1.2722445726394653, "learning_rate": 1.670237840154275e-05, "loss": 0.2631, "step": 3118 }, { "epoch": 2.0006414368184733, "grad_norm": 0.6456621885299683, "learning_rate": 1.6691664881079924e-05, "loss": 0.155, "step": 3119 }, { "epoch": 2.0012828736369466, "grad_norm": 0.6049922108650208, "learning_rate": 1.66809513606171e-05, "loss": 0.1481, "step": 3120 }, { "epoch": 2.0019243104554203, "grad_norm": 0.7179097533226013, "learning_rate": 1.6670237840154277e-05, "loss": 0.1865, "step": 3121 }, { "epoch": 2.0025657472738936, "grad_norm": 0.6891741156578064, "learning_rate": 1.6659524319691452e-05, "loss": 0.1627, "step": 3122 }, { "epoch": 2.003207184092367, "grad_norm": 0.7448855042457581, "learning_rate": 1.664881079922863e-05, "loss": 0.1519, "step": 3123 }, { "epoch": 2.00384862091084, "grad_norm": 0.5858240723609924, "learning_rate": 1.6638097278765805e-05, "loss": 0.1418, "step": 3124 }, { "epoch": 2.004490057729314, "grad_norm": 0.5681788325309753, "learning_rate": 1.662738375830298e-05, "loss": 0.1398, "step": 3125 }, { "epoch": 2.005131494547787, "grad_norm": 0.8809100389480591, "learning_rate": 1.6616670237840154e-05, "loss": 0.16, "step": 3126 }, { "epoch": 2.0057729313662604, "grad_norm": 0.674100399017334, "learning_rate": 1.660595671737733e-05, "loss": 0.1361, "step": 3127 }, { "epoch": 2.0064143681847337, "grad_norm": 0.7249632477760315, "learning_rate": 1.6595243196914507e-05, "loss": 0.1277, "step": 3128 }, { "epoch": 2.007055805003207, "grad_norm": 0.5952154397964478, "learning_rate": 1.6584529676451682e-05, "loss": 0.1369, "step": 3129 }, { "epoch": 2.0076972418216807, "grad_norm": 0.8164462447166443, "learning_rate": 1.6573816155988857e-05, "loss": 0.1234, "step": 3130 }, { "epoch": 2.008338678640154, "grad_norm": 1.105686068534851, "learning_rate": 1.6563102635526035e-05, "loss": 0.1787, "step": 3131 }, { "epoch": 2.0089801154586273, "grad_norm": 0.8410708904266357, "learning_rate": 1.655238911506321e-05, "loss": 0.1305, "step": 3132 }, { "epoch": 2.0096215522771006, "grad_norm": 0.8218911290168762, "learning_rate": 1.6541675594600388e-05, "loss": 0.1261, "step": 3133 }, { "epoch": 2.010262989095574, "grad_norm": 0.8774808645248413, "learning_rate": 1.6530962074137562e-05, "loss": 0.1312, "step": 3134 }, { "epoch": 2.0109044259140476, "grad_norm": 0.8137223720550537, "learning_rate": 1.6520248553674737e-05, "loss": 0.1225, "step": 3135 }, { "epoch": 2.011545862732521, "grad_norm": 1.2049943208694458, "learning_rate": 1.6509535033211915e-05, "loss": 0.1288, "step": 3136 }, { "epoch": 2.012187299550994, "grad_norm": 1.23878812789917, "learning_rate": 1.649882151274909e-05, "loss": 0.1284, "step": 3137 }, { "epoch": 2.0128287363694675, "grad_norm": 1.3216296434402466, "learning_rate": 1.6488107992286268e-05, "loss": 0.1467, "step": 3138 }, { "epoch": 2.013470173187941, "grad_norm": 1.4532495737075806, "learning_rate": 1.6477394471823443e-05, "loss": 0.1334, "step": 3139 }, { "epoch": 2.0141116100064145, "grad_norm": 1.4454377889633179, "learning_rate": 1.6466680951360618e-05, "loss": 0.1341, "step": 3140 }, { "epoch": 2.0147530468248878, "grad_norm": 1.4672883749008179, "learning_rate": 1.6455967430897796e-05, "loss": 0.124, "step": 3141 }, { "epoch": 2.015394483643361, "grad_norm": 1.8039729595184326, "learning_rate": 1.6445253910434967e-05, "loss": 0.139, "step": 3142 }, { "epoch": 2.0160359204618343, "grad_norm": 1.3279694318771362, "learning_rate": 1.6434540389972145e-05, "loss": 0.1338, "step": 3143 }, { "epoch": 2.016677357280308, "grad_norm": 1.0719125270843506, "learning_rate": 1.642382686950932e-05, "loss": 0.1086, "step": 3144 }, { "epoch": 2.0173187940987813, "grad_norm": 1.7961822748184204, "learning_rate": 1.6413113349046495e-05, "loss": 0.1328, "step": 3145 }, { "epoch": 2.0179602309172546, "grad_norm": 1.0590550899505615, "learning_rate": 1.6402399828583673e-05, "loss": 0.109, "step": 3146 }, { "epoch": 2.018601667735728, "grad_norm": 1.6914108991622925, "learning_rate": 1.6391686308120848e-05, "loss": 0.1414, "step": 3147 }, { "epoch": 2.019243104554201, "grad_norm": 1.891947627067566, "learning_rate": 1.6380972787658026e-05, "loss": 0.1346, "step": 3148 }, { "epoch": 2.019884541372675, "grad_norm": 1.28817880153656, "learning_rate": 1.63702592671952e-05, "loss": 0.1266, "step": 3149 }, { "epoch": 2.020525978191148, "grad_norm": 1.4756649732589722, "learning_rate": 1.6359545746732376e-05, "loss": 0.1312, "step": 3150 }, { "epoch": 2.0211674150096215, "grad_norm": 1.9202128648757935, "learning_rate": 1.6348832226269554e-05, "loss": 0.1419, "step": 3151 }, { "epoch": 2.0218088518280948, "grad_norm": 1.2349282503128052, "learning_rate": 1.633811870580673e-05, "loss": 0.1291, "step": 3152 }, { "epoch": 2.0224502886465685, "grad_norm": 1.3500845432281494, "learning_rate": 1.6327405185343907e-05, "loss": 0.1347, "step": 3153 }, { "epoch": 2.023091725465042, "grad_norm": 1.5267596244812012, "learning_rate": 1.631669166488108e-05, "loss": 0.1354, "step": 3154 }, { "epoch": 2.023733162283515, "grad_norm": 1.3697185516357422, "learning_rate": 1.6305978144418256e-05, "loss": 0.1267, "step": 3155 }, { "epoch": 2.0243745991019884, "grad_norm": 1.329046368598938, "learning_rate": 1.6295264623955434e-05, "loss": 0.1263, "step": 3156 }, { "epoch": 2.0250160359204616, "grad_norm": 1.5444915294647217, "learning_rate": 1.628455110349261e-05, "loss": 0.135, "step": 3157 }, { "epoch": 2.0256574727389354, "grad_norm": 1.1910661458969116, "learning_rate": 1.6273837583029787e-05, "loss": 0.1332, "step": 3158 }, { "epoch": 2.0262989095574087, "grad_norm": 1.9158306121826172, "learning_rate": 1.626312406256696e-05, "loss": 0.1365, "step": 3159 }, { "epoch": 2.026940346375882, "grad_norm": 1.5432721376419067, "learning_rate": 1.6252410542104137e-05, "loss": 0.1361, "step": 3160 }, { "epoch": 2.0275817831943552, "grad_norm": 1.0635019540786743, "learning_rate": 1.624169702164131e-05, "loss": 0.1243, "step": 3161 }, { "epoch": 2.028223220012829, "grad_norm": 1.248262643814087, "learning_rate": 1.6230983501178486e-05, "loss": 0.1219, "step": 3162 }, { "epoch": 2.0288646568313022, "grad_norm": 0.930608868598938, "learning_rate": 1.6220269980715664e-05, "loss": 0.121, "step": 3163 }, { "epoch": 2.0295060936497755, "grad_norm": 1.614296793937683, "learning_rate": 1.620955646025284e-05, "loss": 0.1505, "step": 3164 }, { "epoch": 2.030147530468249, "grad_norm": 1.1637368202209473, "learning_rate": 1.6198842939790014e-05, "loss": 0.1247, "step": 3165 }, { "epoch": 2.030788967286722, "grad_norm": 0.878598690032959, "learning_rate": 1.6188129419327192e-05, "loss": 0.1162, "step": 3166 }, { "epoch": 2.031430404105196, "grad_norm": 1.0009818077087402, "learning_rate": 1.6177415898864367e-05, "loss": 0.1273, "step": 3167 }, { "epoch": 2.032071840923669, "grad_norm": 1.4830230474472046, "learning_rate": 1.6166702378401545e-05, "loss": 0.1402, "step": 3168 }, { "epoch": 2.0327132777421424, "grad_norm": 1.7317066192626953, "learning_rate": 1.615598885793872e-05, "loss": 0.1569, "step": 3169 }, { "epoch": 2.0333547145606157, "grad_norm": 1.0779668092727661, "learning_rate": 1.6145275337475894e-05, "loss": 0.1272, "step": 3170 }, { "epoch": 2.033996151379089, "grad_norm": 1.1129378080368042, "learning_rate": 1.6134561817013073e-05, "loss": 0.1332, "step": 3171 }, { "epoch": 2.0346375881975627, "grad_norm": 1.0247077941894531, "learning_rate": 1.6123848296550247e-05, "loss": 0.1284, "step": 3172 }, { "epoch": 2.035279025016036, "grad_norm": 1.31787109375, "learning_rate": 1.6113134776087426e-05, "loss": 0.1311, "step": 3173 }, { "epoch": 2.0359204618345093, "grad_norm": 1.1357672214508057, "learning_rate": 1.61024212556246e-05, "loss": 0.1251, "step": 3174 }, { "epoch": 2.0365618986529825, "grad_norm": 1.4911340475082397, "learning_rate": 1.6091707735161775e-05, "loss": 0.1443, "step": 3175 }, { "epoch": 2.0372033354714563, "grad_norm": 0.9944356679916382, "learning_rate": 1.608099421469895e-05, "loss": 0.1204, "step": 3176 }, { "epoch": 2.0378447722899296, "grad_norm": 1.3403947353363037, "learning_rate": 1.6070280694236125e-05, "loss": 0.1467, "step": 3177 }, { "epoch": 2.038486209108403, "grad_norm": 1.390872836112976, "learning_rate": 1.6059567173773303e-05, "loss": 0.1317, "step": 3178 }, { "epoch": 2.039127645926876, "grad_norm": 1.0733236074447632, "learning_rate": 1.6048853653310477e-05, "loss": 0.1228, "step": 3179 }, { "epoch": 2.0397690827453494, "grad_norm": 1.2694741487503052, "learning_rate": 1.6038140132847652e-05, "loss": 0.1328, "step": 3180 }, { "epoch": 2.040410519563823, "grad_norm": 1.1624362468719482, "learning_rate": 1.602742661238483e-05, "loss": 0.1279, "step": 3181 }, { "epoch": 2.0410519563822964, "grad_norm": 0.8858145475387573, "learning_rate": 1.6016713091922005e-05, "loss": 0.1175, "step": 3182 }, { "epoch": 2.0416933932007697, "grad_norm": 1.230027437210083, "learning_rate": 1.6005999571459183e-05, "loss": 0.129, "step": 3183 }, { "epoch": 2.042334830019243, "grad_norm": 1.3248845338821411, "learning_rate": 1.5995286050996358e-05, "loss": 0.1319, "step": 3184 }, { "epoch": 2.0429762668377163, "grad_norm": 0.9621805548667908, "learning_rate": 1.5984572530533533e-05, "loss": 0.1217, "step": 3185 }, { "epoch": 2.04361770365619, "grad_norm": 1.261370062828064, "learning_rate": 1.597385901007071e-05, "loss": 0.1282, "step": 3186 }, { "epoch": 2.0442591404746633, "grad_norm": 1.3059744834899902, "learning_rate": 1.5963145489607886e-05, "loss": 0.1302, "step": 3187 }, { "epoch": 2.0449005772931366, "grad_norm": 0.9452477693557739, "learning_rate": 1.5952431969145064e-05, "loss": 0.1187, "step": 3188 }, { "epoch": 2.04554201411161, "grad_norm": 1.3317539691925049, "learning_rate": 1.594171844868224e-05, "loss": 0.1381, "step": 3189 }, { "epoch": 2.0461834509300836, "grad_norm": 1.379641056060791, "learning_rate": 1.5931004928219413e-05, "loss": 0.1224, "step": 3190 }, { "epoch": 2.046824887748557, "grad_norm": 1.4626860618591309, "learning_rate": 1.592029140775659e-05, "loss": 0.1459, "step": 3191 }, { "epoch": 2.04746632456703, "grad_norm": 1.5498061180114746, "learning_rate": 1.5909577887293766e-05, "loss": 0.136, "step": 3192 }, { "epoch": 2.0481077613855034, "grad_norm": 1.3820655345916748, "learning_rate": 1.589886436683094e-05, "loss": 0.1309, "step": 3193 }, { "epoch": 2.0487491982039767, "grad_norm": 0.8855481743812561, "learning_rate": 1.5888150846368116e-05, "loss": 0.1211, "step": 3194 }, { "epoch": 2.0493906350224504, "grad_norm": 1.1798310279846191, "learning_rate": 1.587743732590529e-05, "loss": 0.1329, "step": 3195 }, { "epoch": 2.0500320718409237, "grad_norm": 1.3474606275558472, "learning_rate": 1.586672380544247e-05, "loss": 0.1375, "step": 3196 }, { "epoch": 2.050673508659397, "grad_norm": 1.3795744180679321, "learning_rate": 1.5856010284979644e-05, "loss": 0.1421, "step": 3197 }, { "epoch": 2.0513149454778703, "grad_norm": 1.1530526876449585, "learning_rate": 1.584529676451682e-05, "loss": 0.1266, "step": 3198 }, { "epoch": 2.051956382296344, "grad_norm": 1.15345299243927, "learning_rate": 1.5834583244053996e-05, "loss": 0.1331, "step": 3199 }, { "epoch": 2.0525978191148173, "grad_norm": 1.0545144081115723, "learning_rate": 1.582386972359117e-05, "loss": 0.1349, "step": 3200 }, { "epoch": 2.0532392559332906, "grad_norm": 1.5552637577056885, "learning_rate": 1.581315620312835e-05, "loss": 0.1539, "step": 3201 }, { "epoch": 2.053880692751764, "grad_norm": 1.5041022300720215, "learning_rate": 1.5802442682665524e-05, "loss": 0.1372, "step": 3202 }, { "epoch": 2.054522129570237, "grad_norm": 0.897092342376709, "learning_rate": 1.5791729162202702e-05, "loss": 0.124, "step": 3203 }, { "epoch": 2.055163566388711, "grad_norm": 1.286995768547058, "learning_rate": 1.5781015641739877e-05, "loss": 0.1285, "step": 3204 }, { "epoch": 2.055805003207184, "grad_norm": 1.3610239028930664, "learning_rate": 1.5770302121277052e-05, "loss": 0.1301, "step": 3205 }, { "epoch": 2.0564464400256575, "grad_norm": 1.0863312482833862, "learning_rate": 1.575958860081423e-05, "loss": 0.1375, "step": 3206 }, { "epoch": 2.0570878768441307, "grad_norm": 1.5507241487503052, "learning_rate": 1.5748875080351405e-05, "loss": 0.1463, "step": 3207 }, { "epoch": 2.057729313662604, "grad_norm": 1.0659210681915283, "learning_rate": 1.5738161559888583e-05, "loss": 0.1423, "step": 3208 }, { "epoch": 2.0583707504810778, "grad_norm": 1.5683585405349731, "learning_rate": 1.5727448039425758e-05, "loss": 0.1297, "step": 3209 }, { "epoch": 2.059012187299551, "grad_norm": 0.9489557147026062, "learning_rate": 1.571673451896293e-05, "loss": 0.1284, "step": 3210 }, { "epoch": 2.0596536241180243, "grad_norm": 0.9703658819198608, "learning_rate": 1.5706020998500107e-05, "loss": 0.126, "step": 3211 }, { "epoch": 2.0602950609364976, "grad_norm": 0.9658302068710327, "learning_rate": 1.5695307478037282e-05, "loss": 0.1222, "step": 3212 }, { "epoch": 2.0609364977549713, "grad_norm": 0.9918881058692932, "learning_rate": 1.568459395757446e-05, "loss": 0.1192, "step": 3213 }, { "epoch": 2.0615779345734446, "grad_norm": 1.4447509050369263, "learning_rate": 1.5673880437111635e-05, "loss": 0.1473, "step": 3214 }, { "epoch": 2.062219371391918, "grad_norm": 1.1939494609832764, "learning_rate": 1.566316691664881e-05, "loss": 0.1382, "step": 3215 }, { "epoch": 2.062860808210391, "grad_norm": 1.7442710399627686, "learning_rate": 1.5652453396185988e-05, "loss": 0.1624, "step": 3216 }, { "epoch": 2.0635022450288645, "grad_norm": 1.0020999908447266, "learning_rate": 1.5641739875723162e-05, "loss": 0.1233, "step": 3217 }, { "epoch": 2.064143681847338, "grad_norm": 1.3592199087142944, "learning_rate": 1.563102635526034e-05, "loss": 0.1363, "step": 3218 }, { "epoch": 2.0647851186658115, "grad_norm": 2.0519940853118896, "learning_rate": 1.5620312834797515e-05, "loss": 0.1567, "step": 3219 }, { "epoch": 2.0654265554842848, "grad_norm": 1.272823691368103, "learning_rate": 1.560959931433469e-05, "loss": 0.1334, "step": 3220 }, { "epoch": 2.066067992302758, "grad_norm": 1.0165845155715942, "learning_rate": 1.5598885793871868e-05, "loss": 0.1232, "step": 3221 }, { "epoch": 2.0667094291212313, "grad_norm": 1.1347179412841797, "learning_rate": 1.5588172273409043e-05, "loss": 0.122, "step": 3222 }, { "epoch": 2.067350865939705, "grad_norm": 1.4513468742370605, "learning_rate": 1.557745875294622e-05, "loss": 0.1394, "step": 3223 }, { "epoch": 2.0679923027581784, "grad_norm": 0.9265425801277161, "learning_rate": 1.5566745232483396e-05, "loss": 0.1202, "step": 3224 }, { "epoch": 2.0686337395766516, "grad_norm": 1.2264536619186401, "learning_rate": 1.555603171202057e-05, "loss": 0.1266, "step": 3225 }, { "epoch": 2.069275176395125, "grad_norm": 1.1434073448181152, "learning_rate": 1.554531819155775e-05, "loss": 0.1273, "step": 3226 }, { "epoch": 2.0699166132135987, "grad_norm": 0.9323124289512634, "learning_rate": 1.5534604671094924e-05, "loss": 0.1267, "step": 3227 }, { "epoch": 2.070558050032072, "grad_norm": 1.1000487804412842, "learning_rate": 1.55238911506321e-05, "loss": 0.1283, "step": 3228 }, { "epoch": 2.0711994868505452, "grad_norm": 1.019587516784668, "learning_rate": 1.5513177630169273e-05, "loss": 0.1244, "step": 3229 }, { "epoch": 2.0718409236690185, "grad_norm": 1.2667075395584106, "learning_rate": 1.5502464109706448e-05, "loss": 0.1321, "step": 3230 }, { "epoch": 2.072482360487492, "grad_norm": 1.288194179534912, "learning_rate": 1.5491750589243626e-05, "loss": 0.1351, "step": 3231 }, { "epoch": 2.0731237973059655, "grad_norm": 1.1283828020095825, "learning_rate": 1.54810370687808e-05, "loss": 0.1372, "step": 3232 }, { "epoch": 2.073765234124439, "grad_norm": 0.9183892011642456, "learning_rate": 1.547032354831798e-05, "loss": 0.1077, "step": 3233 }, { "epoch": 2.074406670942912, "grad_norm": 1.1190731525421143, "learning_rate": 1.5459610027855154e-05, "loss": 0.1229, "step": 3234 }, { "epoch": 2.0750481077613854, "grad_norm": 1.3618401288986206, "learning_rate": 1.544889650739233e-05, "loss": 0.1267, "step": 3235 }, { "epoch": 2.075689544579859, "grad_norm": 0.9254131317138672, "learning_rate": 1.5438182986929507e-05, "loss": 0.1186, "step": 3236 }, { "epoch": 2.0763309813983324, "grad_norm": 1.8176519870758057, "learning_rate": 1.542746946646668e-05, "loss": 0.146, "step": 3237 }, { "epoch": 2.0769724182168057, "grad_norm": 1.1859185695648193, "learning_rate": 1.541675594600386e-05, "loss": 0.1211, "step": 3238 }, { "epoch": 2.077613855035279, "grad_norm": 0.9150325655937195, "learning_rate": 1.5406042425541034e-05, "loss": 0.1113, "step": 3239 }, { "epoch": 2.0782552918537522, "grad_norm": 1.6838245391845703, "learning_rate": 1.539532890507821e-05, "loss": 0.1531, "step": 3240 }, { "epoch": 2.078896728672226, "grad_norm": 1.0426373481750488, "learning_rate": 1.5384615384615387e-05, "loss": 0.1176, "step": 3241 }, { "epoch": 2.0795381654906993, "grad_norm": 1.7371714115142822, "learning_rate": 1.5373901864152562e-05, "loss": 0.1484, "step": 3242 }, { "epoch": 2.0801796023091725, "grad_norm": 1.5045500993728638, "learning_rate": 1.5363188343689737e-05, "loss": 0.1472, "step": 3243 }, { "epoch": 2.080821039127646, "grad_norm": 1.1483412981033325, "learning_rate": 1.5352474823226915e-05, "loss": 0.1285, "step": 3244 }, { "epoch": 2.081462475946119, "grad_norm": 1.2605477571487427, "learning_rate": 1.5341761302764086e-05, "loss": 0.1292, "step": 3245 }, { "epoch": 2.082103912764593, "grad_norm": 1.3209991455078125, "learning_rate": 1.5331047782301264e-05, "loss": 0.1329, "step": 3246 }, { "epoch": 2.082745349583066, "grad_norm": 1.059425950050354, "learning_rate": 1.532033426183844e-05, "loss": 0.1321, "step": 3247 }, { "epoch": 2.0833867864015394, "grad_norm": 1.3121154308319092, "learning_rate": 1.5309620741375617e-05, "loss": 0.135, "step": 3248 }, { "epoch": 2.0840282232200127, "grad_norm": 0.8713024258613586, "learning_rate": 1.5298907220912792e-05, "loss": 0.1211, "step": 3249 }, { "epoch": 2.0846696600384864, "grad_norm": 1.2794502973556519, "learning_rate": 1.5288193700449967e-05, "loss": 0.1475, "step": 3250 }, { "epoch": 2.0853110968569597, "grad_norm": 1.1673390865325928, "learning_rate": 1.5277480179987145e-05, "loss": 0.1324, "step": 3251 }, { "epoch": 2.085952533675433, "grad_norm": 1.1663671731948853, "learning_rate": 1.526676665952432e-05, "loss": 0.1232, "step": 3252 }, { "epoch": 2.0865939704939063, "grad_norm": 1.3705438375473022, "learning_rate": 1.5256053139061496e-05, "loss": 0.1299, "step": 3253 }, { "epoch": 2.0872354073123796, "grad_norm": 0.9729217886924744, "learning_rate": 1.5245339618598673e-05, "loss": 0.1187, "step": 3254 }, { "epoch": 2.0878768441308533, "grad_norm": 1.2962870597839355, "learning_rate": 1.5234626098135849e-05, "loss": 0.1418, "step": 3255 }, { "epoch": 2.0885182809493266, "grad_norm": 1.4822678565979004, "learning_rate": 1.5223912577673024e-05, "loss": 0.1347, "step": 3256 }, { "epoch": 2.0891597177678, "grad_norm": 1.3787848949432373, "learning_rate": 1.52131990572102e-05, "loss": 0.1305, "step": 3257 }, { "epoch": 2.089801154586273, "grad_norm": 1.751129388809204, "learning_rate": 1.5202485536747377e-05, "loss": 0.1478, "step": 3258 }, { "epoch": 2.0904425914047464, "grad_norm": 0.988003134727478, "learning_rate": 1.5191772016284553e-05, "loss": 0.1247, "step": 3259 }, { "epoch": 2.09108402822322, "grad_norm": 1.420179009437561, "learning_rate": 1.518105849582173e-05, "loss": 0.1402, "step": 3260 }, { "epoch": 2.0917254650416934, "grad_norm": 0.9309625029563904, "learning_rate": 1.5170344975358904e-05, "loss": 0.1227, "step": 3261 }, { "epoch": 2.0923669018601667, "grad_norm": 1.342708706855774, "learning_rate": 1.5159631454896077e-05, "loss": 0.1354, "step": 3262 }, { "epoch": 2.09300833867864, "grad_norm": 1.122536063194275, "learning_rate": 1.5148917934433254e-05, "loss": 0.1261, "step": 3263 }, { "epoch": 2.0936497754971137, "grad_norm": 1.2829923629760742, "learning_rate": 1.513820441397043e-05, "loss": 0.1428, "step": 3264 }, { "epoch": 2.094291212315587, "grad_norm": 1.0834243297576904, "learning_rate": 1.5127490893507607e-05, "loss": 0.1264, "step": 3265 }, { "epoch": 2.0949326491340603, "grad_norm": 1.137591004371643, "learning_rate": 1.5116777373044783e-05, "loss": 0.125, "step": 3266 }, { "epoch": 2.0955740859525336, "grad_norm": 1.028734803199768, "learning_rate": 1.5106063852581958e-05, "loss": 0.1336, "step": 3267 }, { "epoch": 2.096215522771007, "grad_norm": 1.3431247472763062, "learning_rate": 1.5095350332119135e-05, "loss": 0.1395, "step": 3268 }, { "epoch": 2.0968569595894806, "grad_norm": 1.4787849187850952, "learning_rate": 1.5084636811656311e-05, "loss": 0.14, "step": 3269 }, { "epoch": 2.097498396407954, "grad_norm": 1.3876073360443115, "learning_rate": 1.5073923291193487e-05, "loss": 0.1379, "step": 3270 }, { "epoch": 2.098139833226427, "grad_norm": 1.5557208061218262, "learning_rate": 1.5063209770730662e-05, "loss": 0.1227, "step": 3271 }, { "epoch": 2.0987812700449004, "grad_norm": 1.4989593029022217, "learning_rate": 1.5052496250267839e-05, "loss": 0.1346, "step": 3272 }, { "epoch": 2.0994227068633737, "grad_norm": 1.61176335811615, "learning_rate": 1.5041782729805015e-05, "loss": 0.1421, "step": 3273 }, { "epoch": 2.1000641436818475, "grad_norm": 0.8853927254676819, "learning_rate": 1.5031069209342192e-05, "loss": 0.121, "step": 3274 }, { "epoch": 2.1007055805003207, "grad_norm": 1.0476760864257812, "learning_rate": 1.5020355688879368e-05, "loss": 0.1223, "step": 3275 }, { "epoch": 2.101347017318794, "grad_norm": 1.2486381530761719, "learning_rate": 1.5009642168416543e-05, "loss": 0.1412, "step": 3276 }, { "epoch": 2.1019884541372673, "grad_norm": 1.4771982431411743, "learning_rate": 1.499892864795372e-05, "loss": 0.1378, "step": 3277 }, { "epoch": 2.102629890955741, "grad_norm": 1.2457853555679321, "learning_rate": 1.4988215127490896e-05, "loss": 0.125, "step": 3278 }, { "epoch": 2.1032713277742143, "grad_norm": 1.9215975999832153, "learning_rate": 1.4977501607028069e-05, "loss": 0.1585, "step": 3279 }, { "epoch": 2.1039127645926876, "grad_norm": 1.1980711221694946, "learning_rate": 1.4966788086565245e-05, "loss": 0.1213, "step": 3280 }, { "epoch": 2.104554201411161, "grad_norm": 0.837425172328949, "learning_rate": 1.4956074566102422e-05, "loss": 0.1207, "step": 3281 }, { "epoch": 2.105195638229634, "grad_norm": 0.9695487022399902, "learning_rate": 1.4945361045639596e-05, "loss": 0.1347, "step": 3282 }, { "epoch": 2.105837075048108, "grad_norm": 1.1741715669631958, "learning_rate": 1.4934647525176773e-05, "loss": 0.1332, "step": 3283 }, { "epoch": 2.106478511866581, "grad_norm": 1.229599952697754, "learning_rate": 1.492393400471395e-05, "loss": 0.1315, "step": 3284 }, { "epoch": 2.1071199486850545, "grad_norm": 1.6780987977981567, "learning_rate": 1.4913220484251126e-05, "loss": 0.15, "step": 3285 }, { "epoch": 2.1077613855035278, "grad_norm": 1.3504167795181274, "learning_rate": 1.49025069637883e-05, "loss": 0.1295, "step": 3286 }, { "epoch": 2.1084028223220015, "grad_norm": 1.0613698959350586, "learning_rate": 1.4891793443325477e-05, "loss": 0.1347, "step": 3287 }, { "epoch": 2.1090442591404748, "grad_norm": 1.552774429321289, "learning_rate": 1.4881079922862653e-05, "loss": 0.1547, "step": 3288 }, { "epoch": 2.109685695958948, "grad_norm": 1.0270743370056152, "learning_rate": 1.487036640239983e-05, "loss": 0.1244, "step": 3289 }, { "epoch": 2.1103271327774213, "grad_norm": 1.1328026056289673, "learning_rate": 1.4859652881937006e-05, "loss": 0.1193, "step": 3290 }, { "epoch": 2.1109685695958946, "grad_norm": 2.1538944244384766, "learning_rate": 1.4848939361474181e-05, "loss": 0.1592, "step": 3291 }, { "epoch": 2.1116100064143684, "grad_norm": 1.2672213315963745, "learning_rate": 1.4838225841011358e-05, "loss": 0.1328, "step": 3292 }, { "epoch": 2.1122514432328416, "grad_norm": 1.1536047458648682, "learning_rate": 1.4827512320548534e-05, "loss": 0.1336, "step": 3293 }, { "epoch": 2.112892880051315, "grad_norm": 1.213147759437561, "learning_rate": 1.481679880008571e-05, "loss": 0.1351, "step": 3294 }, { "epoch": 2.113534316869788, "grad_norm": 1.947575330734253, "learning_rate": 1.4806085279622885e-05, "loss": 0.1701, "step": 3295 }, { "epoch": 2.1141757536882615, "grad_norm": 1.2698744535446167, "learning_rate": 1.479537175916006e-05, "loss": 0.1278, "step": 3296 }, { "epoch": 2.1148171905067352, "grad_norm": 1.154346227645874, "learning_rate": 1.4784658238697235e-05, "loss": 0.1373, "step": 3297 }, { "epoch": 2.1154586273252085, "grad_norm": 1.1178531646728516, "learning_rate": 1.4773944718234411e-05, "loss": 0.125, "step": 3298 }, { "epoch": 2.116100064143682, "grad_norm": 1.1387606859207153, "learning_rate": 1.4763231197771588e-05, "loss": 0.1233, "step": 3299 }, { "epoch": 2.116741500962155, "grad_norm": 1.113714575767517, "learning_rate": 1.4752517677308764e-05, "loss": 0.1255, "step": 3300 }, { "epoch": 2.117382937780629, "grad_norm": 1.1517341136932373, "learning_rate": 1.474180415684594e-05, "loss": 0.1428, "step": 3301 }, { "epoch": 2.118024374599102, "grad_norm": 1.1472139358520508, "learning_rate": 1.4731090636383115e-05, "loss": 0.1274, "step": 3302 }, { "epoch": 2.1186658114175754, "grad_norm": 1.315975546836853, "learning_rate": 1.4720377115920292e-05, "loss": 0.1301, "step": 3303 }, { "epoch": 2.1193072482360487, "grad_norm": 1.0412352085113525, "learning_rate": 1.4709663595457468e-05, "loss": 0.124, "step": 3304 }, { "epoch": 2.119948685054522, "grad_norm": 1.9361974000930786, "learning_rate": 1.4698950074994645e-05, "loss": 0.1455, "step": 3305 }, { "epoch": 2.1205901218729957, "grad_norm": 1.4003548622131348, "learning_rate": 1.468823655453182e-05, "loss": 0.1621, "step": 3306 }, { "epoch": 2.121231558691469, "grad_norm": 1.4618860483169556, "learning_rate": 1.4677523034068996e-05, "loss": 0.1567, "step": 3307 }, { "epoch": 2.1218729955099422, "grad_norm": 1.1354279518127441, "learning_rate": 1.4666809513606172e-05, "loss": 0.1238, "step": 3308 }, { "epoch": 2.1225144323284155, "grad_norm": 0.9687253832817078, "learning_rate": 1.4656095993143349e-05, "loss": 0.1219, "step": 3309 }, { "epoch": 2.1231558691468893, "grad_norm": 1.037191390991211, "learning_rate": 1.4645382472680525e-05, "loss": 0.1193, "step": 3310 }, { "epoch": 2.1237973059653625, "grad_norm": 1.009153962135315, "learning_rate": 1.46346689522177e-05, "loss": 0.1308, "step": 3311 }, { "epoch": 2.124438742783836, "grad_norm": 1.255570411682129, "learning_rate": 1.4623955431754876e-05, "loss": 0.1303, "step": 3312 }, { "epoch": 2.125080179602309, "grad_norm": 1.312886118888855, "learning_rate": 1.461324191129205e-05, "loss": 0.1378, "step": 3313 }, { "epoch": 2.1257216164207824, "grad_norm": 0.9024628400802612, "learning_rate": 1.4602528390829226e-05, "loss": 0.1217, "step": 3314 }, { "epoch": 2.126363053239256, "grad_norm": 1.0052528381347656, "learning_rate": 1.4591814870366402e-05, "loss": 0.1241, "step": 3315 }, { "epoch": 2.1270044900577294, "grad_norm": 1.102254867553711, "learning_rate": 1.4581101349903579e-05, "loss": 0.1219, "step": 3316 }, { "epoch": 2.1276459268762027, "grad_norm": 1.373814582824707, "learning_rate": 1.4570387829440754e-05, "loss": 0.1466, "step": 3317 }, { "epoch": 2.128287363694676, "grad_norm": 1.2094709873199463, "learning_rate": 1.455967430897793e-05, "loss": 0.1337, "step": 3318 }, { "epoch": 2.1289288005131493, "grad_norm": 1.7286858558654785, "learning_rate": 1.4548960788515107e-05, "loss": 0.1571, "step": 3319 }, { "epoch": 2.129570237331623, "grad_norm": 0.9794854521751404, "learning_rate": 1.4538247268052283e-05, "loss": 0.1156, "step": 3320 }, { "epoch": 2.1302116741500963, "grad_norm": 1.432329773902893, "learning_rate": 1.4527533747589458e-05, "loss": 0.1254, "step": 3321 }, { "epoch": 2.1308531109685696, "grad_norm": 1.5639392137527466, "learning_rate": 1.4516820227126634e-05, "loss": 0.1432, "step": 3322 }, { "epoch": 2.131494547787043, "grad_norm": 0.9943935871124268, "learning_rate": 1.450610670666381e-05, "loss": 0.1206, "step": 3323 }, { "epoch": 2.132135984605516, "grad_norm": 1.3898930549621582, "learning_rate": 1.4495393186200987e-05, "loss": 0.1379, "step": 3324 }, { "epoch": 2.13277742142399, "grad_norm": 1.1108952760696411, "learning_rate": 1.4484679665738164e-05, "loss": 0.1258, "step": 3325 }, { "epoch": 2.133418858242463, "grad_norm": 1.1042076349258423, "learning_rate": 1.4473966145275338e-05, "loss": 0.1222, "step": 3326 }, { "epoch": 2.1340602950609364, "grad_norm": 1.5346829891204834, "learning_rate": 1.4463252624812515e-05, "loss": 0.1305, "step": 3327 }, { "epoch": 2.1347017318794097, "grad_norm": 1.3636589050292969, "learning_rate": 1.4452539104349691e-05, "loss": 0.1267, "step": 3328 }, { "epoch": 2.1353431686978834, "grad_norm": 1.2023379802703857, "learning_rate": 1.4441825583886868e-05, "loss": 0.1311, "step": 3329 }, { "epoch": 2.1359846055163567, "grad_norm": 1.7553794384002686, "learning_rate": 1.4431112063424043e-05, "loss": 0.1364, "step": 3330 }, { "epoch": 2.13662604233483, "grad_norm": 1.085867166519165, "learning_rate": 1.4420398542961217e-05, "loss": 0.1332, "step": 3331 }, { "epoch": 2.1372674791533033, "grad_norm": 0.9327808022499084, "learning_rate": 1.4409685022498392e-05, "loss": 0.1239, "step": 3332 }, { "epoch": 2.137908915971777, "grad_norm": 1.1156717538833618, "learning_rate": 1.4398971502035568e-05, "loss": 0.1285, "step": 3333 }, { "epoch": 2.1385503527902503, "grad_norm": 1.2252061367034912, "learning_rate": 1.4388257981572745e-05, "loss": 0.1288, "step": 3334 }, { "epoch": 2.1391917896087236, "grad_norm": 1.8124932050704956, "learning_rate": 1.4377544461109921e-05, "loss": 0.1512, "step": 3335 }, { "epoch": 2.139833226427197, "grad_norm": 1.249336838722229, "learning_rate": 1.4366830940647096e-05, "loss": 0.122, "step": 3336 }, { "epoch": 2.14047466324567, "grad_norm": 1.2064685821533203, "learning_rate": 1.4356117420184273e-05, "loss": 0.1298, "step": 3337 }, { "epoch": 2.141116100064144, "grad_norm": 1.1896368265151978, "learning_rate": 1.4345403899721449e-05, "loss": 0.1224, "step": 3338 }, { "epoch": 2.141757536882617, "grad_norm": 1.5333672761917114, "learning_rate": 1.4334690379258626e-05, "loss": 0.1323, "step": 3339 }, { "epoch": 2.1423989737010904, "grad_norm": 1.7472892999649048, "learning_rate": 1.4323976858795802e-05, "loss": 0.1432, "step": 3340 }, { "epoch": 2.1430404105195637, "grad_norm": 1.0816901922225952, "learning_rate": 1.4313263338332977e-05, "loss": 0.1244, "step": 3341 }, { "epoch": 2.143681847338037, "grad_norm": 1.3674914836883545, "learning_rate": 1.4302549817870153e-05, "loss": 0.1402, "step": 3342 }, { "epoch": 2.1443232841565107, "grad_norm": 1.1813169717788696, "learning_rate": 1.429183629740733e-05, "loss": 0.1356, "step": 3343 }, { "epoch": 2.144964720974984, "grad_norm": 1.188887596130371, "learning_rate": 1.4281122776944506e-05, "loss": 0.13, "step": 3344 }, { "epoch": 2.1456061577934573, "grad_norm": 1.029232382774353, "learning_rate": 1.4270409256481681e-05, "loss": 0.1292, "step": 3345 }, { "epoch": 2.1462475946119306, "grad_norm": 1.9546935558319092, "learning_rate": 1.4259695736018857e-05, "loss": 0.1486, "step": 3346 }, { "epoch": 2.146889031430404, "grad_norm": 1.1678303480148315, "learning_rate": 1.4248982215556034e-05, "loss": 0.1254, "step": 3347 }, { "epoch": 2.1475304682488776, "grad_norm": 1.3110216856002808, "learning_rate": 1.4238268695093207e-05, "loss": 0.1366, "step": 3348 }, { "epoch": 2.148171905067351, "grad_norm": 1.447609782218933, "learning_rate": 1.4227555174630383e-05, "loss": 0.1376, "step": 3349 }, { "epoch": 2.148813341885824, "grad_norm": 1.1250295639038086, "learning_rate": 1.421684165416756e-05, "loss": 0.1279, "step": 3350 }, { "epoch": 2.1494547787042975, "grad_norm": 0.982209324836731, "learning_rate": 1.4206128133704734e-05, "loss": 0.1255, "step": 3351 }, { "epoch": 2.150096215522771, "grad_norm": 1.347332239151001, "learning_rate": 1.4195414613241911e-05, "loss": 0.1263, "step": 3352 }, { "epoch": 2.1507376523412445, "grad_norm": 0.8055627942085266, "learning_rate": 1.4184701092779087e-05, "loss": 0.1193, "step": 3353 }, { "epoch": 2.1513790891597178, "grad_norm": 1.766980767250061, "learning_rate": 1.4173987572316264e-05, "loss": 0.1539, "step": 3354 }, { "epoch": 2.152020525978191, "grad_norm": 1.0954601764678955, "learning_rate": 1.416327405185344e-05, "loss": 0.1363, "step": 3355 }, { "epoch": 2.1526619627966643, "grad_norm": 1.1229714155197144, "learning_rate": 1.4152560531390615e-05, "loss": 0.1271, "step": 3356 }, { "epoch": 2.153303399615138, "grad_norm": 1.1657748222351074, "learning_rate": 1.4141847010927792e-05, "loss": 0.1226, "step": 3357 }, { "epoch": 2.1539448364336113, "grad_norm": 1.4806276559829712, "learning_rate": 1.4131133490464968e-05, "loss": 0.1267, "step": 3358 }, { "epoch": 2.1545862732520846, "grad_norm": 1.2141311168670654, "learning_rate": 1.4120419970002144e-05, "loss": 0.1357, "step": 3359 }, { "epoch": 2.155227710070558, "grad_norm": 1.3648462295532227, "learning_rate": 1.410970644953932e-05, "loss": 0.1393, "step": 3360 }, { "epoch": 2.1558691468890316, "grad_norm": 1.2501782178878784, "learning_rate": 1.4098992929076496e-05, "loss": 0.1244, "step": 3361 }, { "epoch": 2.156510583707505, "grad_norm": 1.119786262512207, "learning_rate": 1.4088279408613672e-05, "loss": 0.131, "step": 3362 }, { "epoch": 2.157152020525978, "grad_norm": 1.3588346242904663, "learning_rate": 1.4077565888150849e-05, "loss": 0.1413, "step": 3363 }, { "epoch": 2.1577934573444515, "grad_norm": 1.006523609161377, "learning_rate": 1.4066852367688025e-05, "loss": 0.1271, "step": 3364 }, { "epoch": 2.1584348941629248, "grad_norm": 1.3163890838623047, "learning_rate": 1.4056138847225198e-05, "loss": 0.1463, "step": 3365 }, { "epoch": 2.1590763309813985, "grad_norm": 0.9506279230117798, "learning_rate": 1.4045425326762373e-05, "loss": 0.1194, "step": 3366 }, { "epoch": 2.159717767799872, "grad_norm": 0.8393688201904297, "learning_rate": 1.403471180629955e-05, "loss": 0.1263, "step": 3367 }, { "epoch": 2.160359204618345, "grad_norm": 1.2400058507919312, "learning_rate": 1.4023998285836726e-05, "loss": 0.1318, "step": 3368 }, { "epoch": 2.1610006414368184, "grad_norm": 1.1120495796203613, "learning_rate": 1.4013284765373902e-05, "loss": 0.122, "step": 3369 }, { "epoch": 2.1616420782552916, "grad_norm": 1.1685703992843628, "learning_rate": 1.4002571244911079e-05, "loss": 0.1275, "step": 3370 }, { "epoch": 2.1622835150737654, "grad_norm": 1.2495150566101074, "learning_rate": 1.3991857724448253e-05, "loss": 0.1251, "step": 3371 }, { "epoch": 2.1629249518922387, "grad_norm": 1.142805814743042, "learning_rate": 1.398114420398543e-05, "loss": 0.1302, "step": 3372 }, { "epoch": 2.163566388710712, "grad_norm": 1.2119457721710205, "learning_rate": 1.3970430683522606e-05, "loss": 0.1303, "step": 3373 }, { "epoch": 2.1642078255291852, "grad_norm": 1.201508641242981, "learning_rate": 1.3959717163059783e-05, "loss": 0.1296, "step": 3374 }, { "epoch": 2.164849262347659, "grad_norm": 1.0869791507720947, "learning_rate": 1.3949003642596958e-05, "loss": 0.1281, "step": 3375 }, { "epoch": 2.1654906991661322, "grad_norm": 1.1712199449539185, "learning_rate": 1.3938290122134134e-05, "loss": 0.1275, "step": 3376 }, { "epoch": 2.1661321359846055, "grad_norm": 1.9118354320526123, "learning_rate": 1.392757660167131e-05, "loss": 0.1365, "step": 3377 }, { "epoch": 2.166773572803079, "grad_norm": 1.2496776580810547, "learning_rate": 1.3916863081208487e-05, "loss": 0.1391, "step": 3378 }, { "epoch": 2.167415009621552, "grad_norm": 1.4250373840332031, "learning_rate": 1.3906149560745663e-05, "loss": 0.1199, "step": 3379 }, { "epoch": 2.168056446440026, "grad_norm": 1.2470331192016602, "learning_rate": 1.3895436040282838e-05, "loss": 0.1255, "step": 3380 }, { "epoch": 2.168697883258499, "grad_norm": 1.335131049156189, "learning_rate": 1.3884722519820015e-05, "loss": 0.1284, "step": 3381 }, { "epoch": 2.1693393200769724, "grad_norm": 1.181604266166687, "learning_rate": 1.3874008999357188e-05, "loss": 0.1251, "step": 3382 }, { "epoch": 2.1699807568954457, "grad_norm": 1.286368727684021, "learning_rate": 1.3863295478894364e-05, "loss": 0.1302, "step": 3383 }, { "epoch": 2.1706221937139194, "grad_norm": 1.8886438608169556, "learning_rate": 1.385258195843154e-05, "loss": 0.1826, "step": 3384 }, { "epoch": 2.1712636305323927, "grad_norm": 1.0905652046203613, "learning_rate": 1.3841868437968717e-05, "loss": 0.1225, "step": 3385 }, { "epoch": 2.171905067350866, "grad_norm": 1.3887574672698975, "learning_rate": 1.3831154917505892e-05, "loss": 0.1422, "step": 3386 }, { "epoch": 2.1725465041693393, "grad_norm": 1.609331727027893, "learning_rate": 1.3820441397043068e-05, "loss": 0.1474, "step": 3387 }, { "epoch": 2.1731879409878125, "grad_norm": 1.3335047960281372, "learning_rate": 1.3809727876580245e-05, "loss": 0.1349, "step": 3388 }, { "epoch": 2.1738293778062863, "grad_norm": 1.328471302986145, "learning_rate": 1.3799014356117421e-05, "loss": 0.1249, "step": 3389 }, { "epoch": 2.1744708146247596, "grad_norm": 1.1765403747558594, "learning_rate": 1.3788300835654596e-05, "loss": 0.1397, "step": 3390 }, { "epoch": 2.175112251443233, "grad_norm": 1.4387123584747314, "learning_rate": 1.3777587315191772e-05, "loss": 0.1315, "step": 3391 }, { "epoch": 2.175753688261706, "grad_norm": 2.182009220123291, "learning_rate": 1.3766873794728949e-05, "loss": 0.1942, "step": 3392 }, { "epoch": 2.1763951250801794, "grad_norm": 1.226906180381775, "learning_rate": 1.3756160274266125e-05, "loss": 0.1394, "step": 3393 }, { "epoch": 2.177036561898653, "grad_norm": 1.174424409866333, "learning_rate": 1.3745446753803302e-05, "loss": 0.1321, "step": 3394 }, { "epoch": 2.1776779987171264, "grad_norm": 1.2192493677139282, "learning_rate": 1.3734733233340476e-05, "loss": 0.1342, "step": 3395 }, { "epoch": 2.1783194355355997, "grad_norm": 1.468245029449463, "learning_rate": 1.3724019712877653e-05, "loss": 0.1516, "step": 3396 }, { "epoch": 2.178960872354073, "grad_norm": 1.1859923601150513, "learning_rate": 1.371330619241483e-05, "loss": 0.1189, "step": 3397 }, { "epoch": 2.1796023091725463, "grad_norm": 1.238158941268921, "learning_rate": 1.3702592671952006e-05, "loss": 0.1174, "step": 3398 }, { "epoch": 2.18024374599102, "grad_norm": 1.0451691150665283, "learning_rate": 1.3691879151489179e-05, "loss": 0.1245, "step": 3399 }, { "epoch": 2.1808851828094933, "grad_norm": 2.263279914855957, "learning_rate": 1.3681165631026355e-05, "loss": 0.1413, "step": 3400 }, { "epoch": 2.1815266196279666, "grad_norm": 0.958166778087616, "learning_rate": 1.367045211056353e-05, "loss": 0.1222, "step": 3401 }, { "epoch": 2.18216805644644, "grad_norm": 1.2565208673477173, "learning_rate": 1.3659738590100707e-05, "loss": 0.1361, "step": 3402 }, { "epoch": 2.1828094932649136, "grad_norm": 1.2230037450790405, "learning_rate": 1.3649025069637883e-05, "loss": 0.1322, "step": 3403 }, { "epoch": 2.183450930083387, "grad_norm": 1.606473684310913, "learning_rate": 1.363831154917506e-05, "loss": 0.141, "step": 3404 }, { "epoch": 2.18409236690186, "grad_norm": 1.0548357963562012, "learning_rate": 1.3627598028712236e-05, "loss": 0.1324, "step": 3405 }, { "epoch": 2.1847338037203334, "grad_norm": 1.1043615341186523, "learning_rate": 1.361688450824941e-05, "loss": 0.1276, "step": 3406 }, { "epoch": 2.185375240538807, "grad_norm": 1.115897297859192, "learning_rate": 1.3606170987786587e-05, "loss": 0.1332, "step": 3407 }, { "epoch": 2.1860166773572804, "grad_norm": 1.2546430826187134, "learning_rate": 1.3595457467323764e-05, "loss": 0.1323, "step": 3408 }, { "epoch": 2.1866581141757537, "grad_norm": 1.0784903764724731, "learning_rate": 1.358474394686094e-05, "loss": 0.13, "step": 3409 }, { "epoch": 2.187299550994227, "grad_norm": 0.970577597618103, "learning_rate": 1.3574030426398115e-05, "loss": 0.1275, "step": 3410 }, { "epoch": 2.1879409878127003, "grad_norm": 1.3489528894424438, "learning_rate": 1.3563316905935291e-05, "loss": 0.1362, "step": 3411 }, { "epoch": 2.188582424631174, "grad_norm": 1.3228013515472412, "learning_rate": 1.3552603385472468e-05, "loss": 0.1366, "step": 3412 }, { "epoch": 2.1892238614496473, "grad_norm": 1.3672813177108765, "learning_rate": 1.3541889865009644e-05, "loss": 0.1404, "step": 3413 }, { "epoch": 2.1898652982681206, "grad_norm": 1.0056146383285522, "learning_rate": 1.353117634454682e-05, "loss": 0.1163, "step": 3414 }, { "epoch": 2.190506735086594, "grad_norm": 1.2280715703964233, "learning_rate": 1.3520462824083995e-05, "loss": 0.1375, "step": 3415 }, { "epoch": 2.191148171905067, "grad_norm": 1.0486209392547607, "learning_rate": 1.3509749303621168e-05, "loss": 0.1184, "step": 3416 }, { "epoch": 2.191789608723541, "grad_norm": 1.3412964344024658, "learning_rate": 1.3499035783158345e-05, "loss": 0.1417, "step": 3417 }, { "epoch": 2.192431045542014, "grad_norm": 1.2448071241378784, "learning_rate": 1.3488322262695521e-05, "loss": 0.1446, "step": 3418 }, { "epoch": 2.1930724823604875, "grad_norm": 1.149255633354187, "learning_rate": 1.3477608742232698e-05, "loss": 0.1307, "step": 3419 }, { "epoch": 2.1937139191789607, "grad_norm": 1.5785905122756958, "learning_rate": 1.3466895221769874e-05, "loss": 0.1482, "step": 3420 }, { "epoch": 2.194355355997434, "grad_norm": 1.0181591510772705, "learning_rate": 1.3456181701307049e-05, "loss": 0.1248, "step": 3421 }, { "epoch": 2.1949967928159078, "grad_norm": 1.101881980895996, "learning_rate": 1.3445468180844225e-05, "loss": 0.1322, "step": 3422 }, { "epoch": 2.195638229634381, "grad_norm": 1.2536733150482178, "learning_rate": 1.3434754660381402e-05, "loss": 0.138, "step": 3423 }, { "epoch": 2.1962796664528543, "grad_norm": 1.019513726234436, "learning_rate": 1.3424041139918578e-05, "loss": 0.1292, "step": 3424 }, { "epoch": 2.1969211032713276, "grad_norm": 0.9656012058258057, "learning_rate": 1.3413327619455753e-05, "loss": 0.1129, "step": 3425 }, { "epoch": 2.1975625400898013, "grad_norm": 1.1566102504730225, "learning_rate": 1.340261409899293e-05, "loss": 0.1391, "step": 3426 }, { "epoch": 2.1982039769082746, "grad_norm": 1.022811770439148, "learning_rate": 1.3391900578530106e-05, "loss": 0.135, "step": 3427 }, { "epoch": 2.198845413726748, "grad_norm": 1.3756805658340454, "learning_rate": 1.3381187058067283e-05, "loss": 0.1396, "step": 3428 }, { "epoch": 2.199486850545221, "grad_norm": 1.070984959602356, "learning_rate": 1.3370473537604459e-05, "loss": 0.1347, "step": 3429 }, { "epoch": 2.2001282873636945, "grad_norm": 2.461007833480835, "learning_rate": 1.3359760017141634e-05, "loss": 0.1604, "step": 3430 }, { "epoch": 2.200769724182168, "grad_norm": 1.3827743530273438, "learning_rate": 1.334904649667881e-05, "loss": 0.1311, "step": 3431 }, { "epoch": 2.2014111610006415, "grad_norm": 1.5786510705947876, "learning_rate": 1.3338332976215987e-05, "loss": 0.149, "step": 3432 }, { "epoch": 2.2020525978191148, "grad_norm": 1.464525818824768, "learning_rate": 1.3327619455753163e-05, "loss": 0.1345, "step": 3433 }, { "epoch": 2.202694034637588, "grad_norm": 1.592724323272705, "learning_rate": 1.3316905935290336e-05, "loss": 0.1439, "step": 3434 }, { "epoch": 2.203335471456062, "grad_norm": 0.8879175186157227, "learning_rate": 1.3306192414827513e-05, "loss": 0.1219, "step": 3435 }, { "epoch": 2.203976908274535, "grad_norm": 1.353562593460083, "learning_rate": 1.3295478894364687e-05, "loss": 0.1404, "step": 3436 }, { "epoch": 2.2046183450930084, "grad_norm": 1.072875738143921, "learning_rate": 1.3284765373901864e-05, "loss": 0.1302, "step": 3437 }, { "epoch": 2.2052597819114816, "grad_norm": 1.5878688097000122, "learning_rate": 1.327405185343904e-05, "loss": 0.1481, "step": 3438 }, { "epoch": 2.205901218729955, "grad_norm": 1.5456109046936035, "learning_rate": 1.3263338332976217e-05, "loss": 0.1372, "step": 3439 }, { "epoch": 2.2065426555484287, "grad_norm": 0.9877792000770569, "learning_rate": 1.3252624812513392e-05, "loss": 0.1268, "step": 3440 }, { "epoch": 2.207184092366902, "grad_norm": 1.0013548135757446, "learning_rate": 1.3241911292050568e-05, "loss": 0.125, "step": 3441 }, { "epoch": 2.2078255291853752, "grad_norm": 1.1555293798446655, "learning_rate": 1.3231197771587744e-05, "loss": 0.1244, "step": 3442 }, { "epoch": 2.2084669660038485, "grad_norm": 1.383288025856018, "learning_rate": 1.3220484251124921e-05, "loss": 0.1378, "step": 3443 }, { "epoch": 2.209108402822322, "grad_norm": 1.0463651418685913, "learning_rate": 1.3209770730662097e-05, "loss": 0.1291, "step": 3444 }, { "epoch": 2.2097498396407955, "grad_norm": 1.1637860536575317, "learning_rate": 1.3199057210199272e-05, "loss": 0.1351, "step": 3445 }, { "epoch": 2.210391276459269, "grad_norm": 1.4647759199142456, "learning_rate": 1.3188343689736449e-05, "loss": 0.1417, "step": 3446 }, { "epoch": 2.211032713277742, "grad_norm": 1.5512422323226929, "learning_rate": 1.3177630169273625e-05, "loss": 0.1418, "step": 3447 }, { "epoch": 2.2116741500962154, "grad_norm": 1.3450912237167358, "learning_rate": 1.3166916648810801e-05, "loss": 0.1273, "step": 3448 }, { "epoch": 2.212315586914689, "grad_norm": 1.2855844497680664, "learning_rate": 1.3156203128347976e-05, "loss": 0.1413, "step": 3449 }, { "epoch": 2.2129570237331624, "grad_norm": 1.1604161262512207, "learning_rate": 1.3145489607885153e-05, "loss": 0.1379, "step": 3450 }, { "epoch": 2.2135984605516357, "grad_norm": 1.1479030847549438, "learning_rate": 1.3134776087422326e-05, "loss": 0.1273, "step": 3451 }, { "epoch": 2.214239897370109, "grad_norm": 1.1099894046783447, "learning_rate": 1.3124062566959502e-05, "loss": 0.1361, "step": 3452 }, { "epoch": 2.2148813341885822, "grad_norm": 1.2196506261825562, "learning_rate": 1.3113349046496679e-05, "loss": 0.1265, "step": 3453 }, { "epoch": 2.215522771007056, "grad_norm": 1.037832498550415, "learning_rate": 1.3102635526033855e-05, "loss": 0.1226, "step": 3454 }, { "epoch": 2.2161642078255293, "grad_norm": 1.0301125049591064, "learning_rate": 1.309192200557103e-05, "loss": 0.1261, "step": 3455 }, { "epoch": 2.2168056446440025, "grad_norm": 0.9804339408874512, "learning_rate": 1.3081208485108206e-05, "loss": 0.1281, "step": 3456 }, { "epoch": 2.217447081462476, "grad_norm": 1.058591365814209, "learning_rate": 1.3070494964645383e-05, "loss": 0.13, "step": 3457 }, { "epoch": 2.2180885182809496, "grad_norm": 1.319521188735962, "learning_rate": 1.305978144418256e-05, "loss": 0.1506, "step": 3458 }, { "epoch": 2.218729955099423, "grad_norm": 1.424768328666687, "learning_rate": 1.3049067923719736e-05, "loss": 0.1437, "step": 3459 }, { "epoch": 2.219371391917896, "grad_norm": 1.084383487701416, "learning_rate": 1.303835440325691e-05, "loss": 0.1342, "step": 3460 }, { "epoch": 2.2200128287363694, "grad_norm": 1.5632405281066895, "learning_rate": 1.3027640882794087e-05, "loss": 0.1487, "step": 3461 }, { "epoch": 2.2206542655548427, "grad_norm": 1.1561875343322754, "learning_rate": 1.3016927362331263e-05, "loss": 0.1328, "step": 3462 }, { "epoch": 2.2212957023733164, "grad_norm": 1.6152353286743164, "learning_rate": 1.300621384186844e-05, "loss": 0.156, "step": 3463 }, { "epoch": 2.2219371391917897, "grad_norm": 0.9734735488891602, "learning_rate": 1.2995500321405615e-05, "loss": 0.1212, "step": 3464 }, { "epoch": 2.222578576010263, "grad_norm": 1.263521432876587, "learning_rate": 1.2984786800942791e-05, "loss": 0.1344, "step": 3465 }, { "epoch": 2.2232200128287363, "grad_norm": 1.073771595954895, "learning_rate": 1.2974073280479967e-05, "loss": 0.1254, "step": 3466 }, { "epoch": 2.2238614496472096, "grad_norm": 1.039601445198059, "learning_rate": 1.2963359760017144e-05, "loss": 0.1301, "step": 3467 }, { "epoch": 2.2245028864656833, "grad_norm": 1.0616734027862549, "learning_rate": 1.2952646239554317e-05, "loss": 0.1344, "step": 3468 }, { "epoch": 2.2251443232841566, "grad_norm": 1.9030003547668457, "learning_rate": 1.2941932719091493e-05, "loss": 0.1708, "step": 3469 }, { "epoch": 2.22578576010263, "grad_norm": 1.34011971950531, "learning_rate": 1.2931219198628668e-05, "loss": 0.1573, "step": 3470 }, { "epoch": 2.226427196921103, "grad_norm": 0.7514581680297852, "learning_rate": 1.2920505678165845e-05, "loss": 0.1161, "step": 3471 }, { "epoch": 2.2270686337395764, "grad_norm": 1.39512300491333, "learning_rate": 1.2909792157703021e-05, "loss": 0.1312, "step": 3472 }, { "epoch": 2.22771007055805, "grad_norm": 1.0731029510498047, "learning_rate": 1.2899078637240198e-05, "loss": 0.1243, "step": 3473 }, { "epoch": 2.2283515073765234, "grad_norm": 1.0379931926727295, "learning_rate": 1.2888365116777374e-05, "loss": 0.1269, "step": 3474 }, { "epoch": 2.2289929441949967, "grad_norm": 1.115376353263855, "learning_rate": 1.2877651596314549e-05, "loss": 0.1338, "step": 3475 }, { "epoch": 2.22963438101347, "grad_norm": 2.0044617652893066, "learning_rate": 1.2866938075851725e-05, "loss": 0.1601, "step": 3476 }, { "epoch": 2.2302758178319437, "grad_norm": 1.0005751848220825, "learning_rate": 1.2856224555388902e-05, "loss": 0.1407, "step": 3477 }, { "epoch": 2.230917254650417, "grad_norm": 1.2969173192977905, "learning_rate": 1.2845511034926078e-05, "loss": 0.1314, "step": 3478 }, { "epoch": 2.2315586914688903, "grad_norm": 1.0895812511444092, "learning_rate": 1.2834797514463253e-05, "loss": 0.1185, "step": 3479 }, { "epoch": 2.2322001282873636, "grad_norm": 1.116783618927002, "learning_rate": 1.282408399400043e-05, "loss": 0.1296, "step": 3480 }, { "epoch": 2.232841565105837, "grad_norm": 1.819164752960205, "learning_rate": 1.2813370473537606e-05, "loss": 0.1514, "step": 3481 }, { "epoch": 2.2334830019243106, "grad_norm": 0.9508031010627747, "learning_rate": 1.2802656953074782e-05, "loss": 0.1192, "step": 3482 }, { "epoch": 2.234124438742784, "grad_norm": 1.4851040840148926, "learning_rate": 1.2791943432611959e-05, "loss": 0.143, "step": 3483 }, { "epoch": 2.234765875561257, "grad_norm": 0.8771987557411194, "learning_rate": 1.2781229912149133e-05, "loss": 0.1208, "step": 3484 }, { "epoch": 2.2354073123797304, "grad_norm": 1.5004953145980835, "learning_rate": 1.2770516391686308e-05, "loss": 0.1451, "step": 3485 }, { "epoch": 2.236048749198204, "grad_norm": 1.1665327548980713, "learning_rate": 1.2759802871223483e-05, "loss": 0.1364, "step": 3486 }, { "epoch": 2.2366901860166775, "grad_norm": 1.2753480672836304, "learning_rate": 1.274908935076066e-05, "loss": 0.1334, "step": 3487 }, { "epoch": 2.2373316228351507, "grad_norm": 0.8614938855171204, "learning_rate": 1.2738375830297836e-05, "loss": 0.1214, "step": 3488 }, { "epoch": 2.237973059653624, "grad_norm": 0.9049705266952515, "learning_rate": 1.2727662309835012e-05, "loss": 0.1273, "step": 3489 }, { "epoch": 2.2386144964720973, "grad_norm": 0.9845554232597351, "learning_rate": 1.2716948789372187e-05, "loss": 0.1252, "step": 3490 }, { "epoch": 2.239255933290571, "grad_norm": 1.1550410985946655, "learning_rate": 1.2706235268909364e-05, "loss": 0.1397, "step": 3491 }, { "epoch": 2.2398973701090443, "grad_norm": 1.6611515283584595, "learning_rate": 1.269552174844654e-05, "loss": 0.1487, "step": 3492 }, { "epoch": 2.2405388069275176, "grad_norm": 1.121070146560669, "learning_rate": 1.2684808227983716e-05, "loss": 0.1307, "step": 3493 }, { "epoch": 2.241180243745991, "grad_norm": 1.0716403722763062, "learning_rate": 1.2674094707520893e-05, "loss": 0.1334, "step": 3494 }, { "epoch": 2.241821680564464, "grad_norm": 1.2707160711288452, "learning_rate": 1.2663381187058068e-05, "loss": 0.142, "step": 3495 }, { "epoch": 2.242463117382938, "grad_norm": 1.628706693649292, "learning_rate": 1.2652667666595244e-05, "loss": 0.1634, "step": 3496 }, { "epoch": 2.243104554201411, "grad_norm": 1.1729011535644531, "learning_rate": 1.264195414613242e-05, "loss": 0.129, "step": 3497 }, { "epoch": 2.2437459910198845, "grad_norm": 1.154311180114746, "learning_rate": 1.2631240625669597e-05, "loss": 0.1443, "step": 3498 }, { "epoch": 2.2443874278383578, "grad_norm": 1.5475869178771973, "learning_rate": 1.2620527105206772e-05, "loss": 0.1508, "step": 3499 }, { "epoch": 2.2450288646568315, "grad_norm": 1.1911468505859375, "learning_rate": 1.2609813584743948e-05, "loss": 0.1258, "step": 3500 } ], "logging_steps": 1, "max_steps": 4677, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.4029500652272845e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }