{ "best_metric": 0.958063608218407, "best_model_checkpoint": "melanoma-v4\\checkpoint-69000", "epoch": 5.0, "eval_steps": 1000, "global_step": 71615, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00349088878028346, "grad_norm": 44.0053596496582, "learning_rate": 3.1415805640882436e-07, "loss": 0.6936, "step": 50 }, { "epoch": 0.00698177756056692, "grad_norm": 52.222103118896484, "learning_rate": 6.632225635297403e-07, "loss": 0.6915, "step": 100 }, { "epoch": 0.010472666340850381, "grad_norm": 34.52608108520508, "learning_rate": 1.0053057805082381e-06, "loss": 0.6849, "step": 150 }, { "epoch": 0.01396355512113384, "grad_norm": 23.362762451171875, "learning_rate": 1.3543702876291538e-06, "loss": 0.6753, "step": 200 }, { "epoch": 0.017454443901417302, "grad_norm": 55.74956512451172, "learning_rate": 1.7034347947500698e-06, "loss": 0.6634, "step": 250 }, { "epoch": 0.020945332681700762, "grad_norm": 14.746822357177734, "learning_rate": 2.052499301870986e-06, "loss": 0.6475, "step": 300 }, { "epoch": 0.024436221461984223, "grad_norm": 22.278230667114258, "learning_rate": 2.401563808991902e-06, "loss": 0.6281, "step": 350 }, { "epoch": 0.02792711024226768, "grad_norm": 23.80645751953125, "learning_rate": 2.7506283161128178e-06, "loss": 0.6048, "step": 400 }, { "epoch": 0.031417999022551144, "grad_norm": 16.161684036254883, "learning_rate": 3.0996928232337337e-06, "loss": 0.5785, "step": 450 }, { "epoch": 0.034908887802834604, "grad_norm": 19.891162872314453, "learning_rate": 3.4487573303546497e-06, "loss": 0.5491, "step": 500 }, { "epoch": 0.038399776583118064, "grad_norm": 25.53693199157715, "learning_rate": 3.797821837475566e-06, "loss": 0.5138, "step": 550 }, { "epoch": 0.041890665363401525, "grad_norm": 36.38960647583008, "learning_rate": 4.146886344596482e-06, "loss": 0.482, "step": 600 }, { "epoch": 0.045381554143684985, "grad_norm": 29.26142120361328, "learning_rate": 4.4959508517173975e-06, "loss": 0.4375, "step": 650 }, { "epoch": 0.048872442923968445, "grad_norm": 55.49436950683594, "learning_rate": 4.845015358838314e-06, "loss": 0.4107, "step": 700 }, { "epoch": 0.052363331704251906, "grad_norm": 29.13718605041504, "learning_rate": 5.194079865959229e-06, "loss": 0.3595, "step": 750 }, { "epoch": 0.05585422048453536, "grad_norm": 27.340045928955078, "learning_rate": 5.543144373080146e-06, "loss": 0.313, "step": 800 }, { "epoch": 0.05934510926481882, "grad_norm": 18.835979461669922, "learning_rate": 5.892208880201061e-06, "loss": 0.2754, "step": 850 }, { "epoch": 0.06283599804510229, "grad_norm": 28.285400390625, "learning_rate": 6.2412733873219775e-06, "loss": 0.2423, "step": 900 }, { "epoch": 0.06632688682538575, "grad_norm": 14.74078369140625, "learning_rate": 6.590337894442894e-06, "loss": 0.1988, "step": 950 }, { "epoch": 0.06981777560566921, "grad_norm": 12.510848999023438, "learning_rate": 6.939402401563809e-06, "loss": 0.193, "step": 1000 }, { "epoch": 0.06981777560566921, "eval_f1": 0.0, "eval_loss": 1.0061070919036865, "eval_runtime": 16.8276, "eval_samples_per_second": 217.5, "eval_steps_per_second": 6.834, "step": 1000 }, { "epoch": 0.07330866438595267, "grad_norm": 24.07497215270996, "learning_rate": 7.288466908684726e-06, "loss": 0.1646, "step": 1050 }, { "epoch": 0.07679955316623613, "grad_norm": 34.75226593017578, "learning_rate": 7.637531415805642e-06, "loss": 0.1535, "step": 1100 }, { "epoch": 0.08029044194651959, "grad_norm": 8.6325101852417, "learning_rate": 7.986595922926558e-06, "loss": 0.1338, "step": 1150 }, { "epoch": 0.08378133072680305, "grad_norm": 7.621668338775635, "learning_rate": 8.335660430047473e-06, "loss": 0.1306, "step": 1200 }, { "epoch": 0.08727221950708651, "grad_norm": 12.712471008300781, "learning_rate": 8.68472493716839e-06, "loss": 0.1297, "step": 1250 }, { "epoch": 0.09076310828736997, "grad_norm": 24.750577926635742, "learning_rate": 9.033789444289304e-06, "loss": 0.1106, "step": 1300 }, { "epoch": 0.09425399706765343, "grad_norm": 6.89915657043457, "learning_rate": 9.382853951410221e-06, "loss": 0.087, "step": 1350 }, { "epoch": 0.09774488584793689, "grad_norm": 31.87893295288086, "learning_rate": 9.731918458531137e-06, "loss": 0.1054, "step": 1400 }, { "epoch": 0.10123577462822035, "grad_norm": 18.434345245361328, "learning_rate": 1.0080982965652052e-05, "loss": 0.1052, "step": 1450 }, { "epoch": 0.10472666340850381, "grad_norm": 2.470731258392334, "learning_rate": 1.043004747277297e-05, "loss": 0.0819, "step": 1500 }, { "epoch": 0.10821755218878727, "grad_norm": 4.6369853019714355, "learning_rate": 1.0779111979893885e-05, "loss": 0.0817, "step": 1550 }, { "epoch": 0.11170844096907072, "grad_norm": 59.104862213134766, "learning_rate": 1.11281764870148e-05, "loss": 0.0819, "step": 1600 }, { "epoch": 0.11519932974935418, "grad_norm": 46.61822509765625, "learning_rate": 1.1477240994135718e-05, "loss": 0.0556, "step": 1650 }, { "epoch": 0.11869021852963764, "grad_norm": 46.31196212768555, "learning_rate": 1.1826305501256633e-05, "loss": 0.0828, "step": 1700 }, { "epoch": 0.1221811073099211, "grad_norm": 3.6305160522460938, "learning_rate": 1.2175370008377549e-05, "loss": 0.0857, "step": 1750 }, { "epoch": 0.12567199609020457, "grad_norm": 6.165530204772949, "learning_rate": 1.2524434515498464e-05, "loss": 0.0811, "step": 1800 }, { "epoch": 0.12916288487048802, "grad_norm": 2.393606185913086, "learning_rate": 1.2873499022619382e-05, "loss": 0.0601, "step": 1850 }, { "epoch": 0.1326537736507715, "grad_norm": 9.886051177978516, "learning_rate": 1.3222563529740297e-05, "loss": 0.0757, "step": 1900 }, { "epoch": 0.13614466243105494, "grad_norm": 9.62782096862793, "learning_rate": 1.3571628036861211e-05, "loss": 0.0798, "step": 1950 }, { "epoch": 0.13963555121133842, "grad_norm": 4.66179895401001, "learning_rate": 1.3920692543982128e-05, "loss": 0.0585, "step": 2000 }, { "epoch": 0.13963555121133842, "eval_f1": 0.4475823705605477, "eval_loss": 1.2610578536987305, "eval_runtime": 16.5432, "eval_samples_per_second": 221.239, "eval_steps_per_second": 6.951, "step": 2000 }, { "epoch": 0.14312643999162186, "grad_norm": 7.12888240814209, "learning_rate": 1.4269757051103044e-05, "loss": 0.0682, "step": 2050 }, { "epoch": 0.14661732877190534, "grad_norm": 35.82215881347656, "learning_rate": 1.461882155822396e-05, "loss": 0.0577, "step": 2100 }, { "epoch": 0.15010821755218878, "grad_norm": 6.529672145843506, "learning_rate": 1.4960904775202458e-05, "loss": 0.0922, "step": 2150 }, { "epoch": 0.15359910633247226, "grad_norm": 8.664837837219238, "learning_rate": 1.5309969282323375e-05, "loss": 0.0574, "step": 2200 }, { "epoch": 0.1570899951127557, "grad_norm": 2.3119335174560547, "learning_rate": 1.5659033789444292e-05, "loss": 0.0593, "step": 2250 }, { "epoch": 0.16058088389303918, "grad_norm": 1.3561939001083374, "learning_rate": 1.6008098296565206e-05, "loss": 0.066, "step": 2300 }, { "epoch": 0.16407177267332262, "grad_norm": 38.09425735473633, "learning_rate": 1.635716280368612e-05, "loss": 0.0757, "step": 2350 }, { "epoch": 0.1675626614536061, "grad_norm": 61.25293731689453, "learning_rate": 1.6706227310807037e-05, "loss": 0.0778, "step": 2400 }, { "epoch": 0.17105355023388955, "grad_norm": 39.29595947265625, "learning_rate": 1.7055291817927955e-05, "loss": 0.0775, "step": 2450 }, { "epoch": 0.17454443901417302, "grad_norm": 35.1599235534668, "learning_rate": 1.7404356325048872e-05, "loss": 0.0775, "step": 2500 }, { "epoch": 0.17803532779445647, "grad_norm": 0.5723768472671509, "learning_rate": 1.7753420832169786e-05, "loss": 0.0658, "step": 2550 }, { "epoch": 0.18152621657473994, "grad_norm": 1.6450070142745972, "learning_rate": 1.81024853392907e-05, "loss": 0.0461, "step": 2600 }, { "epoch": 0.1850171053550234, "grad_norm": 0.417278528213501, "learning_rate": 1.8451549846411617e-05, "loss": 0.0648, "step": 2650 }, { "epoch": 0.18850799413530686, "grad_norm": 14.564908981323242, "learning_rate": 1.8800614353532534e-05, "loss": 0.0573, "step": 2700 }, { "epoch": 0.1919988829155903, "grad_norm": 0.4077770411968231, "learning_rate": 1.914967886065345e-05, "loss": 0.0695, "step": 2750 }, { "epoch": 0.19548977169587378, "grad_norm": 0.27182820439338684, "learning_rate": 1.9498743367774365e-05, "loss": 0.043, "step": 2800 }, { "epoch": 0.19898066047615723, "grad_norm": 3.3724520206451416, "learning_rate": 1.9847807874895282e-05, "loss": 0.0765, "step": 2850 }, { "epoch": 0.2024715492564407, "grad_norm": 27.066162109375, "learning_rate": 2.0196872382016196e-05, "loss": 0.047, "step": 2900 }, { "epoch": 0.20596243803672415, "grad_norm": 0.23048856854438782, "learning_rate": 2.0545936889137113e-05, "loss": 0.0589, "step": 2950 }, { "epoch": 0.20945332681700762, "grad_norm": 0.6853739619255066, "learning_rate": 2.089500139625803e-05, "loss": 0.0723, "step": 3000 }, { "epoch": 0.20945332681700762, "eval_f1": 0.8094940662086196, "eval_loss": 0.7442474961280823, "eval_runtime": 17.1452, "eval_samples_per_second": 213.471, "eval_steps_per_second": 6.707, "step": 3000 }, { "epoch": 0.21294421559729107, "grad_norm": 71.01643371582031, "learning_rate": 2.1244065903378947e-05, "loss": 0.0696, "step": 3050 }, { "epoch": 0.21643510437757454, "grad_norm": 54.42838668823242, "learning_rate": 2.159313041049986e-05, "loss": 0.0443, "step": 3100 }, { "epoch": 0.219925993157858, "grad_norm": 0.15738917887210846, "learning_rate": 2.194219491762078e-05, "loss": 0.0569, "step": 3150 }, { "epoch": 0.22341688193814144, "grad_norm": 0.6543312668800354, "learning_rate": 2.2291259424741692e-05, "loss": 0.0637, "step": 3200 }, { "epoch": 0.2269077707184249, "grad_norm": 0.44767776131629944, "learning_rate": 2.264032393186261e-05, "loss": 0.0498, "step": 3250 }, { "epoch": 0.23039865949870836, "grad_norm": 0.09834372252225876, "learning_rate": 2.2989388438983527e-05, "loss": 0.0344, "step": 3300 }, { "epoch": 0.23388954827899183, "grad_norm": 83.13866424560547, "learning_rate": 2.333845294610444e-05, "loss": 0.05, "step": 3350 }, { "epoch": 0.23738043705927528, "grad_norm": 14.352801322937012, "learning_rate": 2.3687517453225358e-05, "loss": 0.0536, "step": 3400 }, { "epoch": 0.24087132583955875, "grad_norm": 1.1875745058059692, "learning_rate": 2.4036581960346275e-05, "loss": 0.0617, "step": 3450 }, { "epoch": 0.2443622146198422, "grad_norm": 6.531946659088135, "learning_rate": 2.438564646746719e-05, "loss": 0.0636, "step": 3500 }, { "epoch": 0.24785310340012567, "grad_norm": 0.0965203046798706, "learning_rate": 2.4734710974588106e-05, "loss": 0.0423, "step": 3550 }, { "epoch": 0.25134399218040915, "grad_norm": 0.07983570545911789, "learning_rate": 2.508377548170902e-05, "loss": 0.036, "step": 3600 }, { "epoch": 0.25483488096069257, "grad_norm": 0.05020700767636299, "learning_rate": 2.5432839988829937e-05, "loss": 0.0448, "step": 3650 }, { "epoch": 0.25832576974097604, "grad_norm": 23.652816772460938, "learning_rate": 2.5781904495950854e-05, "loss": 0.0709, "step": 3700 }, { "epoch": 0.2618166585212595, "grad_norm": 0.33149588108062744, "learning_rate": 2.613096900307177e-05, "loss": 0.0426, "step": 3750 }, { "epoch": 0.265307547301543, "grad_norm": 57.82050704956055, "learning_rate": 2.6480033510192685e-05, "loss": 0.0535, "step": 3800 }, { "epoch": 0.2687984360818264, "grad_norm": 0.3976256847381592, "learning_rate": 2.6829098017313602e-05, "loss": 0.0591, "step": 3850 }, { "epoch": 0.2722893248621099, "grad_norm": 3.0774986743927, "learning_rate": 2.717816252443452e-05, "loss": 0.0312, "step": 3900 }, { "epoch": 0.27578021364239336, "grad_norm": 91.02516174316406, "learning_rate": 2.752722703155543e-05, "loss": 0.0541, "step": 3950 }, { "epoch": 0.27927110242267683, "grad_norm": 0.111998550593853, "learning_rate": 2.7876291538676347e-05, "loss": 0.0442, "step": 4000 }, { "epoch": 0.27927110242267683, "eval_f1": 0.8585769117212873, "eval_loss": 0.41572797298431396, "eval_runtime": 23.9384, "eval_samples_per_second": 152.893, "eval_steps_per_second": 4.804, "step": 4000 }, { "epoch": 0.28276199120296025, "grad_norm": 0.1660720556974411, "learning_rate": 2.8225356045797264e-05, "loss": 0.0418, "step": 4050 }, { "epoch": 0.2862528799832437, "grad_norm": 2.238628387451172, "learning_rate": 2.8574420552918178e-05, "loss": 0.0578, "step": 4100 }, { "epoch": 0.2897437687635272, "grad_norm": 34.74903106689453, "learning_rate": 2.8923485060039095e-05, "loss": 0.0474, "step": 4150 }, { "epoch": 0.2932346575438107, "grad_norm": 0.0780772715806961, "learning_rate": 2.9272549567160013e-05, "loss": 0.0311, "step": 4200 }, { "epoch": 0.2967255463240941, "grad_norm": 112.21017456054688, "learning_rate": 2.962161407428093e-05, "loss": 0.0683, "step": 4250 }, { "epoch": 0.30021643510437757, "grad_norm": 92.58731842041016, "learning_rate": 2.9970678581401847e-05, "loss": 0.0481, "step": 4300 }, { "epoch": 0.30370732388466104, "grad_norm": 0.12998327612876892, "learning_rate": 3.031974308852276e-05, "loss": 0.0473, "step": 4350 }, { "epoch": 0.3071982126649445, "grad_norm": 26.06481170654297, "learning_rate": 3.066880759564368e-05, "loss": 0.0474, "step": 4400 }, { "epoch": 0.31068910144522793, "grad_norm": 18.633922576904297, "learning_rate": 3.1017872102764595e-05, "loss": 0.064, "step": 4450 }, { "epoch": 0.3141799902255114, "grad_norm": 0.08274181932210922, "learning_rate": 3.136693660988551e-05, "loss": 0.0502, "step": 4500 }, { "epoch": 0.3176708790057949, "grad_norm": 15.958111763000488, "learning_rate": 3.171600111700642e-05, "loss": 0.0384, "step": 4550 }, { "epoch": 0.32116176778607836, "grad_norm": 0.12025760114192963, "learning_rate": 3.206506562412734e-05, "loss": 0.0612, "step": 4600 }, { "epoch": 0.3246526565663618, "grad_norm": 1.3214194774627686, "learning_rate": 3.241413013124826e-05, "loss": 0.0438, "step": 4650 }, { "epoch": 0.32814354534664525, "grad_norm": 19.262493133544922, "learning_rate": 3.2763194638369174e-05, "loss": 0.0352, "step": 4700 }, { "epoch": 0.3316344341269287, "grad_norm": 0.8211079239845276, "learning_rate": 3.3112259145490085e-05, "loss": 0.0594, "step": 4750 }, { "epoch": 0.3351253229072122, "grad_norm": 7.5860419273376465, "learning_rate": 3.3461323652611e-05, "loss": 0.0508, "step": 4800 }, { "epoch": 0.3386162116874956, "grad_norm": 0.6498265266418457, "learning_rate": 3.381038815973192e-05, "loss": 0.0394, "step": 4850 }, { "epoch": 0.3421071004677791, "grad_norm": 16.379024505615234, "learning_rate": 3.4159452666852836e-05, "loss": 0.0495, "step": 4900 }, { "epoch": 0.34559798924806256, "grad_norm": 16.531225204467773, "learning_rate": 3.4508517173973754e-05, "loss": 0.0447, "step": 4950 }, { "epoch": 0.34908887802834604, "grad_norm": 0.04507113993167877, "learning_rate": 3.485758168109467e-05, "loss": 0.0462, "step": 5000 }, { "epoch": 0.34908887802834604, "eval_f1": 0.7487179487179487, "eval_loss": 0.7435426712036133, "eval_runtime": 24.3491, "eval_samples_per_second": 150.313, "eval_steps_per_second": 4.723, "step": 5000 }, { "epoch": 0.35257976680862946, "grad_norm": 2.0328619480133057, "learning_rate": 3.520664618821559e-05, "loss": 0.0417, "step": 5050 }, { "epoch": 0.35607065558891293, "grad_norm": 2.6427454948425293, "learning_rate": 3.55557106953365e-05, "loss": 0.0445, "step": 5100 }, { "epoch": 0.3595615443691964, "grad_norm": 1.2316179275512695, "learning_rate": 3.5904775202457416e-05, "loss": 0.0536, "step": 5150 }, { "epoch": 0.3630524331494799, "grad_norm": 6.056459426879883, "learning_rate": 3.625383970957833e-05, "loss": 0.0532, "step": 5200 }, { "epoch": 0.3665433219297633, "grad_norm": 0.5685946345329285, "learning_rate": 3.660290421669924e-05, "loss": 0.0296, "step": 5250 }, { "epoch": 0.3700342107100468, "grad_norm": 0.04925481602549553, "learning_rate": 3.695196872382016e-05, "loss": 0.0366, "step": 5300 }, { "epoch": 0.37352509949033025, "grad_norm": 2.920793294906616, "learning_rate": 3.730103323094108e-05, "loss": 0.0384, "step": 5350 }, { "epoch": 0.3770159882706137, "grad_norm": 0.09377721697092056, "learning_rate": 3.7650097738061995e-05, "loss": 0.0322, "step": 5400 }, { "epoch": 0.38050687705089714, "grad_norm": 0.10469520837068558, "learning_rate": 3.799916224518291e-05, "loss": 0.0327, "step": 5450 }, { "epoch": 0.3839977658311806, "grad_norm": 8.783834457397461, "learning_rate": 3.834822675230383e-05, "loss": 0.0356, "step": 5500 }, { "epoch": 0.3874886546114641, "grad_norm": 0.16647844016551971, "learning_rate": 3.8697291259424747e-05, "loss": 0.0437, "step": 5550 }, { "epoch": 0.39097954339174756, "grad_norm": 1.108203411102295, "learning_rate": 3.9046355766545664e-05, "loss": 0.0422, "step": 5600 }, { "epoch": 0.394470432172031, "grad_norm": 0.06744211167097092, "learning_rate": 3.939542027366658e-05, "loss": 0.0345, "step": 5650 }, { "epoch": 0.39796132095231446, "grad_norm": 6.879347801208496, "learning_rate": 3.974448478078749e-05, "loss": 0.0625, "step": 5700 }, { "epoch": 0.40145220973259793, "grad_norm": 0.05510378256440163, "learning_rate": 4.00935492879084e-05, "loss": 0.0279, "step": 5750 }, { "epoch": 0.4049430985128814, "grad_norm": 0.2473091334104538, "learning_rate": 4.044261379502932e-05, "loss": 0.0381, "step": 5800 }, { "epoch": 0.4084339872931648, "grad_norm": 0.3688828945159912, "learning_rate": 4.0791678302150236e-05, "loss": 0.0428, "step": 5850 }, { "epoch": 0.4119248760734483, "grad_norm": 40.18030548095703, "learning_rate": 4.1140742809271153e-05, "loss": 0.0411, "step": 5900 }, { "epoch": 0.4154157648537318, "grad_norm": 0.1600964218378067, "learning_rate": 4.148980731639207e-05, "loss": 0.0373, "step": 5950 }, { "epoch": 0.41890665363401525, "grad_norm": 0.1175314262509346, "learning_rate": 4.183887182351299e-05, "loss": 0.041, "step": 6000 }, { "epoch": 0.41890665363401525, "eval_f1": 0.7534340659340659, "eval_loss": 0.6432412266731262, "eval_runtime": 23.6996, "eval_samples_per_second": 154.433, "eval_steps_per_second": 4.852, "step": 6000 }, { "epoch": 0.42239754241429867, "grad_norm": 1.2376866340637207, "learning_rate": 4.2187936330633905e-05, "loss": 0.0336, "step": 6050 }, { "epoch": 0.42588843119458214, "grad_norm": 0.07168062031269073, "learning_rate": 4.253700083775482e-05, "loss": 0.0167, "step": 6100 }, { "epoch": 0.4293793199748656, "grad_norm": 0.06772568076848984, "learning_rate": 4.288606534487574e-05, "loss": 0.0614, "step": 6150 }, { "epoch": 0.4328702087551491, "grad_norm": 0.06727208942174911, "learning_rate": 4.323512985199665e-05, "loss": 0.0464, "step": 6200 }, { "epoch": 0.4363610975354325, "grad_norm": 0.1292935311794281, "learning_rate": 4.358419435911757e-05, "loss": 0.0546, "step": 6250 }, { "epoch": 0.439851986315716, "grad_norm": 9.874855995178223, "learning_rate": 4.393325886623848e-05, "loss": 0.06, "step": 6300 }, { "epoch": 0.44334287509599946, "grad_norm": 7.21466588973999, "learning_rate": 4.4282323373359395e-05, "loss": 0.0611, "step": 6350 }, { "epoch": 0.4468337638762829, "grad_norm": 16.634078979492188, "learning_rate": 4.463138788048031e-05, "loss": 0.0524, "step": 6400 }, { "epoch": 0.45032465265656635, "grad_norm": 0.3858453929424286, "learning_rate": 4.498045238760123e-05, "loss": 0.0444, "step": 6450 }, { "epoch": 0.4538155414368498, "grad_norm": 0.7371519804000854, "learning_rate": 4.532253560457973e-05, "loss": 0.0587, "step": 6500 }, { "epoch": 0.4573064302171333, "grad_norm": 0.24755489826202393, "learning_rate": 4.5671600111700645e-05, "loss": 0.055, "step": 6550 }, { "epoch": 0.4607973189974167, "grad_norm": 0.055525269359350204, "learning_rate": 4.602066461882156e-05, "loss": 0.0381, "step": 6600 }, { "epoch": 0.4642882077777002, "grad_norm": 0.21897675096988678, "learning_rate": 4.636972912594247e-05, "loss": 0.042, "step": 6650 }, { "epoch": 0.46777909655798366, "grad_norm": 0.34035515785217285, "learning_rate": 4.671879363306339e-05, "loss": 0.0369, "step": 6700 }, { "epoch": 0.47126998533826714, "grad_norm": 0.18382132053375244, "learning_rate": 4.706785814018431e-05, "loss": 0.0569, "step": 6750 }, { "epoch": 0.47476087411855056, "grad_norm": 0.040306150913238525, "learning_rate": 4.7416922647305225e-05, "loss": 0.0353, "step": 6800 }, { "epoch": 0.47825176289883403, "grad_norm": 6.798317909240723, "learning_rate": 4.776598715442614e-05, "loss": 0.0672, "step": 6850 }, { "epoch": 0.4817426516791175, "grad_norm": 0.12166763097047806, "learning_rate": 4.811505166154706e-05, "loss": 0.0291, "step": 6900 }, { "epoch": 0.485233540459401, "grad_norm": 0.11069754511117935, "learning_rate": 4.846411616866797e-05, "loss": 0.0451, "step": 6950 }, { "epoch": 0.4887244292396844, "grad_norm": 1.7939095497131348, "learning_rate": 4.881318067578889e-05, "loss": 0.0449, "step": 7000 }, { "epoch": 0.4887244292396844, "eval_f1": 0.8512851897184822, "eval_loss": 0.42608362436294556, "eval_runtime": 24.2404, "eval_samples_per_second": 150.987, "eval_steps_per_second": 4.744, "step": 7000 }, { "epoch": 0.4922153180199679, "grad_norm": 11.614912033081055, "learning_rate": 4.9162245182909804e-05, "loss": 0.0316, "step": 7050 }, { "epoch": 0.49570620680025135, "grad_norm": 0.2757989764213562, "learning_rate": 4.951130969003072e-05, "loss": 0.0491, "step": 7100 }, { "epoch": 0.4991970955805348, "grad_norm": 0.044196244329214096, "learning_rate": 4.986037419715164e-05, "loss": 0.0377, "step": 7150 }, { "epoch": 0.5026879843608183, "grad_norm": 1.8137636184692383, "learning_rate": 4.99767272275922e-05, "loss": 0.0446, "step": 7200 }, { "epoch": 0.5061788731411018, "grad_norm": 0.11980023980140686, "learning_rate": 4.9937939273579196e-05, "loss": 0.0507, "step": 7250 }, { "epoch": 0.5096697619213851, "grad_norm": 13.373326301574707, "learning_rate": 4.9899151319566196e-05, "loss": 0.051, "step": 7300 }, { "epoch": 0.5131606507016686, "grad_norm": 0.118846096098423, "learning_rate": 4.9860363365553196e-05, "loss": 0.0312, "step": 7350 }, { "epoch": 0.5166515394819521, "grad_norm": 0.17438098788261414, "learning_rate": 4.9821575411540196e-05, "loss": 0.0352, "step": 7400 }, { "epoch": 0.5201424282622356, "grad_norm": 0.04345332458615303, "learning_rate": 4.978278745752719e-05, "loss": 0.0423, "step": 7450 }, { "epoch": 0.523633317042519, "grad_norm": 0.9469249844551086, "learning_rate": 4.9743999503514196e-05, "loss": 0.0336, "step": 7500 }, { "epoch": 0.5271242058228025, "grad_norm": 0.14183463156223297, "learning_rate": 4.970521154950119e-05, "loss": 0.043, "step": 7550 }, { "epoch": 0.530615094603086, "grad_norm": 15.920584678649902, "learning_rate": 4.966642359548819e-05, "loss": 0.0447, "step": 7600 }, { "epoch": 0.5341059833833695, "grad_norm": 0.06846634298563004, "learning_rate": 4.962763564147518e-05, "loss": 0.0323, "step": 7650 }, { "epoch": 0.5375968721636528, "grad_norm": 0.02816701866686344, "learning_rate": 4.958884768746219e-05, "loss": 0.0547, "step": 7700 }, { "epoch": 0.5410877609439363, "grad_norm": 6.20529842376709, "learning_rate": 4.955005973344918e-05, "loss": 0.0415, "step": 7750 }, { "epoch": 0.5445786497242198, "grad_norm": 0.5864895582199097, "learning_rate": 4.951127177943618e-05, "loss": 0.0339, "step": 7800 }, { "epoch": 0.5480695385045032, "grad_norm": 0.13454869389533997, "learning_rate": 4.9472483825423176e-05, "loss": 0.0349, "step": 7850 }, { "epoch": 0.5515604272847867, "grad_norm": 0.06532914191484451, "learning_rate": 4.9433695871410176e-05, "loss": 0.0412, "step": 7900 }, { "epoch": 0.5550513160650702, "grad_norm": 0.11592119932174683, "learning_rate": 4.9394907917397176e-05, "loss": 0.0285, "step": 7950 }, { "epoch": 0.5585422048453537, "grad_norm": 0.1591779738664627, "learning_rate": 4.935611996338417e-05, "loss": 0.0537, "step": 8000 }, { "epoch": 0.5585422048453537, "eval_f1": 0.7704807041299933, "eval_loss": 0.5664303302764893, "eval_runtime": 23.2978, "eval_samples_per_second": 157.096, "eval_steps_per_second": 4.936, "step": 8000 }, { "epoch": 0.5620330936256371, "grad_norm": 0.6088069081306458, "learning_rate": 4.931733200937117e-05, "loss": 0.0362, "step": 8050 }, { "epoch": 0.5655239824059205, "grad_norm": 0.14050616323947906, "learning_rate": 4.927854405535817e-05, "loss": 0.0383, "step": 8100 }, { "epoch": 0.569014871186204, "grad_norm": 0.5788790583610535, "learning_rate": 4.923975610134517e-05, "loss": 0.0283, "step": 8150 }, { "epoch": 0.5725057599664874, "grad_norm": 0.0380774587392807, "learning_rate": 4.920096814733216e-05, "loss": 0.0211, "step": 8200 }, { "epoch": 0.5759966487467709, "grad_norm": 2.891254186630249, "learning_rate": 4.916218019331916e-05, "loss": 0.0512, "step": 8250 }, { "epoch": 0.5794875375270544, "grad_norm": 0.1362515538930893, "learning_rate": 4.9123392239306163e-05, "loss": 0.0363, "step": 8300 }, { "epoch": 0.5829784263073379, "grad_norm": 0.1054207980632782, "learning_rate": 4.9084604285293163e-05, "loss": 0.0278, "step": 8350 }, { "epoch": 0.5864693150876213, "grad_norm": 0.07413887977600098, "learning_rate": 4.904581633128016e-05, "loss": 0.0308, "step": 8400 }, { "epoch": 0.5899602038679048, "grad_norm": 0.11982708424329758, "learning_rate": 4.900702837726716e-05, "loss": 0.0397, "step": 8450 }, { "epoch": 0.5934510926481882, "grad_norm": 8.392777442932129, "learning_rate": 4.896824042325416e-05, "loss": 0.0469, "step": 8500 }, { "epoch": 0.5969419814284717, "grad_norm": 1.516817569732666, "learning_rate": 4.892945246924116e-05, "loss": 0.0384, "step": 8550 }, { "epoch": 0.6004328702087551, "grad_norm": 1.3836275339126587, "learning_rate": 4.889066451522815e-05, "loss": 0.0451, "step": 8600 }, { "epoch": 0.6039237589890386, "grad_norm": 15.225459098815918, "learning_rate": 4.885187656121516e-05, "loss": 0.0364, "step": 8650 }, { "epoch": 0.6074146477693221, "grad_norm": 4.840157985687256, "learning_rate": 4.881308860720215e-05, "loss": 0.0309, "step": 8700 }, { "epoch": 0.6109055365496056, "grad_norm": 0.05251197889447212, "learning_rate": 4.877430065318915e-05, "loss": 0.036, "step": 8750 }, { "epoch": 0.614396425329889, "grad_norm": 0.1797259896993637, "learning_rate": 4.8735512699176144e-05, "loss": 0.029, "step": 8800 }, { "epoch": 0.6178873141101725, "grad_norm": 0.033766426146030426, "learning_rate": 4.8696724745163144e-05, "loss": 0.0344, "step": 8850 }, { "epoch": 0.6213782028904559, "grad_norm": 0.5324440598487854, "learning_rate": 4.8657936791150144e-05, "loss": 0.0452, "step": 8900 }, { "epoch": 0.6248690916707393, "grad_norm": 7.406557083129883, "learning_rate": 4.861914883713714e-05, "loss": 0.0359, "step": 8950 }, { "epoch": 0.6283599804510228, "grad_norm": 0.05340263247489929, "learning_rate": 4.858036088312414e-05, "loss": 0.0222, "step": 9000 }, { "epoch": 0.6283599804510228, "eval_f1": 0.8430634023854363, "eval_loss": 0.4622673988342285, "eval_runtime": 24.6998, "eval_samples_per_second": 148.179, "eval_steps_per_second": 4.656, "step": 9000 }, { "epoch": 0.6318508692313063, "grad_norm": 5.009503364562988, "learning_rate": 4.854157292911114e-05, "loss": 0.0504, "step": 9050 }, { "epoch": 0.6353417580115898, "grad_norm": 3.2266767024993896, "learning_rate": 4.850278497509814e-05, "loss": 0.0406, "step": 9100 }, { "epoch": 0.6388326467918732, "grad_norm": 3.8187813758850098, "learning_rate": 4.846399702108513e-05, "loss": 0.0347, "step": 9150 }, { "epoch": 0.6423235355721567, "grad_norm": 0.10258898138999939, "learning_rate": 4.842520906707213e-05, "loss": 0.0447, "step": 9200 }, { "epoch": 0.6458144243524402, "grad_norm": 2.6401562690734863, "learning_rate": 4.838642111305913e-05, "loss": 0.0313, "step": 9250 }, { "epoch": 0.6493053131327235, "grad_norm": 2.760939359664917, "learning_rate": 4.834763315904613e-05, "loss": 0.0412, "step": 9300 }, { "epoch": 0.652796201913007, "grad_norm": 0.3721374571323395, "learning_rate": 4.8308845205033124e-05, "loss": 0.0526, "step": 9350 }, { "epoch": 0.6562870906932905, "grad_norm": 11.350992202758789, "learning_rate": 4.8270057251020124e-05, "loss": 0.0435, "step": 9400 }, { "epoch": 0.659777979473574, "grad_norm": 0.2358410507440567, "learning_rate": 4.8231269297007124e-05, "loss": 0.0332, "step": 9450 }, { "epoch": 0.6632688682538574, "grad_norm": 0.27583521604537964, "learning_rate": 4.8192481342994125e-05, "loss": 0.0219, "step": 9500 }, { "epoch": 0.6667597570341409, "grad_norm": 0.19773100316524506, "learning_rate": 4.815369338898112e-05, "loss": 0.0365, "step": 9550 }, { "epoch": 0.6702506458144244, "grad_norm": 0.3891470730304718, "learning_rate": 4.811490543496812e-05, "loss": 0.04, "step": 9600 }, { "epoch": 0.6737415345947078, "grad_norm": 0.1269948035478592, "learning_rate": 4.807611748095512e-05, "loss": 0.0199, "step": 9650 }, { "epoch": 0.6772324233749912, "grad_norm": 1.7669986486434937, "learning_rate": 4.803732952694212e-05, "loss": 0.0342, "step": 9700 }, { "epoch": 0.6807233121552747, "grad_norm": 0.2495778650045395, "learning_rate": 4.799854157292911e-05, "loss": 0.0324, "step": 9750 }, { "epoch": 0.6842142009355582, "grad_norm": 0.7094061970710754, "learning_rate": 4.795975361891611e-05, "loss": 0.0268, "step": 9800 }, { "epoch": 0.6877050897158417, "grad_norm": 0.10349750518798828, "learning_rate": 4.792096566490311e-05, "loss": 0.0197, "step": 9850 }, { "epoch": 0.6911959784961251, "grad_norm": 0.08370551466941833, "learning_rate": 4.7882177710890105e-05, "loss": 0.026, "step": 9900 }, { "epoch": 0.6946868672764086, "grad_norm": 0.6155123114585876, "learning_rate": 4.7843389756877105e-05, "loss": 0.0354, "step": 9950 }, { "epoch": 0.6981777560566921, "grad_norm": 0.2413477748632431, "learning_rate": 4.7804601802864105e-05, "loss": 0.0372, "step": 10000 }, { "epoch": 0.6981777560566921, "eval_f1": 0.8886486486486487, "eval_loss": 0.30088678002357483, "eval_runtime": 26.0507, "eval_samples_per_second": 140.495, "eval_steps_per_second": 4.414, "step": 10000 }, { "epoch": 0.7016686448369754, "grad_norm": 0.028105752542614937, "learning_rate": 4.7765813848851105e-05, "loss": 0.021, "step": 10050 }, { "epoch": 0.7051595336172589, "grad_norm": 0.44947728514671326, "learning_rate": 4.77270258948381e-05, "loss": 0.0436, "step": 10100 }, { "epoch": 0.7086504223975424, "grad_norm": 0.13265953958034515, "learning_rate": 4.76882379408251e-05, "loss": 0.0224, "step": 10150 }, { "epoch": 0.7121413111778259, "grad_norm": 0.060906197875738144, "learning_rate": 4.76494499868121e-05, "loss": 0.0419, "step": 10200 }, { "epoch": 0.7156321999581093, "grad_norm": 3.1417033672332764, "learning_rate": 4.76106620327991e-05, "loss": 0.0324, "step": 10250 }, { "epoch": 0.7191230887383928, "grad_norm": 8.964985847473145, "learning_rate": 4.757187407878609e-05, "loss": 0.0326, "step": 10300 }, { "epoch": 0.7226139775186763, "grad_norm": 0.026810016483068466, "learning_rate": 4.753308612477309e-05, "loss": 0.034, "step": 10350 }, { "epoch": 0.7261048662989598, "grad_norm": 2.3609349727630615, "learning_rate": 4.749429817076009e-05, "loss": 0.036, "step": 10400 }, { "epoch": 0.7295957550792431, "grad_norm": 0.163139209151268, "learning_rate": 4.745551021674709e-05, "loss": 0.0251, "step": 10450 }, { "epoch": 0.7330866438595266, "grad_norm": 10.265725135803223, "learning_rate": 4.7416722262734085e-05, "loss": 0.0198, "step": 10500 }, { "epoch": 0.7365775326398101, "grad_norm": 0.017616290599107742, "learning_rate": 4.7377934308721085e-05, "loss": 0.0321, "step": 10550 }, { "epoch": 0.7400684214200935, "grad_norm": 4.75632381439209, "learning_rate": 4.7339146354708086e-05, "loss": 0.0406, "step": 10600 }, { "epoch": 0.743559310200377, "grad_norm": 4.056220531463623, "learning_rate": 4.7300358400695086e-05, "loss": 0.0268, "step": 10650 }, { "epoch": 0.7470501989806605, "grad_norm": 0.041627198457717896, "learning_rate": 4.726157044668208e-05, "loss": 0.0355, "step": 10700 }, { "epoch": 0.750541087760944, "grad_norm": 0.05310547724366188, "learning_rate": 4.722278249266908e-05, "loss": 0.0298, "step": 10750 }, { "epoch": 0.7540319765412274, "grad_norm": 3.478668451309204, "learning_rate": 4.718399453865608e-05, "loss": 0.0469, "step": 10800 }, { "epoch": 0.7575228653215108, "grad_norm": 0.046948205679655075, "learning_rate": 4.714520658464307e-05, "loss": 0.0217, "step": 10850 }, { "epoch": 0.7610137541017943, "grad_norm": 1.857437252998352, "learning_rate": 4.710641863063007e-05, "loss": 0.0452, "step": 10900 }, { "epoch": 0.7645046428820778, "grad_norm": 0.5096798539161682, "learning_rate": 4.706763067661707e-05, "loss": 0.0448, "step": 10950 }, { "epoch": 0.7679955316623612, "grad_norm": 0.05577833950519562, "learning_rate": 4.702884272260407e-05, "loss": 0.0236, "step": 11000 }, { "epoch": 0.7679955316623612, "eval_f1": 0.7422608695652174, "eval_loss": 0.6054657101631165, "eval_runtime": 18.0002, "eval_samples_per_second": 203.331, "eval_steps_per_second": 6.389, "step": 11000 }, { "epoch": 0.7714864204426447, "grad_norm": 0.32365405559539795, "learning_rate": 4.6990054768591066e-05, "loss": 0.0275, "step": 11050 }, { "epoch": 0.7749773092229282, "grad_norm": 0.054900918155908585, "learning_rate": 4.6951266814578066e-05, "loss": 0.0428, "step": 11100 }, { "epoch": 0.7784681980032117, "grad_norm": 0.1931043118238449, "learning_rate": 4.6912478860565066e-05, "loss": 0.0385, "step": 11150 }, { "epoch": 0.7819590867834951, "grad_norm": 11.70685863494873, "learning_rate": 4.6873690906552066e-05, "loss": 0.0386, "step": 11200 }, { "epoch": 0.7854499755637785, "grad_norm": 1.1998494863510132, "learning_rate": 4.683490295253906e-05, "loss": 0.0412, "step": 11250 }, { "epoch": 0.788940864344062, "grad_norm": 4.7984161376953125, "learning_rate": 4.679611499852606e-05, "loss": 0.0451, "step": 11300 }, { "epoch": 0.7924317531243454, "grad_norm": 0.10824266076087952, "learning_rate": 4.675732704451306e-05, "loss": 0.027, "step": 11350 }, { "epoch": 0.7959226419046289, "grad_norm": 11.501607894897461, "learning_rate": 4.671853909050006e-05, "loss": 0.0311, "step": 11400 }, { "epoch": 0.7994135306849124, "grad_norm": 4.8870978355407715, "learning_rate": 4.667975113648705e-05, "loss": 0.0382, "step": 11450 }, { "epoch": 0.8029044194651959, "grad_norm": 0.020871387794613838, "learning_rate": 4.664096318247405e-05, "loss": 0.0288, "step": 11500 }, { "epoch": 0.8063953082454793, "grad_norm": 1.9301238059997559, "learning_rate": 4.660217522846105e-05, "loss": 0.0388, "step": 11550 }, { "epoch": 0.8098861970257628, "grad_norm": 0.012374644167721272, "learning_rate": 4.656338727444805e-05, "loss": 0.0188, "step": 11600 }, { "epoch": 0.8133770858060462, "grad_norm": 0.08517163246870041, "learning_rate": 4.6524599320435046e-05, "loss": 0.0359, "step": 11650 }, { "epoch": 0.8168679745863296, "grad_norm": 0.027777288109064102, "learning_rate": 4.6485811366422046e-05, "loss": 0.0351, "step": 11700 }, { "epoch": 0.8203588633666131, "grad_norm": 4.146106719970703, "learning_rate": 4.6447023412409047e-05, "loss": 0.0286, "step": 11750 }, { "epoch": 0.8238497521468966, "grad_norm": 0.18585865199565887, "learning_rate": 4.640823545839604e-05, "loss": 0.0196, "step": 11800 }, { "epoch": 0.8273406409271801, "grad_norm": 0.03932729363441467, "learning_rate": 4.636944750438304e-05, "loss": 0.0205, "step": 11850 }, { "epoch": 0.8308315297074635, "grad_norm": 0.49756330251693726, "learning_rate": 4.633065955037004e-05, "loss": 0.0411, "step": 11900 }, { "epoch": 0.834322418487747, "grad_norm": 1.2074888944625854, "learning_rate": 4.629187159635704e-05, "loss": 0.0204, "step": 11950 }, { "epoch": 0.8378133072680305, "grad_norm": 0.08632475137710571, "learning_rate": 4.6253083642344033e-05, "loss": 0.0293, "step": 12000 }, { "epoch": 0.8378133072680305, "eval_f1": 0.8149117069980379, "eval_loss": 0.44084152579307556, "eval_runtime": 18.2839, "eval_samples_per_second": 200.176, "eval_steps_per_second": 6.29, "step": 12000 }, { "epoch": 0.8413041960483139, "grad_norm": 0.5310344696044922, "learning_rate": 4.6214295688331033e-05, "loss": 0.0364, "step": 12050 }, { "epoch": 0.8447950848285973, "grad_norm": 2.3562138080596924, "learning_rate": 4.617550773431803e-05, "loss": 0.0438, "step": 12100 }, { "epoch": 0.8482859736088808, "grad_norm": 3.931169271469116, "learning_rate": 4.6136719780305034e-05, "loss": 0.0181, "step": 12150 }, { "epoch": 0.8517768623891643, "grad_norm": 0.08786332607269287, "learning_rate": 4.609793182629203e-05, "loss": 0.0256, "step": 12200 }, { "epoch": 0.8552677511694478, "grad_norm": 0.5437902212142944, "learning_rate": 4.605914387227903e-05, "loss": 0.0353, "step": 12250 }, { "epoch": 0.8587586399497312, "grad_norm": 0.21293994784355164, "learning_rate": 4.602035591826603e-05, "loss": 0.0301, "step": 12300 }, { "epoch": 0.8622495287300147, "grad_norm": 0.2317512184381485, "learning_rate": 4.598156796425303e-05, "loss": 0.037, "step": 12350 }, { "epoch": 0.8657404175102982, "grad_norm": 4.126286029815674, "learning_rate": 4.594278001024002e-05, "loss": 0.0303, "step": 12400 }, { "epoch": 0.8692313062905815, "grad_norm": 0.08132177591323853, "learning_rate": 4.590399205622702e-05, "loss": 0.0292, "step": 12450 }, { "epoch": 0.872722195070865, "grad_norm": 4.297754764556885, "learning_rate": 4.586520410221402e-05, "loss": 0.0493, "step": 12500 }, { "epoch": 0.8762130838511485, "grad_norm": 6.928088188171387, "learning_rate": 4.582641614820102e-05, "loss": 0.0347, "step": 12550 }, { "epoch": 0.879703972631432, "grad_norm": 0.3550567924976349, "learning_rate": 4.5787628194188014e-05, "loss": 0.0265, "step": 12600 }, { "epoch": 0.8831948614117154, "grad_norm": 1.2661274671554565, "learning_rate": 4.5748840240175014e-05, "loss": 0.0343, "step": 12650 }, { "epoch": 0.8866857501919989, "grad_norm": 5.1844258308410645, "learning_rate": 4.5710052286162014e-05, "loss": 0.0209, "step": 12700 }, { "epoch": 0.8901766389722824, "grad_norm": 0.31868308782577515, "learning_rate": 4.5671264332149014e-05, "loss": 0.0238, "step": 12750 }, { "epoch": 0.8936675277525657, "grad_norm": 0.03575870022177696, "learning_rate": 4.563247637813601e-05, "loss": 0.0369, "step": 12800 }, { "epoch": 0.8971584165328492, "grad_norm": 1.8217113018035889, "learning_rate": 4.559368842412301e-05, "loss": 0.0298, "step": 12850 }, { "epoch": 0.9006493053131327, "grad_norm": 0.02744445390999317, "learning_rate": 4.555490047011001e-05, "loss": 0.0205, "step": 12900 }, { "epoch": 0.9041401940934162, "grad_norm": 0.18486034870147705, "learning_rate": 4.5516112516097e-05, "loss": 0.0399, "step": 12950 }, { "epoch": 0.9076310828736996, "grad_norm": 0.38794612884521484, "learning_rate": 4.5477324562084e-05, "loss": 0.0345, "step": 13000 }, { "epoch": 0.9076310828736996, "eval_f1": 0.6745126884884148, "eval_loss": 0.701119065284729, "eval_runtime": 19.4257, "eval_samples_per_second": 188.411, "eval_steps_per_second": 5.92, "step": 13000 }, { "epoch": 0.9111219716539831, "grad_norm": 0.06368507444858551, "learning_rate": 4.5438536608070994e-05, "loss": 0.0474, "step": 13050 }, { "epoch": 0.9146128604342666, "grad_norm": 1.3088841438293457, "learning_rate": 4.5399748654058e-05, "loss": 0.0306, "step": 13100 }, { "epoch": 0.9181037492145501, "grad_norm": 0.21806786954402924, "learning_rate": 4.5360960700044994e-05, "loss": 0.0299, "step": 13150 }, { "epoch": 0.9215946379948334, "grad_norm": 0.05096457153558731, "learning_rate": 4.5322172746031995e-05, "loss": 0.0261, "step": 13200 }, { "epoch": 0.9250855267751169, "grad_norm": 0.14264649152755737, "learning_rate": 4.528338479201899e-05, "loss": 0.0426, "step": 13250 }, { "epoch": 0.9285764155554004, "grad_norm": 0.4436454474925995, "learning_rate": 4.5244596838005995e-05, "loss": 0.0284, "step": 13300 }, { "epoch": 0.9320673043356839, "grad_norm": 2.399491786956787, "learning_rate": 4.520580888399299e-05, "loss": 0.0336, "step": 13350 }, { "epoch": 0.9355581931159673, "grad_norm": 0.05171108618378639, "learning_rate": 4.516702092997999e-05, "loss": 0.0151, "step": 13400 }, { "epoch": 0.9390490818962508, "grad_norm": 0.022188086062669754, "learning_rate": 4.512823297596699e-05, "loss": 0.0235, "step": 13450 }, { "epoch": 0.9425399706765343, "grad_norm": 8.833630561828613, "learning_rate": 4.508944502195399e-05, "loss": 0.0405, "step": 13500 }, { "epoch": 0.9460308594568178, "grad_norm": 0.18842175602912903, "learning_rate": 4.505065706794098e-05, "loss": 0.0237, "step": 13550 }, { "epoch": 0.9495217482371011, "grad_norm": 0.501763105392456, "learning_rate": 4.501186911392798e-05, "loss": 0.0247, "step": 13600 }, { "epoch": 0.9530126370173846, "grad_norm": 0.012047139927744865, "learning_rate": 4.497308115991498e-05, "loss": 0.0208, "step": 13650 }, { "epoch": 0.9565035257976681, "grad_norm": 0.04417157545685768, "learning_rate": 4.493429320590198e-05, "loss": 0.0314, "step": 13700 }, { "epoch": 0.9599944145779515, "grad_norm": 0.29428571462631226, "learning_rate": 4.4895505251888975e-05, "loss": 0.0279, "step": 13750 }, { "epoch": 0.963485303358235, "grad_norm": 6.524214744567871, "learning_rate": 4.4856717297875975e-05, "loss": 0.0225, "step": 13800 }, { "epoch": 0.9669761921385185, "grad_norm": 0.15136292576789856, "learning_rate": 4.4817929343862975e-05, "loss": 0.0158, "step": 13850 }, { "epoch": 0.970467080918802, "grad_norm": 0.1073521077632904, "learning_rate": 4.477914138984997e-05, "loss": 0.0219, "step": 13900 }, { "epoch": 0.9739579696990854, "grad_norm": 0.04597226157784462, "learning_rate": 4.474035343583697e-05, "loss": 0.0271, "step": 13950 }, { "epoch": 0.9774488584793688, "grad_norm": 0.5995834469795227, "learning_rate": 4.470156548182396e-05, "loss": 0.0655, "step": 14000 }, { "epoch": 0.9774488584793688, "eval_f1": 0.8513046211883055, "eval_loss": 0.3484158217906952, "eval_runtime": 16.0623, "eval_samples_per_second": 227.863, "eval_steps_per_second": 7.16, "step": 14000 }, { "epoch": 0.9809397472596523, "grad_norm": 0.03915255144238472, "learning_rate": 4.466277752781097e-05, "loss": 0.0194, "step": 14050 }, { "epoch": 0.9844306360399357, "grad_norm": 5.527812957763672, "learning_rate": 4.462398957379796e-05, "loss": 0.0187, "step": 14100 }, { "epoch": 0.9879215248202192, "grad_norm": 0.04913317784667015, "learning_rate": 4.458520161978496e-05, "loss": 0.0282, "step": 14150 }, { "epoch": 0.9914124136005027, "grad_norm": 0.11622585356235504, "learning_rate": 4.4546413665771955e-05, "loss": 0.0372, "step": 14200 }, { "epoch": 0.9949033023807862, "grad_norm": 16.156946182250977, "learning_rate": 4.450762571175896e-05, "loss": 0.0193, "step": 14250 }, { "epoch": 0.9983941911610696, "grad_norm": 0.05035267770290375, "learning_rate": 4.4468837757745955e-05, "loss": 0.0273, "step": 14300 }, { "epoch": 1.001885079941353, "grad_norm": 1.6708167791366577, "learning_rate": 4.4430049803732956e-05, "loss": 0.0203, "step": 14350 }, { "epoch": 1.0053759687216366, "grad_norm": 0.09074031561613083, "learning_rate": 4.439126184971995e-05, "loss": 0.0338, "step": 14400 }, { "epoch": 1.00886685750192, "grad_norm": 3.624009609222412, "learning_rate": 4.4352473895706956e-05, "loss": 0.0455, "step": 14450 }, { "epoch": 1.0123577462822035, "grad_norm": 0.1196989193558693, "learning_rate": 4.431368594169395e-05, "loss": 0.0419, "step": 14500 }, { "epoch": 1.015848635062487, "grad_norm": 0.21538716554641724, "learning_rate": 4.427489798768095e-05, "loss": 0.0332, "step": 14550 }, { "epoch": 1.0193395238427703, "grad_norm": 0.029338959604501724, "learning_rate": 4.423611003366794e-05, "loss": 0.0266, "step": 14600 }, { "epoch": 1.0228304126230539, "grad_norm": 0.02669925056397915, "learning_rate": 4.419732207965495e-05, "loss": 0.0321, "step": 14650 }, { "epoch": 1.0263213014033372, "grad_norm": 0.6535504460334778, "learning_rate": 4.415853412564194e-05, "loss": 0.0242, "step": 14700 }, { "epoch": 1.0298121901836208, "grad_norm": 0.07374177873134613, "learning_rate": 4.41205219307092e-05, "loss": 0.0381, "step": 14750 }, { "epoch": 1.0333030789639042, "grad_norm": 0.09146462380886078, "learning_rate": 4.40817339766962e-05, "loss": 0.0312, "step": 14800 }, { "epoch": 1.0367939677441877, "grad_norm": 0.1045895665884018, "learning_rate": 4.4042946022683194e-05, "loss": 0.0284, "step": 14850 }, { "epoch": 1.0402848565244711, "grad_norm": 0.008818292059004307, "learning_rate": 4.40041580686702e-05, "loss": 0.0116, "step": 14900 }, { "epoch": 1.0437757453047545, "grad_norm": 0.29010212421417236, "learning_rate": 4.3965370114657194e-05, "loss": 0.0395, "step": 14950 }, { "epoch": 1.047266634085038, "grad_norm": 0.12948091328144073, "learning_rate": 4.3926582160644194e-05, "loss": 0.0264, "step": 15000 }, { "epoch": 1.047266634085038, "eval_f1": 0.8383579217447081, "eval_loss": 0.4777594804763794, "eval_runtime": 16.0185, "eval_samples_per_second": 228.486, "eval_steps_per_second": 7.179, "step": 15000 }, { "epoch": 1.0507575228653214, "grad_norm": 0.032540276646614075, "learning_rate": 4.388779420663119e-05, "loss": 0.0378, "step": 15050 }, { "epoch": 1.054248411645605, "grad_norm": 0.2068353295326233, "learning_rate": 4.3849006252618194e-05, "loss": 0.0303, "step": 15100 }, { "epoch": 1.0577393004258884, "grad_norm": 0.1535184383392334, "learning_rate": 4.381021829860519e-05, "loss": 0.0143, "step": 15150 }, { "epoch": 1.061230189206172, "grad_norm": 0.21458381414413452, "learning_rate": 4.377143034459219e-05, "loss": 0.0464, "step": 15200 }, { "epoch": 1.0647210779864553, "grad_norm": 0.054905761033296585, "learning_rate": 4.373264239057918e-05, "loss": 0.0257, "step": 15250 }, { "epoch": 1.068211966766739, "grad_norm": 2.5384442806243896, "learning_rate": 4.369385443656618e-05, "loss": 0.0392, "step": 15300 }, { "epoch": 1.0717028555470223, "grad_norm": 1.0916022062301636, "learning_rate": 4.365506648255318e-05, "loss": 0.0273, "step": 15350 }, { "epoch": 1.0751937443273056, "grad_norm": 0.11858826130628586, "learning_rate": 4.3616278528540174e-05, "loss": 0.0271, "step": 15400 }, { "epoch": 1.0786846331075892, "grad_norm": 0.030543260276317596, "learning_rate": 4.3577490574527175e-05, "loss": 0.0331, "step": 15450 }, { "epoch": 1.0821755218878726, "grad_norm": 0.08327992260456085, "learning_rate": 4.3538702620514175e-05, "loss": 0.0375, "step": 15500 }, { "epoch": 1.0856664106681562, "grad_norm": 6.117712497711182, "learning_rate": 4.3499914666501175e-05, "loss": 0.037, "step": 15550 }, { "epoch": 1.0891572994484395, "grad_norm": 0.12089100480079651, "learning_rate": 4.346112671248817e-05, "loss": 0.0237, "step": 15600 }, { "epoch": 1.0926481882287231, "grad_norm": 1.5640662908554077, "learning_rate": 4.342233875847517e-05, "loss": 0.0227, "step": 15650 }, { "epoch": 1.0961390770090065, "grad_norm": 0.04615291208028793, "learning_rate": 4.338355080446217e-05, "loss": 0.0233, "step": 15700 }, { "epoch": 1.09962996578929, "grad_norm": 0.047694265842437744, "learning_rate": 4.334476285044917e-05, "loss": 0.0424, "step": 15750 }, { "epoch": 1.1031208545695734, "grad_norm": 0.022557301446795464, "learning_rate": 4.330597489643616e-05, "loss": 0.028, "step": 15800 }, { "epoch": 1.1066117433498568, "grad_norm": 0.41174036264419556, "learning_rate": 4.326718694242317e-05, "loss": 0.0423, "step": 15850 }, { "epoch": 1.1101026321301404, "grad_norm": 1.035745620727539, "learning_rate": 4.322839898841016e-05, "loss": 0.0158, "step": 15900 }, { "epoch": 1.1135935209104237, "grad_norm": 0.017003023996949196, "learning_rate": 4.318961103439716e-05, "loss": 0.0278, "step": 15950 }, { "epoch": 1.1170844096907073, "grad_norm": 0.030238742008805275, "learning_rate": 4.3150823080384155e-05, "loss": 0.0197, "step": 16000 }, { "epoch": 1.1170844096907073, "eval_f1": 0.8951467944877172, "eval_loss": 0.28839099407196045, "eval_runtime": 16.2773, "eval_samples_per_second": 224.853, "eval_steps_per_second": 7.065, "step": 16000 }, { "epoch": 1.1205752984709907, "grad_norm": 0.06707513332366943, "learning_rate": 4.311203512637116e-05, "loss": 0.0268, "step": 16050 }, { "epoch": 1.1240661872512743, "grad_norm": 0.29435157775878906, "learning_rate": 4.3073247172358155e-05, "loss": 0.0389, "step": 16100 }, { "epoch": 1.1275570760315576, "grad_norm": 0.4163244664669037, "learning_rate": 4.3034459218345155e-05, "loss": 0.0149, "step": 16150 }, { "epoch": 1.131047964811841, "grad_norm": 0.09609521925449371, "learning_rate": 4.299567126433215e-05, "loss": 0.0231, "step": 16200 }, { "epoch": 1.1345388535921246, "grad_norm": 1.1996419429779053, "learning_rate": 4.295688331031915e-05, "loss": 0.0279, "step": 16250 }, { "epoch": 1.138029742372408, "grad_norm": 0.03178354725241661, "learning_rate": 4.291809535630615e-05, "loss": 0.0396, "step": 16300 }, { "epoch": 1.1415206311526915, "grad_norm": 1.533177137374878, "learning_rate": 4.287930740229314e-05, "loss": 0.0274, "step": 16350 }, { "epoch": 1.145011519932975, "grad_norm": 0.05997175723314285, "learning_rate": 4.284051944828014e-05, "loss": 0.0196, "step": 16400 }, { "epoch": 1.1485024087132585, "grad_norm": 0.022537073120474815, "learning_rate": 4.280173149426714e-05, "loss": 0.0142, "step": 16450 }, { "epoch": 1.1519932974935418, "grad_norm": 0.022068103775382042, "learning_rate": 4.276294354025414e-05, "loss": 0.0226, "step": 16500 }, { "epoch": 1.1554841862738252, "grad_norm": 0.07768739014863968, "learning_rate": 4.2724155586241135e-05, "loss": 0.0323, "step": 16550 }, { "epoch": 1.1589750750541088, "grad_norm": 0.01296279113739729, "learning_rate": 4.2685367632228136e-05, "loss": 0.0253, "step": 16600 }, { "epoch": 1.1624659638343922, "grad_norm": 3.2529516220092773, "learning_rate": 4.2646579678215136e-05, "loss": 0.0266, "step": 16650 }, { "epoch": 1.1659568526146757, "grad_norm": 0.11319205164909363, "learning_rate": 4.2607791724202136e-05, "loss": 0.0225, "step": 16700 }, { "epoch": 1.169447741394959, "grad_norm": 0.36928442120552063, "learning_rate": 4.256900377018913e-05, "loss": 0.0373, "step": 16750 }, { "epoch": 1.1729386301752427, "grad_norm": 1.703551173210144, "learning_rate": 4.253021581617613e-05, "loss": 0.0229, "step": 16800 }, { "epoch": 1.176429518955526, "grad_norm": 4.041821479797363, "learning_rate": 4.249142786216313e-05, "loss": 0.0345, "step": 16850 }, { "epoch": 1.1799204077358096, "grad_norm": 0.21636897325515747, "learning_rate": 4.245263990815013e-05, "loss": 0.0255, "step": 16900 }, { "epoch": 1.183411296516093, "grad_norm": 5.224414348602295, "learning_rate": 4.241385195413712e-05, "loss": 0.0282, "step": 16950 }, { "epoch": 1.1869021852963764, "grad_norm": 2.6779046058654785, "learning_rate": 4.237506400012413e-05, "loss": 0.0327, "step": 17000 }, { "epoch": 1.1869021852963764, "eval_f1": 0.9023038786818315, "eval_loss": 0.2744286060333252, "eval_runtime": 15.7046, "eval_samples_per_second": 233.053, "eval_steps_per_second": 7.323, "step": 17000 }, { "epoch": 1.19039307407666, "grad_norm": 8.236310005187988, "learning_rate": 4.233627604611112e-05, "loss": 0.0313, "step": 17050 }, { "epoch": 1.1938839628569433, "grad_norm": 0.2507387101650238, "learning_rate": 4.229903961025864e-05, "loss": 0.0335, "step": 17100 }, { "epoch": 1.197374851637227, "grad_norm": 0.040820781141519547, "learning_rate": 4.226025165624564e-05, "loss": 0.0291, "step": 17150 }, { "epoch": 1.2008657404175103, "grad_norm": 0.06928090006113052, "learning_rate": 4.222146370223264e-05, "loss": 0.0247, "step": 17200 }, { "epoch": 1.2043566291977938, "grad_norm": 3.9058620929718018, "learning_rate": 4.218267574821964e-05, "loss": 0.0215, "step": 17250 }, { "epoch": 1.2078475179780772, "grad_norm": 0.13741998374462128, "learning_rate": 4.214388779420663e-05, "loss": 0.0164, "step": 17300 }, { "epoch": 1.2113384067583608, "grad_norm": 1.5019398927688599, "learning_rate": 4.210509984019363e-05, "loss": 0.0435, "step": 17350 }, { "epoch": 1.2148292955386442, "grad_norm": 6.695947647094727, "learning_rate": 4.206631188618063e-05, "loss": 0.0477, "step": 17400 }, { "epoch": 1.2183201843189275, "grad_norm": 0.024964775890111923, "learning_rate": 4.2027523932167626e-05, "loss": 0.0418, "step": 17450 }, { "epoch": 1.221811073099211, "grad_norm": 3.555797576904297, "learning_rate": 4.1988735978154626e-05, "loss": 0.0201, "step": 17500 }, { "epoch": 1.2253019618794945, "grad_norm": 0.15483196079730988, "learning_rate": 4.194994802414162e-05, "loss": 0.034, "step": 17550 }, { "epoch": 1.228792850659778, "grad_norm": 0.2912302017211914, "learning_rate": 4.1911160070128626e-05, "loss": 0.0295, "step": 17600 }, { "epoch": 1.2322837394400614, "grad_norm": 0.038849666714668274, "learning_rate": 4.187237211611562e-05, "loss": 0.0141, "step": 17650 }, { "epoch": 1.2357746282203448, "grad_norm": 0.03439799323678017, "learning_rate": 4.183358416210262e-05, "loss": 0.0345, "step": 17700 }, { "epoch": 1.2392655170006284, "grad_norm": 1.7664055824279785, "learning_rate": 4.179479620808961e-05, "loss": 0.0299, "step": 17750 }, { "epoch": 1.2427564057809117, "grad_norm": 5.20842981338501, "learning_rate": 4.175600825407662e-05, "loss": 0.0381, "step": 17800 }, { "epoch": 1.2462472945611953, "grad_norm": 1.018599510192871, "learning_rate": 4.171722030006361e-05, "loss": 0.0165, "step": 17850 }, { "epoch": 1.2497381833414787, "grad_norm": 3.1473236083984375, "learning_rate": 4.167843234605061e-05, "loss": 0.0236, "step": 17900 }, { "epoch": 1.2532290721217623, "grad_norm": 0.1991962492465973, "learning_rate": 4.163964439203761e-05, "loss": 0.0259, "step": 17950 }, { "epoch": 1.2567199609020456, "grad_norm": 3.431162118911743, "learning_rate": 4.160085643802461e-05, "loss": 0.0314, "step": 18000 }, { "epoch": 1.2567199609020456, "eval_f1": 0.8523869346733668, "eval_loss": 0.31631308794021606, "eval_runtime": 28.0145, "eval_samples_per_second": 130.646, "eval_steps_per_second": 4.105, "step": 18000 }, { "epoch": 1.2602108496823292, "grad_norm": 0.14350277185440063, "learning_rate": 4.1562068484011606e-05, "loss": 0.0316, "step": 18050 }, { "epoch": 1.2637017384626126, "grad_norm": 0.08211163431406021, "learning_rate": 4.1523280529998606e-05, "loss": 0.0288, "step": 18100 }, { "epoch": 1.267192627242896, "grad_norm": 3.4511756896972656, "learning_rate": 4.1484492575985606e-05, "loss": 0.0314, "step": 18150 }, { "epoch": 1.2706835160231795, "grad_norm": 0.03046209178864956, "learning_rate": 4.1445704621972606e-05, "loss": 0.0473, "step": 18200 }, { "epoch": 1.274174404803463, "grad_norm": 0.046366989612579346, "learning_rate": 4.14069166679596e-05, "loss": 0.0242, "step": 18250 }, { "epoch": 1.2776652935837465, "grad_norm": 0.03393428400158882, "learning_rate": 4.13681287139466e-05, "loss": 0.0106, "step": 18300 }, { "epoch": 1.2811561823640298, "grad_norm": 0.17253169417381287, "learning_rate": 4.13293407599336e-05, "loss": 0.0287, "step": 18350 }, { "epoch": 1.2846470711443134, "grad_norm": 2.6994237899780273, "learning_rate": 4.129055280592059e-05, "loss": 0.0269, "step": 18400 }, { "epoch": 1.2881379599245968, "grad_norm": 0.04223485663533211, "learning_rate": 4.125176485190759e-05, "loss": 0.0188, "step": 18450 }, { "epoch": 1.2916288487048804, "grad_norm": 0.0406014584004879, "learning_rate": 4.1212976897894587e-05, "loss": 0.0227, "step": 18500 }, { "epoch": 1.2951197374851637, "grad_norm": 4.774007797241211, "learning_rate": 4.1174188943881593e-05, "loss": 0.0263, "step": 18550 }, { "epoch": 1.298610626265447, "grad_norm": 7.6998677253723145, "learning_rate": 4.113540098986859e-05, "loss": 0.0469, "step": 18600 }, { "epoch": 1.3021015150457307, "grad_norm": 0.6437110304832458, "learning_rate": 4.109661303585559e-05, "loss": 0.0195, "step": 18650 }, { "epoch": 1.305592403826014, "grad_norm": 2.180750608444214, "learning_rate": 4.105782508184258e-05, "loss": 0.0281, "step": 18700 }, { "epoch": 1.3090832926062976, "grad_norm": 0.19855546951293945, "learning_rate": 4.101903712782959e-05, "loss": 0.0341, "step": 18750 }, { "epoch": 1.312574181386581, "grad_norm": 0.2433316558599472, "learning_rate": 4.098024917381658e-05, "loss": 0.0217, "step": 18800 }, { "epoch": 1.3160650701668644, "grad_norm": 5.1525163650512695, "learning_rate": 4.094146121980358e-05, "loss": 0.0245, "step": 18850 }, { "epoch": 1.319555958947148, "grad_norm": 4.688765525817871, "learning_rate": 4.0902673265790574e-05, "loss": 0.0361, "step": 18900 }, { "epoch": 1.3230468477274315, "grad_norm": 0.07672595232725143, "learning_rate": 4.086388531177758e-05, "loss": 0.0383, "step": 18950 }, { "epoch": 1.326537736507715, "grad_norm": 1.0029828548431396, "learning_rate": 4.0825097357764574e-05, "loss": 0.0172, "step": 19000 }, { "epoch": 1.326537736507715, "eval_f1": 0.8066115702479338, "eval_loss": 0.35703688859939575, "eval_runtime": 19.0041, "eval_samples_per_second": 192.59, "eval_steps_per_second": 6.051, "step": 19000 }, { "epoch": 1.3300286252879983, "grad_norm": 5.897487640380859, "learning_rate": 4.0786309403751574e-05, "loss": 0.031, "step": 19050 }, { "epoch": 1.3335195140682818, "grad_norm": 0.11918721348047256, "learning_rate": 4.0747521449738574e-05, "loss": 0.038, "step": 19100 }, { "epoch": 1.3370104028485652, "grad_norm": 1.9710204601287842, "learning_rate": 4.0708733495725574e-05, "loss": 0.0304, "step": 19150 }, { "epoch": 1.3405012916288488, "grad_norm": 2.3936004638671875, "learning_rate": 4.066994554171257e-05, "loss": 0.0344, "step": 19200 }, { "epoch": 1.3439921804091322, "grad_norm": 1.0621711015701294, "learning_rate": 4.063115758769957e-05, "loss": 0.03, "step": 19250 }, { "epoch": 1.3474830691894155, "grad_norm": 0.2894875407218933, "learning_rate": 4.059236963368657e-05, "loss": 0.02, "step": 19300 }, { "epoch": 1.350973957969699, "grad_norm": 4.131627559661865, "learning_rate": 4.055358167967356e-05, "loss": 0.0263, "step": 19350 }, { "epoch": 1.3544648467499825, "grad_norm": 0.20086219906806946, "learning_rate": 4.051479372566056e-05, "loss": 0.02, "step": 19400 }, { "epoch": 1.357955735530266, "grad_norm": 0.02272665873169899, "learning_rate": 4.0476005771647554e-05, "loss": 0.0281, "step": 19450 }, { "epoch": 1.3614466243105494, "grad_norm": 0.08684906363487244, "learning_rate": 4.043721781763456e-05, "loss": 0.0292, "step": 19500 }, { "epoch": 1.364937513090833, "grad_norm": 1.9985971450805664, "learning_rate": 4.0398429863621554e-05, "loss": 0.0269, "step": 19550 }, { "epoch": 1.3684284018711164, "grad_norm": 0.5104591250419617, "learning_rate": 4.0359641909608554e-05, "loss": 0.0305, "step": 19600 }, { "epoch": 1.3719192906514, "grad_norm": 0.08239644020795822, "learning_rate": 4.032085395559555e-05, "loss": 0.0235, "step": 19650 }, { "epoch": 1.3754101794316833, "grad_norm": 2.711775779724121, "learning_rate": 4.0282066001582554e-05, "loss": 0.0265, "step": 19700 }, { "epoch": 1.3789010682119667, "grad_norm": 0.027858775109052658, "learning_rate": 4.024327804756955e-05, "loss": 0.0155, "step": 19750 }, { "epoch": 1.3823919569922503, "grad_norm": 0.02158626727759838, "learning_rate": 4.020449009355655e-05, "loss": 0.0264, "step": 19800 }, { "epoch": 1.3858828457725336, "grad_norm": 0.09914042055606842, "learning_rate": 4.016570213954354e-05, "loss": 0.0249, "step": 19850 }, { "epoch": 1.3893737345528172, "grad_norm": 6.718291282653809, "learning_rate": 4.012691418553055e-05, "loss": 0.0213, "step": 19900 }, { "epoch": 1.3928646233331006, "grad_norm": 0.07551147788763046, "learning_rate": 4.008812623151754e-05, "loss": 0.0336, "step": 19950 }, { "epoch": 1.396355512113384, "grad_norm": 0.027326975017786026, "learning_rate": 4.004933827750454e-05, "loss": 0.0352, "step": 20000 }, { "epoch": 1.396355512113384, "eval_f1": 0.8510773130544994, "eval_loss": 0.35609981417655945, "eval_runtime": 17.6862, "eval_samples_per_second": 206.941, "eval_steps_per_second": 6.502, "step": 20000 }, { "epoch": 1.3998464008936675, "grad_norm": 0.031843628734350204, "learning_rate": 4.0010550323491535e-05, "loss": 0.028, "step": 20050 }, { "epoch": 1.403337289673951, "grad_norm": 3.878753900527954, "learning_rate": 3.997176236947854e-05, "loss": 0.0413, "step": 20100 }, { "epoch": 1.4068281784542345, "grad_norm": 0.048792481422424316, "learning_rate": 3.9932974415465535e-05, "loss": 0.0237, "step": 20150 }, { "epoch": 1.4103190672345178, "grad_norm": 0.07304860651493073, "learning_rate": 3.9894186461452535e-05, "loss": 0.031, "step": 20200 }, { "epoch": 1.4138099560148014, "grad_norm": 2.2251689434051514, "learning_rate": 3.9855398507439535e-05, "loss": 0.0415, "step": 20250 }, { "epoch": 1.4173008447950848, "grad_norm": 0.6833386421203613, "learning_rate": 3.981661055342653e-05, "loss": 0.0295, "step": 20300 }, { "epoch": 1.4207917335753684, "grad_norm": 0.09307420253753662, "learning_rate": 3.977782259941353e-05, "loss": 0.0194, "step": 20350 }, { "epoch": 1.4242826223556517, "grad_norm": 0.03538484126329422, "learning_rate": 3.973903464540052e-05, "loss": 0.024, "step": 20400 }, { "epoch": 1.427773511135935, "grad_norm": 0.026891808956861496, "learning_rate": 3.970024669138753e-05, "loss": 0.0182, "step": 20450 }, { "epoch": 1.4312643999162187, "grad_norm": 1.5537898540496826, "learning_rate": 3.966145873737452e-05, "loss": 0.0373, "step": 20500 }, { "epoch": 1.4347552886965023, "grad_norm": 0.010172019712626934, "learning_rate": 3.962267078336152e-05, "loss": 0.0301, "step": 20550 }, { "epoch": 1.4382461774767856, "grad_norm": 0.11504697799682617, "learning_rate": 3.9583882829348515e-05, "loss": 0.0182, "step": 20600 }, { "epoch": 1.441737066257069, "grad_norm": 0.08654824644327164, "learning_rate": 3.954509487533552e-05, "loss": 0.0281, "step": 20650 }, { "epoch": 1.4452279550373526, "grad_norm": 2.5057575702667236, "learning_rate": 3.9506306921322515e-05, "loss": 0.0352, "step": 20700 }, { "epoch": 1.448718843817636, "grad_norm": 0.024753259494900703, "learning_rate": 3.9467518967309515e-05, "loss": 0.0159, "step": 20750 }, { "epoch": 1.4522097325979195, "grad_norm": 0.1611536741256714, "learning_rate": 3.942873101329651e-05, "loss": 0.0307, "step": 20800 }, { "epoch": 1.4557006213782029, "grad_norm": 1.3975356817245483, "learning_rate": 3.9389943059283516e-05, "loss": 0.033, "step": 20850 }, { "epoch": 1.4591915101584862, "grad_norm": 3.3075716495513916, "learning_rate": 3.935115510527051e-05, "loss": 0.0307, "step": 20900 }, { "epoch": 1.4626823989387698, "grad_norm": 0.0137362414970994, "learning_rate": 3.931236715125751e-05, "loss": 0.0245, "step": 20950 }, { "epoch": 1.4661732877190532, "grad_norm": 6.11804723739624, "learning_rate": 3.92735791972445e-05, "loss": 0.0449, "step": 21000 }, { "epoch": 1.4661732877190532, "eval_f1": 0.8997429305912596, "eval_loss": 0.2522394359111786, "eval_runtime": 25.048, "eval_samples_per_second": 146.12, "eval_steps_per_second": 4.591, "step": 21000 }, { "epoch": 1.4696641764993368, "grad_norm": 0.04642185568809509, "learning_rate": 3.923479124323151e-05, "loss": 0.0183, "step": 21050 }, { "epoch": 1.4731550652796201, "grad_norm": 0.055214524269104004, "learning_rate": 3.91960032892185e-05, "loss": 0.0331, "step": 21100 }, { "epoch": 1.4766459540599037, "grad_norm": 0.2047278881072998, "learning_rate": 3.91572153352055e-05, "loss": 0.0197, "step": 21150 }, { "epoch": 1.480136842840187, "grad_norm": 4.8017578125, "learning_rate": 3.9118427381192496e-05, "loss": 0.0209, "step": 21200 }, { "epoch": 1.4836277316204707, "grad_norm": 0.14394736289978027, "learning_rate": 3.9079639427179496e-05, "loss": 0.0298, "step": 21250 }, { "epoch": 1.487118620400754, "grad_norm": 10.568550109863281, "learning_rate": 3.9040851473166496e-05, "loss": 0.0302, "step": 21300 }, { "epoch": 1.4906095091810374, "grad_norm": 9.700904846191406, "learning_rate": 3.900206351915349e-05, "loss": 0.0272, "step": 21350 }, { "epoch": 1.494100397961321, "grad_norm": 3.1005468368530273, "learning_rate": 3.8963275565140496e-05, "loss": 0.0375, "step": 21400 }, { "epoch": 1.4975912867416044, "grad_norm": 0.24566790461540222, "learning_rate": 3.892448761112749e-05, "loss": 0.0224, "step": 21450 }, { "epoch": 1.501082175521888, "grad_norm": 0.2529149651527405, "learning_rate": 3.888569965711449e-05, "loss": 0.0204, "step": 21500 }, { "epoch": 1.5045730643021713, "grad_norm": 0.12634265422821045, "learning_rate": 3.884691170310148e-05, "loss": 0.0412, "step": 21550 }, { "epoch": 1.5080639530824547, "grad_norm": 0.01092058327049017, "learning_rate": 3.880812374908849e-05, "loss": 0.0194, "step": 21600 }, { "epoch": 1.5115548418627383, "grad_norm": 0.050227969884872437, "learning_rate": 3.877011155415574e-05, "loss": 0.0212, "step": 21650 }, { "epoch": 1.5150457306430218, "grad_norm": 0.07757686078548431, "learning_rate": 3.873132360014274e-05, "loss": 0.0288, "step": 21700 }, { "epoch": 1.5185366194233052, "grad_norm": 3.599360466003418, "learning_rate": 3.869253564612974e-05, "loss": 0.0287, "step": 21750 }, { "epoch": 1.5220275082035886, "grad_norm": 0.23955953121185303, "learning_rate": 3.8653747692116734e-05, "loss": 0.024, "step": 21800 }, { "epoch": 1.5255183969838721, "grad_norm": 0.11281917244195938, "learning_rate": 3.8614959738103734e-05, "loss": 0.0255, "step": 21850 }, { "epoch": 1.5290092857641555, "grad_norm": 0.08529462665319443, "learning_rate": 3.8576171784090734e-05, "loss": 0.019, "step": 21900 }, { "epoch": 1.532500174544439, "grad_norm": 2.4979732036590576, "learning_rate": 3.8537383830077734e-05, "loss": 0.0259, "step": 21950 }, { "epoch": 1.5359910633247225, "grad_norm": 0.12030795216560364, "learning_rate": 3.849859587606473e-05, "loss": 0.0222, "step": 22000 }, { "epoch": 1.5359910633247225, "eval_f1": 0.8751926040061633, "eval_loss": 0.33271512389183044, "eval_runtime": 27.5904, "eval_samples_per_second": 132.655, "eval_steps_per_second": 4.168, "step": 22000 }, { "epoch": 1.5394819521050058, "grad_norm": 0.10101566463708878, "learning_rate": 3.845980792205173e-05, "loss": 0.04, "step": 22050 }, { "epoch": 1.5429728408852894, "grad_norm": 1.0485931634902954, "learning_rate": 3.842101996803873e-05, "loss": 0.0309, "step": 22100 }, { "epoch": 1.546463729665573, "grad_norm": 0.11740628629922867, "learning_rate": 3.838223201402573e-05, "loss": 0.0306, "step": 22150 }, { "epoch": 1.5499546184458564, "grad_norm": 0.20643308758735657, "learning_rate": 3.834344406001272e-05, "loss": 0.028, "step": 22200 }, { "epoch": 1.5534455072261397, "grad_norm": 0.04227438196539879, "learning_rate": 3.830465610599972e-05, "loss": 0.0268, "step": 22250 }, { "epoch": 1.556936396006423, "grad_norm": 0.17100311815738678, "learning_rate": 3.826586815198672e-05, "loss": 0.0267, "step": 22300 }, { "epoch": 1.5604272847867067, "grad_norm": 0.024188999086618423, "learning_rate": 3.822708019797372e-05, "loss": 0.0259, "step": 22350 }, { "epoch": 1.5639181735669903, "grad_norm": 0.13360099494457245, "learning_rate": 3.8188292243960715e-05, "loss": 0.0408, "step": 22400 }, { "epoch": 1.5674090623472736, "grad_norm": 0.06247904151678085, "learning_rate": 3.814950428994772e-05, "loss": 0.028, "step": 22450 }, { "epoch": 1.570899951127557, "grad_norm": 3.2922091484069824, "learning_rate": 3.8110716335934715e-05, "loss": 0.0246, "step": 22500 }, { "epoch": 1.5743908399078406, "grad_norm": 1.2226293087005615, "learning_rate": 3.8071928381921715e-05, "loss": 0.0358, "step": 22550 }, { "epoch": 1.5778817286881242, "grad_norm": 1.1220752000808716, "learning_rate": 3.803314042790871e-05, "loss": 0.0169, "step": 22600 }, { "epoch": 1.5813726174684075, "grad_norm": 0.08222747594118118, "learning_rate": 3.799435247389571e-05, "loss": 0.0168, "step": 22650 }, { "epoch": 1.5848635062486909, "grad_norm": 0.9396281242370605, "learning_rate": 3.795556451988271e-05, "loss": 0.0269, "step": 22700 }, { "epoch": 1.5883543950289742, "grad_norm": 0.038672447204589844, "learning_rate": 3.79167765658697e-05, "loss": 0.0172, "step": 22750 }, { "epoch": 1.5918452838092578, "grad_norm": 0.020772503688931465, "learning_rate": 3.78779886118567e-05, "loss": 0.0309, "step": 22800 }, { "epoch": 1.5953361725895414, "grad_norm": 0.19638319313526154, "learning_rate": 3.78392006578437e-05, "loss": 0.0246, "step": 22850 }, { "epoch": 1.5988270613698248, "grad_norm": 0.6562692523002625, "learning_rate": 3.78004127038307e-05, "loss": 0.0192, "step": 22900 }, { "epoch": 1.6023179501501081, "grad_norm": 3.3480851650238037, "learning_rate": 3.7761624749817695e-05, "loss": 0.0179, "step": 22950 }, { "epoch": 1.6058088389303917, "grad_norm": 0.07655413448810577, "learning_rate": 3.7722836795804695e-05, "loss": 0.0261, "step": 23000 }, { "epoch": 1.6058088389303917, "eval_f1": 0.9204448246364414, "eval_loss": 0.22322723269462585, "eval_runtime": 25.7811, "eval_samples_per_second": 141.964, "eval_steps_per_second": 4.461, "step": 23000 }, { "epoch": 1.609299727710675, "grad_norm": 1.4100903272628784, "learning_rate": 3.7684048841791695e-05, "loss": 0.0337, "step": 23050 }, { "epoch": 1.6127906164909587, "grad_norm": 0.710587739944458, "learning_rate": 3.7645260887778695e-05, "loss": 0.0236, "step": 23100 }, { "epoch": 1.616281505271242, "grad_norm": 0.6931687593460083, "learning_rate": 3.760647293376569e-05, "loss": 0.0225, "step": 23150 }, { "epoch": 1.6197723940515254, "grad_norm": 0.12429849058389664, "learning_rate": 3.756768497975269e-05, "loss": 0.0275, "step": 23200 }, { "epoch": 1.623263282831809, "grad_norm": 0.04798004776239395, "learning_rate": 3.752889702573969e-05, "loss": 0.033, "step": 23250 }, { "epoch": 1.6267541716120926, "grad_norm": 0.08050743490457535, "learning_rate": 3.749010907172669e-05, "loss": 0.0299, "step": 23300 }, { "epoch": 1.630245060392376, "grad_norm": 0.47208932042121887, "learning_rate": 3.745132111771368e-05, "loss": 0.0337, "step": 23350 }, { "epoch": 1.6337359491726593, "grad_norm": 9.010507583618164, "learning_rate": 3.741253316370068e-05, "loss": 0.0256, "step": 23400 }, { "epoch": 1.6372268379529427, "grad_norm": 0.23059742152690887, "learning_rate": 3.737374520968768e-05, "loss": 0.0214, "step": 23450 }, { "epoch": 1.6407177267332262, "grad_norm": 0.037250641733407974, "learning_rate": 3.733495725567468e-05, "loss": 0.0174, "step": 23500 }, { "epoch": 1.6442086155135098, "grad_norm": 0.04142063111066818, "learning_rate": 3.7296169301661676e-05, "loss": 0.0257, "step": 23550 }, { "epoch": 1.6476995042937932, "grad_norm": 0.061181291937828064, "learning_rate": 3.7257381347648676e-05, "loss": 0.0369, "step": 23600 }, { "epoch": 1.6511903930740766, "grad_norm": 0.12618523836135864, "learning_rate": 3.7218593393635676e-05, "loss": 0.028, "step": 23650 }, { "epoch": 1.6546812818543601, "grad_norm": 0.05392460525035858, "learning_rate": 3.717980543962267e-05, "loss": 0.0169, "step": 23700 }, { "epoch": 1.6581721706346437, "grad_norm": 0.589623749256134, "learning_rate": 3.714101748560967e-05, "loss": 0.0297, "step": 23750 }, { "epoch": 1.661663059414927, "grad_norm": 0.03963274881243706, "learning_rate": 3.710222953159667e-05, "loss": 0.0146, "step": 23800 }, { "epoch": 1.6651539481952105, "grad_norm": 0.0122640086337924, "learning_rate": 3.706344157758367e-05, "loss": 0.0118, "step": 23850 }, { "epoch": 1.6686448369754938, "grad_norm": 0.0617678239941597, "learning_rate": 3.702465362357066e-05, "loss": 0.0284, "step": 23900 }, { "epoch": 1.6721357257557774, "grad_norm": 0.14892399311065674, "learning_rate": 3.698586566955766e-05, "loss": 0.0248, "step": 23950 }, { "epoch": 1.675626614536061, "grad_norm": 0.011567726731300354, "learning_rate": 3.694707771554466e-05, "loss": 0.0173, "step": 24000 }, { "epoch": 1.675626614536061, "eval_f1": 0.9144698301113063, "eval_loss": 0.24581825733184814, "eval_runtime": 27.0574, "eval_samples_per_second": 135.268, "eval_steps_per_second": 4.25, "step": 24000 }, { "epoch": 1.6791175033163444, "grad_norm": 0.02023119479417801, "learning_rate": 3.690828976153166e-05, "loss": 0.0285, "step": 24050 }, { "epoch": 1.6826083920966277, "grad_norm": 0.08181091398000717, "learning_rate": 3.6869501807518656e-05, "loss": 0.0203, "step": 24100 }, { "epoch": 1.6860992808769113, "grad_norm": 0.04160789027810097, "learning_rate": 3.6830713853505656e-05, "loss": 0.0165, "step": 24150 }, { "epoch": 1.6895901696571949, "grad_norm": 0.5199412107467651, "learning_rate": 3.6791925899492656e-05, "loss": 0.0331, "step": 24200 }, { "epoch": 1.6930810584374782, "grad_norm": 0.016219686716794968, "learning_rate": 3.6753137945479657e-05, "loss": 0.0367, "step": 24250 }, { "epoch": 1.6965719472177616, "grad_norm": 0.06320388615131378, "learning_rate": 3.671434999146665e-05, "loss": 0.0172, "step": 24300 }, { "epoch": 1.700062835998045, "grad_norm": 0.04016570374369621, "learning_rate": 3.667556203745365e-05, "loss": 0.0104, "step": 24350 }, { "epoch": 1.7035537247783286, "grad_norm": 0.7291899919509888, "learning_rate": 3.663677408344065e-05, "loss": 0.0387, "step": 24400 }, { "epoch": 1.7070446135586121, "grad_norm": 0.44951528310775757, "learning_rate": 3.659798612942765e-05, "loss": 0.0234, "step": 24450 }, { "epoch": 1.7105355023388955, "grad_norm": 1.9208860397338867, "learning_rate": 3.655919817541464e-05, "loss": 0.0194, "step": 24500 }, { "epoch": 1.7140263911191789, "grad_norm": 0.12419503182172775, "learning_rate": 3.6520410221401643e-05, "loss": 0.0207, "step": 24550 }, { "epoch": 1.7175172798994625, "grad_norm": 0.03292817994952202, "learning_rate": 3.6481622267388644e-05, "loss": 0.0361, "step": 24600 }, { "epoch": 1.7210081686797458, "grad_norm": 3.2311935424804688, "learning_rate": 3.644283431337564e-05, "loss": 0.0169, "step": 24650 }, { "epoch": 1.7244990574600294, "grad_norm": 1.065648078918457, "learning_rate": 3.640404635936264e-05, "loss": 0.0237, "step": 24700 }, { "epoch": 1.7279899462403128, "grad_norm": 0.02137444168329239, "learning_rate": 3.636525840534964e-05, "loss": 0.0167, "step": 24750 }, { "epoch": 1.7314808350205961, "grad_norm": 1.4355806112289429, "learning_rate": 3.632647045133664e-05, "loss": 0.0215, "step": 24800 }, { "epoch": 1.7349717238008797, "grad_norm": 4.564198970794678, "learning_rate": 3.628768249732363e-05, "loss": 0.0226, "step": 24850 }, { "epoch": 1.7384626125811633, "grad_norm": 0.08031658828258514, "learning_rate": 3.624889454331063e-05, "loss": 0.0282, "step": 24900 }, { "epoch": 1.7419535013614467, "grad_norm": 1.2107548713684082, "learning_rate": 3.621010658929763e-05, "loss": 0.0411, "step": 24950 }, { "epoch": 1.74544439014173, "grad_norm": 0.18569612503051758, "learning_rate": 3.617131863528463e-05, "loss": 0.0165, "step": 25000 }, { "epoch": 1.74544439014173, "eval_f1": 0.8931159420289855, "eval_loss": 0.2845694422721863, "eval_runtime": 24.4848, "eval_samples_per_second": 149.48, "eval_steps_per_second": 4.697, "step": 25000 }, { "epoch": 1.7489352789220134, "grad_norm": 1.5806281566619873, "learning_rate": 3.6132530681271624e-05, "loss": 0.023, "step": 25050 }, { "epoch": 1.752426167702297, "grad_norm": 0.054229360073804855, "learning_rate": 3.6093742727258624e-05, "loss": 0.0202, "step": 25100 }, { "epoch": 1.7559170564825806, "grad_norm": 0.20487448573112488, "learning_rate": 3.6054954773245624e-05, "loss": 0.0389, "step": 25150 }, { "epoch": 1.759407945262864, "grad_norm": 0.7431159615516663, "learning_rate": 3.6016166819232624e-05, "loss": 0.0119, "step": 25200 }, { "epoch": 1.7628988340431473, "grad_norm": 0.22739042341709137, "learning_rate": 3.597737886521962e-05, "loss": 0.0383, "step": 25250 }, { "epoch": 1.7663897228234309, "grad_norm": 0.023175235837697983, "learning_rate": 3.593859091120662e-05, "loss": 0.0337, "step": 25300 }, { "epoch": 1.7698806116037145, "grad_norm": 0.03510722145438194, "learning_rate": 3.589980295719362e-05, "loss": 0.0225, "step": 25350 }, { "epoch": 1.7733715003839978, "grad_norm": 0.03466213867068291, "learning_rate": 3.586101500318062e-05, "loss": 0.0267, "step": 25400 }, { "epoch": 1.7768623891642812, "grad_norm": 0.23764640092849731, "learning_rate": 3.582222704916761e-05, "loss": 0.0231, "step": 25450 }, { "epoch": 1.7803532779445645, "grad_norm": 5.474936008453369, "learning_rate": 3.578343909515461e-05, "loss": 0.0276, "step": 25500 }, { "epoch": 1.7838441667248481, "grad_norm": 0.040409285575151443, "learning_rate": 3.574465114114161e-05, "loss": 0.0203, "step": 25550 }, { "epoch": 1.7873350555051317, "grad_norm": 0.04732607677578926, "learning_rate": 3.5705863187128604e-05, "loss": 0.0233, "step": 25600 }, { "epoch": 1.790825944285415, "grad_norm": 1.5996266603469849, "learning_rate": 3.566785099219587e-05, "loss": 0.0298, "step": 25650 }, { "epoch": 1.7943168330656984, "grad_norm": 0.021670788526535034, "learning_rate": 3.562906303818286e-05, "loss": 0.0258, "step": 25700 }, { "epoch": 1.797807721845982, "grad_norm": 0.11140725761651993, "learning_rate": 3.559027508416986e-05, "loss": 0.0253, "step": 25750 }, { "epoch": 1.8012986106262654, "grad_norm": 0.19464978575706482, "learning_rate": 3.5551487130156856e-05, "loss": 0.0394, "step": 25800 }, { "epoch": 1.804789499406549, "grad_norm": 0.0491335391998291, "learning_rate": 3.551269917614386e-05, "loss": 0.0254, "step": 25850 }, { "epoch": 1.8082803881868323, "grad_norm": 0.07059049606323242, "learning_rate": 3.5473911222130856e-05, "loss": 0.0111, "step": 25900 }, { "epoch": 1.8117712769671157, "grad_norm": 2.9037585258483887, "learning_rate": 3.5435123268117856e-05, "loss": 0.0287, "step": 25950 }, { "epoch": 1.8152621657473993, "grad_norm": 0.014848013408482075, "learning_rate": 3.5396335314104856e-05, "loss": 0.0068, "step": 26000 }, { "epoch": 1.8152621657473993, "eval_f1": 0.9202556265629341, "eval_loss": 0.22660647332668304, "eval_runtime": 18.1122, "eval_samples_per_second": 202.073, "eval_steps_per_second": 6.349, "step": 26000 }, { "epoch": 1.8187530545276829, "grad_norm": 1.870595932006836, "learning_rate": 3.535754736009185e-05, "loss": 0.034, "step": 26050 }, { "epoch": 1.8222439433079662, "grad_norm": 0.03695142641663551, "learning_rate": 3.531875940607885e-05, "loss": 0.0334, "step": 26100 }, { "epoch": 1.8257348320882496, "grad_norm": 0.5613416433334351, "learning_rate": 3.527997145206585e-05, "loss": 0.0274, "step": 26150 }, { "epoch": 1.829225720868533, "grad_norm": 2.8512015342712402, "learning_rate": 3.524118349805285e-05, "loss": 0.0262, "step": 26200 }, { "epoch": 1.8327166096488166, "grad_norm": 0.012033411301672459, "learning_rate": 3.520239554403984e-05, "loss": 0.02, "step": 26250 }, { "epoch": 1.8362074984291001, "grad_norm": 0.1745612472295761, "learning_rate": 3.516360759002684e-05, "loss": 0.0098, "step": 26300 }, { "epoch": 1.8396983872093835, "grad_norm": 0.3587130308151245, "learning_rate": 3.5124819636013836e-05, "loss": 0.0194, "step": 26350 }, { "epoch": 1.8431892759896669, "grad_norm": 1.0576293468475342, "learning_rate": 3.508603168200084e-05, "loss": 0.0266, "step": 26400 }, { "epoch": 1.8466801647699504, "grad_norm": 0.13543370366096497, "learning_rate": 3.5047243727987836e-05, "loss": 0.0206, "step": 26450 }, { "epoch": 1.850171053550234, "grad_norm": 0.039760392159223557, "learning_rate": 3.50092315330551e-05, "loss": 0.0271, "step": 26500 }, { "epoch": 1.8536619423305174, "grad_norm": 0.8882807493209839, "learning_rate": 3.4970443579042095e-05, "loss": 0.0298, "step": 26550 }, { "epoch": 1.8571528311108008, "grad_norm": 0.032096270471811295, "learning_rate": 3.4931655625029095e-05, "loss": 0.0096, "step": 26600 }, { "epoch": 1.8606437198910841, "grad_norm": 0.19736655056476593, "learning_rate": 3.489286767101609e-05, "loss": 0.0395, "step": 26650 }, { "epoch": 1.8641346086713677, "grad_norm": 0.04327971488237381, "learning_rate": 3.485407971700309e-05, "loss": 0.0226, "step": 26700 }, { "epoch": 1.8676254974516513, "grad_norm": 0.00801787618547678, "learning_rate": 3.481529176299009e-05, "loss": 0.0133, "step": 26750 }, { "epoch": 1.8711163862319347, "grad_norm": 6.338688850402832, "learning_rate": 3.477650380897708e-05, "loss": 0.0156, "step": 26800 }, { "epoch": 1.874607275012218, "grad_norm": 0.03296707198023796, "learning_rate": 3.473771585496408e-05, "loss": 0.0333, "step": 26850 }, { "epoch": 1.8780981637925016, "grad_norm": 0.0316268652677536, "learning_rate": 3.469892790095108e-05, "loss": 0.0156, "step": 26900 }, { "epoch": 1.8815890525727852, "grad_norm": 3.483025312423706, "learning_rate": 3.466013994693808e-05, "loss": 0.0242, "step": 26950 }, { "epoch": 1.8850799413530686, "grad_norm": 0.11344876885414124, "learning_rate": 3.4621351992925075e-05, "loss": 0.0252, "step": 27000 }, { "epoch": 1.8850799413530686, "eval_f1": 0.8898203592814371, "eval_loss": 0.2792615592479706, "eval_runtime": 383.384, "eval_samples_per_second": 9.547, "eval_steps_per_second": 0.3, "step": 27000 }, { "epoch": 1.888570830133352, "grad_norm": 0.025170810520648956, "learning_rate": 3.458256403891208e-05, "loss": 0.0292, "step": 27050 }, { "epoch": 1.8920617189136353, "grad_norm": 0.12406772375106812, "learning_rate": 3.4543776084899075e-05, "loss": 0.023, "step": 27100 }, { "epoch": 1.8955526076939189, "grad_norm": 0.05151906982064247, "learning_rate": 3.4504988130886075e-05, "loss": 0.0199, "step": 27150 }, { "epoch": 1.8990434964742025, "grad_norm": 0.07848247140645981, "learning_rate": 3.446620017687307e-05, "loss": 0.02, "step": 27200 }, { "epoch": 1.9025343852544858, "grad_norm": 0.01637980341911316, "learning_rate": 3.4427412222860075e-05, "loss": 0.0175, "step": 27250 }, { "epoch": 1.9060252740347692, "grad_norm": 0.6902241706848145, "learning_rate": 3.438862426884707e-05, "loss": 0.0387, "step": 27300 }, { "epoch": 1.9095161628150528, "grad_norm": 1.2976561784744263, "learning_rate": 3.434983631483407e-05, "loss": 0.0266, "step": 27350 }, { "epoch": 1.9130070515953361, "grad_norm": 0.7005116939544678, "learning_rate": 3.431104836082106e-05, "loss": 0.0173, "step": 27400 }, { "epoch": 1.9164979403756197, "grad_norm": 1.2808347940444946, "learning_rate": 3.427226040680807e-05, "loss": 0.036, "step": 27450 }, { "epoch": 1.919988829155903, "grad_norm": 0.03168126195669174, "learning_rate": 3.423347245279506e-05, "loss": 0.0343, "step": 27500 }, { "epoch": 1.9234797179361864, "grad_norm": 0.12667793035507202, "learning_rate": 3.419468449878206e-05, "loss": 0.0239, "step": 27550 }, { "epoch": 1.92697060671647, "grad_norm": 0.9971345663070679, "learning_rate": 3.4155896544769055e-05, "loss": 0.0198, "step": 27600 }, { "epoch": 1.9304614954967536, "grad_norm": 0.03455730155110359, "learning_rate": 3.4117108590756056e-05, "loss": 0.038, "step": 27650 }, { "epoch": 1.933952384277037, "grad_norm": 0.5763664245605469, "learning_rate": 3.4078320636743056e-05, "loss": 0.0328, "step": 27700 }, { "epoch": 1.9374432730573203, "grad_norm": 0.015370190143585205, "learning_rate": 3.403953268273005e-05, "loss": 0.0124, "step": 27750 }, { "epoch": 1.9409341618376037, "grad_norm": 0.5836357474327087, "learning_rate": 3.400074472871705e-05, "loss": 0.0227, "step": 27800 }, { "epoch": 1.9444250506178873, "grad_norm": 0.015789790078997612, "learning_rate": 3.396195677470405e-05, "loss": 0.0131, "step": 27850 }, { "epoch": 1.9479159393981709, "grad_norm": 0.02878633514046669, "learning_rate": 3.392316882069105e-05, "loss": 0.0317, "step": 27900 }, { "epoch": 1.9514068281784542, "grad_norm": 0.2344045788049698, "learning_rate": 3.388438086667804e-05, "loss": 0.0178, "step": 27950 }, { "epoch": 1.9548977169587376, "grad_norm": 0.2111712098121643, "learning_rate": 3.384559291266504e-05, "loss": 0.018, "step": 28000 }, { "epoch": 1.9548977169587376, "eval_f1": 0.9149679673849738, "eval_loss": 0.23677824437618256, "eval_runtime": 27.4697, "eval_samples_per_second": 133.237, "eval_steps_per_second": 4.186, "step": 28000 }, { "epoch": 1.9583886057390212, "grad_norm": 2.1588897705078125, "learning_rate": 3.380680495865204e-05, "loss": 0.0354, "step": 28050 }, { "epoch": 1.9618794945193048, "grad_norm": 0.13064005970954895, "learning_rate": 3.376801700463904e-05, "loss": 0.0231, "step": 28100 }, { "epoch": 1.9653703832995881, "grad_norm": 0.4351823329925537, "learning_rate": 3.3729229050626036e-05, "loss": 0.0381, "step": 28150 }, { "epoch": 1.9688612720798715, "grad_norm": 0.03735004737973213, "learning_rate": 3.369044109661304e-05, "loss": 0.0454, "step": 28200 }, { "epoch": 1.9723521608601549, "grad_norm": 2.105356454849243, "learning_rate": 3.3651653142600036e-05, "loss": 0.0196, "step": 28250 }, { "epoch": 1.9758430496404384, "grad_norm": 0.01047208160161972, "learning_rate": 3.3612865188587036e-05, "loss": 0.0262, "step": 28300 }, { "epoch": 1.979333938420722, "grad_norm": 0.2573576271533966, "learning_rate": 3.357407723457403e-05, "loss": 0.0283, "step": 28350 }, { "epoch": 1.9828248272010054, "grad_norm": 0.03263131529092789, "learning_rate": 3.3535289280561036e-05, "loss": 0.0222, "step": 28400 }, { "epoch": 1.9863157159812888, "grad_norm": 1.0335652828216553, "learning_rate": 3.349650132654803e-05, "loss": 0.0137, "step": 28450 }, { "epoch": 1.9898066047615723, "grad_norm": 0.029950108379125595, "learning_rate": 3.345771337253503e-05, "loss": 0.0257, "step": 28500 }, { "epoch": 1.9932974935418557, "grad_norm": 0.9945626854896545, "learning_rate": 3.341892541852202e-05, "loss": 0.0156, "step": 28550 }, { "epoch": 1.9967883823221393, "grad_norm": 0.18837442994117737, "learning_rate": 3.338013746450902e-05, "loss": 0.015, "step": 28600 }, { "epoch": 2.0002792711024227, "grad_norm": 0.023262226954102516, "learning_rate": 3.334134951049602e-05, "loss": 0.0184, "step": 28650 }, { "epoch": 2.003770159882706, "grad_norm": 0.7680675387382507, "learning_rate": 3.3302561556483016e-05, "loss": 0.0272, "step": 28700 }, { "epoch": 2.0072610486629894, "grad_norm": 0.5894890427589417, "learning_rate": 3.3263773602470017e-05, "loss": 0.036, "step": 28750 }, { "epoch": 2.010751937443273, "grad_norm": 0.16585896909236908, "learning_rate": 3.3224985648457017e-05, "loss": 0.0113, "step": 28800 }, { "epoch": 2.0142428262235565, "grad_norm": 0.09970059245824814, "learning_rate": 3.318619769444402e-05, "loss": 0.024, "step": 28850 }, { "epoch": 2.01773371500384, "grad_norm": 3.7176826000213623, "learning_rate": 3.3148185499511275e-05, "loss": 0.0298, "step": 28900 }, { "epoch": 2.0212246037841233, "grad_norm": 0.08712600916624069, "learning_rate": 3.3109397545498275e-05, "loss": 0.0169, "step": 28950 }, { "epoch": 2.024715492564407, "grad_norm": 0.592137336730957, "learning_rate": 3.307060959148527e-05, "loss": 0.0185, "step": 29000 }, { "epoch": 2.024715492564407, "eval_f1": 0.9016686531585221, "eval_loss": 0.27124351263046265, "eval_runtime": 24.7503, "eval_samples_per_second": 147.877, "eval_steps_per_second": 4.646, "step": 29000 }, { "epoch": 2.0282063813446904, "grad_norm": 41.94601058959961, "learning_rate": 3.303182163747227e-05, "loss": 0.0235, "step": 29050 }, { "epoch": 2.031697270124974, "grad_norm": 0.031210171058773994, "learning_rate": 3.299303368345927e-05, "loss": 0.0282, "step": 29100 }, { "epoch": 2.035188158905257, "grad_norm": 0.10925783962011337, "learning_rate": 3.295424572944626e-05, "loss": 0.0321, "step": 29150 }, { "epoch": 2.0386790476855405, "grad_norm": 3.2144949436187744, "learning_rate": 3.291545777543326e-05, "loss": 0.036, "step": 29200 }, { "epoch": 2.0421699364658243, "grad_norm": 0.024584254249930382, "learning_rate": 3.287666982142026e-05, "loss": 0.0294, "step": 29250 }, { "epoch": 2.0456608252461077, "grad_norm": 4.2165656089782715, "learning_rate": 3.283788186740726e-05, "loss": 0.025, "step": 29300 }, { "epoch": 2.049151714026391, "grad_norm": 11.206308364868164, "learning_rate": 3.2799093913394255e-05, "loss": 0.0176, "step": 29350 }, { "epoch": 2.0526426028066744, "grad_norm": 1.3034924268722534, "learning_rate": 3.2760305959381255e-05, "loss": 0.0274, "step": 29400 }, { "epoch": 2.0561334915869582, "grad_norm": 0.08774253726005554, "learning_rate": 3.2721518005368255e-05, "loss": 0.0205, "step": 29450 }, { "epoch": 2.0596243803672416, "grad_norm": 1.1402833461761475, "learning_rate": 3.2682730051355255e-05, "loss": 0.0234, "step": 29500 }, { "epoch": 2.063115269147525, "grad_norm": 0.5319716930389404, "learning_rate": 3.264394209734225e-05, "loss": 0.0266, "step": 29550 }, { "epoch": 2.0666061579278083, "grad_norm": 0.014304044656455517, "learning_rate": 3.260515414332925e-05, "loss": 0.0233, "step": 29600 }, { "epoch": 2.0700970467080917, "grad_norm": 0.031523484736680984, "learning_rate": 3.256636618931625e-05, "loss": 0.0285, "step": 29650 }, { "epoch": 2.0735879354883755, "grad_norm": 2.613784074783325, "learning_rate": 3.252757823530325e-05, "loss": 0.018, "step": 29700 }, { "epoch": 2.077078824268659, "grad_norm": 5.857016563415527, "learning_rate": 3.248879028129024e-05, "loss": 0.0272, "step": 29750 }, { "epoch": 2.0805697130489422, "grad_norm": 3.223562002182007, "learning_rate": 3.245000232727724e-05, "loss": 0.0249, "step": 29800 }, { "epoch": 2.0840606018292256, "grad_norm": 0.13457106053829193, "learning_rate": 3.241121437326424e-05, "loss": 0.0299, "step": 29850 }, { "epoch": 2.087551490609509, "grad_norm": 0.08261656016111374, "learning_rate": 3.237242641925124e-05, "loss": 0.0145, "step": 29900 }, { "epoch": 2.0910423793897928, "grad_norm": 0.022206531837582588, "learning_rate": 3.2333638465238236e-05, "loss": 0.0297, "step": 29950 }, { "epoch": 2.094533268170076, "grad_norm": 0.23456662893295288, "learning_rate": 3.2294850511225236e-05, "loss": 0.028, "step": 30000 }, { "epoch": 2.094533268170076, "eval_f1": 0.90849478390462, "eval_loss": 0.22531723976135254, "eval_runtime": 26.1579, "eval_samples_per_second": 139.92, "eval_steps_per_second": 4.396, "step": 30000 }, { "epoch": 2.0980241569503595, "grad_norm": 0.04046647995710373, "learning_rate": 3.2256062557212236e-05, "loss": 0.0219, "step": 30050 }, { "epoch": 2.101515045730643, "grad_norm": 0.1782691329717636, "learning_rate": 3.221727460319923e-05, "loss": 0.0185, "step": 30100 }, { "epoch": 2.1050059345109267, "grad_norm": 0.05876058712601662, "learning_rate": 3.217848664918623e-05, "loss": 0.025, "step": 30150 }, { "epoch": 2.10849682329121, "grad_norm": 0.05100909247994423, "learning_rate": 3.213969869517323e-05, "loss": 0.0353, "step": 30200 }, { "epoch": 2.1119877120714934, "grad_norm": 0.1494356095790863, "learning_rate": 3.210091074116023e-05, "loss": 0.0216, "step": 30250 }, { "epoch": 2.1154786008517767, "grad_norm": 0.019559986889362335, "learning_rate": 3.206212278714722e-05, "loss": 0.0179, "step": 30300 }, { "epoch": 2.11896948963206, "grad_norm": 0.009545271284878254, "learning_rate": 3.202333483313422e-05, "loss": 0.0209, "step": 30350 }, { "epoch": 2.122460378412344, "grad_norm": 0.016645213589072227, "learning_rate": 3.198454687912122e-05, "loss": 0.0328, "step": 30400 }, { "epoch": 2.1259512671926273, "grad_norm": 0.05545541271567345, "learning_rate": 3.194575892510822e-05, "loss": 0.0161, "step": 30450 }, { "epoch": 2.1294421559729106, "grad_norm": 4.6828789710998535, "learning_rate": 3.1906970971095216e-05, "loss": 0.0248, "step": 30500 }, { "epoch": 2.132933044753194, "grad_norm": 0.03040727972984314, "learning_rate": 3.1868183017082216e-05, "loss": 0.0233, "step": 30550 }, { "epoch": 2.136423933533478, "grad_norm": 0.04015226662158966, "learning_rate": 3.1829395063069216e-05, "loss": 0.0187, "step": 30600 }, { "epoch": 2.139914822313761, "grad_norm": 0.44110485911369324, "learning_rate": 3.1790607109056216e-05, "loss": 0.0358, "step": 30650 }, { "epoch": 2.1434057110940445, "grad_norm": 0.03207771107554436, "learning_rate": 3.175181915504321e-05, "loss": 0.0207, "step": 30700 }, { "epoch": 2.146896599874328, "grad_norm": 0.02635408379137516, "learning_rate": 3.171303120103021e-05, "loss": 0.0244, "step": 30750 }, { "epoch": 2.1503874886546113, "grad_norm": 0.026559866964817047, "learning_rate": 3.167424324701721e-05, "loss": 0.0218, "step": 30800 }, { "epoch": 2.153878377434895, "grad_norm": 0.03618308901786804, "learning_rate": 3.163545529300421e-05, "loss": 0.0307, "step": 30850 }, { "epoch": 2.1573692662151784, "grad_norm": 0.021506665274500847, "learning_rate": 3.15966673389912e-05, "loss": 0.0181, "step": 30900 }, { "epoch": 2.160860154995462, "grad_norm": 0.19482529163360596, "learning_rate": 3.15578793849782e-05, "loss": 0.0187, "step": 30950 }, { "epoch": 2.164351043775745, "grad_norm": 0.6267532706260681, "learning_rate": 3.15190914309652e-05, "loss": 0.0231, "step": 31000 }, { "epoch": 2.164351043775745, "eval_f1": 0.9244186046511628, "eval_loss": 0.22860726714134216, "eval_runtime": 24.3238, "eval_samples_per_second": 150.47, "eval_steps_per_second": 4.728, "step": 31000 }, { "epoch": 2.167841932556029, "grad_norm": 0.10762369632720947, "learning_rate": 3.1480303476952197e-05, "loss": 0.0166, "step": 31050 }, { "epoch": 2.1713328213363123, "grad_norm": 0.02598682790994644, "learning_rate": 3.14415155229392e-05, "loss": 0.0337, "step": 31100 }, { "epoch": 2.1748237101165957, "grad_norm": 2.0279221534729004, "learning_rate": 3.14027275689262e-05, "loss": 0.0139, "step": 31150 }, { "epoch": 2.178314598896879, "grad_norm": 0.023457834497094154, "learning_rate": 3.13639396149132e-05, "loss": 0.0213, "step": 31200 }, { "epoch": 2.1818054876771624, "grad_norm": 1.7613838911056519, "learning_rate": 3.132515166090019e-05, "loss": 0.0308, "step": 31250 }, { "epoch": 2.1852963764574462, "grad_norm": 0.7117096185684204, "learning_rate": 3.128636370688719e-05, "loss": 0.0155, "step": 31300 }, { "epoch": 2.1887872652377296, "grad_norm": 0.17505602538585663, "learning_rate": 3.1247575752874183e-05, "loss": 0.0203, "step": 31350 }, { "epoch": 2.192278154018013, "grad_norm": 0.0648331269621849, "learning_rate": 3.120878779886119e-05, "loss": 0.0202, "step": 31400 }, { "epoch": 2.1957690427982963, "grad_norm": 0.6441143155097961, "learning_rate": 3.1169999844848184e-05, "loss": 0.0442, "step": 31450 }, { "epoch": 2.19925993157858, "grad_norm": 0.027606986463069916, "learning_rate": 3.1131211890835184e-05, "loss": 0.0161, "step": 31500 }, { "epoch": 2.2027508203588635, "grad_norm": 0.007126821670681238, "learning_rate": 3.109242393682218e-05, "loss": 0.0097, "step": 31550 }, { "epoch": 2.206241709139147, "grad_norm": 0.008397717960178852, "learning_rate": 3.1053635982809184e-05, "loss": 0.0293, "step": 31600 }, { "epoch": 2.20973259791943, "grad_norm": 0.03310353308916092, "learning_rate": 3.101484802879618e-05, "loss": 0.0314, "step": 31650 }, { "epoch": 2.2132234866997136, "grad_norm": 0.02405470609664917, "learning_rate": 3.097606007478318e-05, "loss": 0.0274, "step": 31700 }, { "epoch": 2.2167143754799974, "grad_norm": 0.1666310578584671, "learning_rate": 3.093727212077018e-05, "loss": 0.0215, "step": 31750 }, { "epoch": 2.2202052642602808, "grad_norm": 0.08723346889019012, "learning_rate": 3.089848416675718e-05, "loss": 0.0177, "step": 31800 }, { "epoch": 2.223696153040564, "grad_norm": 0.019003497436642647, "learning_rate": 3.085969621274417e-05, "loss": 0.0149, "step": 31850 }, { "epoch": 2.2271870418208475, "grad_norm": 0.010868572629988194, "learning_rate": 3.082090825873117e-05, "loss": 0.0138, "step": 31900 }, { "epoch": 2.230677930601131, "grad_norm": 0.08453696966171265, "learning_rate": 3.078212030471817e-05, "loss": 0.0166, "step": 31950 }, { "epoch": 2.2341688193814147, "grad_norm": 0.06810794770717621, "learning_rate": 3.0743332350705164e-05, "loss": 0.0241, "step": 32000 }, { "epoch": 2.2341688193814147, "eval_f1": 0.9090370370370371, "eval_loss": 0.25670725107192993, "eval_runtime": 23.7993, "eval_samples_per_second": 153.786, "eval_steps_per_second": 4.832, "step": 32000 }, { "epoch": 2.237659708161698, "grad_norm": 0.08504435420036316, "learning_rate": 3.0704544396692164e-05, "loss": 0.0206, "step": 32050 }, { "epoch": 2.2411505969419814, "grad_norm": 0.009788773022592068, "learning_rate": 3.0665756442679164e-05, "loss": 0.01, "step": 32100 }, { "epoch": 2.2446414857222647, "grad_norm": 0.05060356482863426, "learning_rate": 3.0626968488666164e-05, "loss": 0.0194, "step": 32150 }, { "epoch": 2.2481323745025485, "grad_norm": 0.020789574831724167, "learning_rate": 3.058818053465316e-05, "loss": 0.0193, "step": 32200 }, { "epoch": 2.251623263282832, "grad_norm": 0.09519587457180023, "learning_rate": 3.054939258064016e-05, "loss": 0.0388, "step": 32250 }, { "epoch": 2.2551141520631153, "grad_norm": 0.09373360872268677, "learning_rate": 3.0510604626627154e-05, "loss": 0.0221, "step": 32300 }, { "epoch": 2.2586050408433986, "grad_norm": 0.022543421015143394, "learning_rate": 3.0471816672614158e-05, "loss": 0.0162, "step": 32350 }, { "epoch": 2.262095929623682, "grad_norm": 2.8079066276550293, "learning_rate": 3.0433028718601155e-05, "loss": 0.0262, "step": 32400 }, { "epoch": 2.265586818403966, "grad_norm": 0.032409828156232834, "learning_rate": 3.039424076458815e-05, "loss": 0.0325, "step": 32450 }, { "epoch": 2.269077707184249, "grad_norm": 0.03421691432595253, "learning_rate": 3.0355452810575148e-05, "loss": 0.0294, "step": 32500 }, { "epoch": 2.2725685959645325, "grad_norm": 0.3292216956615448, "learning_rate": 3.0316664856562148e-05, "loss": 0.0175, "step": 32550 }, { "epoch": 2.276059484744816, "grad_norm": 0.031362514942884445, "learning_rate": 3.0277876902549145e-05, "loss": 0.0233, "step": 32600 }, { "epoch": 2.2795503735250993, "grad_norm": 0.15582652390003204, "learning_rate": 3.023908894853614e-05, "loss": 0.0159, "step": 32650 }, { "epoch": 2.283041262305383, "grad_norm": 0.011120837181806564, "learning_rate": 3.0200300994523138e-05, "loss": 0.0194, "step": 32700 }, { "epoch": 2.2865321510856664, "grad_norm": 0.05831030383706093, "learning_rate": 3.016151304051014e-05, "loss": 0.0326, "step": 32750 }, { "epoch": 2.29002303986595, "grad_norm": 0.1667061150074005, "learning_rate": 3.0123500845577403e-05, "loss": 0.0261, "step": 32800 }, { "epoch": 2.293513928646233, "grad_norm": 0.03689422830939293, "learning_rate": 3.00847128915644e-05, "loss": 0.0211, "step": 32850 }, { "epoch": 2.297004817426517, "grad_norm": 0.08821934461593628, "learning_rate": 3.0045924937551396e-05, "loss": 0.0196, "step": 32900 }, { "epoch": 2.3004957062068003, "grad_norm": 0.026601577177643776, "learning_rate": 3.0007136983538393e-05, "loss": 0.0227, "step": 32950 }, { "epoch": 2.3039865949870837, "grad_norm": 6.168493747711182, "learning_rate": 2.9968349029525393e-05, "loss": 0.0292, "step": 33000 }, { "epoch": 2.3039865949870837, "eval_f1": 0.9086595492289442, "eval_loss": 0.22055290639400482, "eval_runtime": 17.4165, "eval_samples_per_second": 210.146, "eval_steps_per_second": 6.603, "step": 33000 }, { "epoch": 2.307477483767367, "grad_norm": 0.011802727356553078, "learning_rate": 2.992956107551239e-05, "loss": 0.0168, "step": 33050 }, { "epoch": 2.3109683725476504, "grad_norm": 0.6425135135650635, "learning_rate": 2.9890773121499386e-05, "loss": 0.0198, "step": 33100 }, { "epoch": 2.3144592613279342, "grad_norm": 0.06205461546778679, "learning_rate": 2.9851985167486387e-05, "loss": 0.0203, "step": 33150 }, { "epoch": 2.3179501501082176, "grad_norm": 0.03974612429738045, "learning_rate": 2.9813197213473387e-05, "loss": 0.0186, "step": 33200 }, { "epoch": 2.321441038888501, "grad_norm": 0.2282327115535736, "learning_rate": 2.9774409259460383e-05, "loss": 0.0312, "step": 33250 }, { "epoch": 2.3249319276687843, "grad_norm": 0.06931372731924057, "learning_rate": 2.973562130544738e-05, "loss": 0.0205, "step": 33300 }, { "epoch": 2.328422816449068, "grad_norm": 0.29048246145248413, "learning_rate": 2.9696833351434377e-05, "loss": 0.0207, "step": 33350 }, { "epoch": 2.3319137052293515, "grad_norm": 0.0800575315952301, "learning_rate": 2.965804539742138e-05, "loss": 0.0319, "step": 33400 }, { "epoch": 2.335404594009635, "grad_norm": 2.8189268112182617, "learning_rate": 2.9619257443408377e-05, "loss": 0.0298, "step": 33450 }, { "epoch": 2.338895482789918, "grad_norm": 0.019050445407629013, "learning_rate": 2.9580469489395374e-05, "loss": 0.0148, "step": 33500 }, { "epoch": 2.3423863715702016, "grad_norm": 2.150393009185791, "learning_rate": 2.954168153538237e-05, "loss": 0.0268, "step": 33550 }, { "epoch": 2.3458772603504854, "grad_norm": 0.049240246415138245, "learning_rate": 2.9502893581369374e-05, "loss": 0.0295, "step": 33600 }, { "epoch": 2.3493681491307687, "grad_norm": 0.012683813460171223, "learning_rate": 2.946410562735637e-05, "loss": 0.022, "step": 33650 }, { "epoch": 2.352859037911052, "grad_norm": 0.7272828221321106, "learning_rate": 2.9425317673343367e-05, "loss": 0.0178, "step": 33700 }, { "epoch": 2.3563499266913355, "grad_norm": 0.005036385729908943, "learning_rate": 2.9386529719330364e-05, "loss": 0.0133, "step": 33750 }, { "epoch": 2.3598408154716193, "grad_norm": 2.0128109455108643, "learning_rate": 2.9347741765317367e-05, "loss": 0.0333, "step": 33800 }, { "epoch": 2.3633317042519026, "grad_norm": 0.5187195539474487, "learning_rate": 2.9308953811304364e-05, "loss": 0.0217, "step": 33850 }, { "epoch": 2.366822593032186, "grad_norm": 0.034500740468502045, "learning_rate": 2.927016585729136e-05, "loss": 0.0232, "step": 33900 }, { "epoch": 2.3703134818124694, "grad_norm": 2.572962522506714, "learning_rate": 2.923137790327836e-05, "loss": 0.0157, "step": 33950 }, { "epoch": 2.3738043705927527, "grad_norm": 0.02022615261375904, "learning_rate": 2.9192589949265357e-05, "loss": 0.0269, "step": 34000 }, { "epoch": 2.3738043705927527, "eval_f1": 0.9133016627078385, "eval_loss": 0.217937171459198, "eval_runtime": 17.3487, "eval_samples_per_second": 210.967, "eval_steps_per_second": 6.629, "step": 34000 }, { "epoch": 2.3772952593730365, "grad_norm": 0.05787201225757599, "learning_rate": 2.9153801995252354e-05, "loss": 0.0187, "step": 34050 }, { "epoch": 2.38078614815332, "grad_norm": 0.08669839054346085, "learning_rate": 2.9115014041239354e-05, "loss": 0.0303, "step": 34100 }, { "epoch": 2.3842770369336033, "grad_norm": 3.055640697479248, "learning_rate": 2.9076226087226354e-05, "loss": 0.0314, "step": 34150 }, { "epoch": 2.3877679257138866, "grad_norm": 3.0248279571533203, "learning_rate": 2.903743813321335e-05, "loss": 0.0305, "step": 34200 }, { "epoch": 2.3912588144941704, "grad_norm": 0.35696566104888916, "learning_rate": 2.8998650179200348e-05, "loss": 0.0228, "step": 34250 }, { "epoch": 2.394749703274454, "grad_norm": 0.03579020872712135, "learning_rate": 2.8959862225187344e-05, "loss": 0.0229, "step": 34300 }, { "epoch": 2.398240592054737, "grad_norm": 0.5238425135612488, "learning_rate": 2.8921074271174348e-05, "loss": 0.0252, "step": 34350 }, { "epoch": 2.4017314808350205, "grad_norm": 0.07448386400938034, "learning_rate": 2.8882286317161344e-05, "loss": 0.0127, "step": 34400 }, { "epoch": 2.405222369615304, "grad_norm": 5.516473293304443, "learning_rate": 2.884349836314834e-05, "loss": 0.0286, "step": 34450 }, { "epoch": 2.4087132583955877, "grad_norm": 0.025009727105498314, "learning_rate": 2.8804710409135338e-05, "loss": 0.0161, "step": 34500 }, { "epoch": 2.412204147175871, "grad_norm": 1.4443304538726807, "learning_rate": 2.876592245512234e-05, "loss": 0.0327, "step": 34550 }, { "epoch": 2.4156950359561544, "grad_norm": 0.01991642825305462, "learning_rate": 2.8727134501109338e-05, "loss": 0.0251, "step": 34600 }, { "epoch": 2.419185924736438, "grad_norm": 0.244741290807724, "learning_rate": 2.8688346547096335e-05, "loss": 0.0218, "step": 34650 }, { "epoch": 2.4226768135167216, "grad_norm": 0.022914182394742966, "learning_rate": 2.864955859308333e-05, "loss": 0.0231, "step": 34700 }, { "epoch": 2.426167702297005, "grad_norm": 0.13576367497444153, "learning_rate": 2.8610770639070335e-05, "loss": 0.0207, "step": 34750 }, { "epoch": 2.4296585910772883, "grad_norm": 0.028347410261631012, "learning_rate": 2.857198268505733e-05, "loss": 0.0199, "step": 34800 }, { "epoch": 2.4331494798575717, "grad_norm": 0.04504719376564026, "learning_rate": 2.8533194731044328e-05, "loss": 0.0165, "step": 34850 }, { "epoch": 2.436640368637855, "grad_norm": 1.0004245042800903, "learning_rate": 2.8494406777031325e-05, "loss": 0.0178, "step": 34900 }, { "epoch": 2.4401312574181384, "grad_norm": 0.050408054143190384, "learning_rate": 2.8455618823018325e-05, "loss": 0.0224, "step": 34950 }, { "epoch": 2.443622146198422, "grad_norm": 0.011280260048806667, "learning_rate": 2.841683086900532e-05, "loss": 0.0157, "step": 35000 }, { "epoch": 2.443622146198422, "eval_f1": 0.9306651634723788, "eval_loss": 0.19316384196281433, "eval_runtime": 17.1785, "eval_samples_per_second": 213.056, "eval_steps_per_second": 6.694, "step": 35000 }, { "epoch": 2.4471130349787056, "grad_norm": 2.7738091945648193, "learning_rate": 2.837804291499232e-05, "loss": 0.0164, "step": 35050 }, { "epoch": 2.450603923758989, "grad_norm": 0.28194230794906616, "learning_rate": 2.834003072005958e-05, "loss": 0.0413, "step": 35100 }, { "epoch": 2.4540948125392723, "grad_norm": 0.0410354882478714, "learning_rate": 2.8301242766046576e-05, "loss": 0.0217, "step": 35150 }, { "epoch": 2.457585701319556, "grad_norm": 0.07318081706762314, "learning_rate": 2.826245481203358e-05, "loss": 0.0383, "step": 35200 }, { "epoch": 2.4610765900998395, "grad_norm": 0.08617233484983444, "learning_rate": 2.8223666858020576e-05, "loss": 0.0301, "step": 35250 }, { "epoch": 2.464567478880123, "grad_norm": 2.5360500812530518, "learning_rate": 2.8184878904007573e-05, "loss": 0.03, "step": 35300 }, { "epoch": 2.468058367660406, "grad_norm": 0.01551117654889822, "learning_rate": 2.814609094999457e-05, "loss": 0.0231, "step": 35350 }, { "epoch": 2.4715492564406896, "grad_norm": 0.09869514405727386, "learning_rate": 2.810730299598157e-05, "loss": 0.0238, "step": 35400 }, { "epoch": 2.4750401452209734, "grad_norm": 4.334355354309082, "learning_rate": 2.8068515041968567e-05, "loss": 0.0187, "step": 35450 }, { "epoch": 2.4785310340012567, "grad_norm": 0.08748903125524521, "learning_rate": 2.8029727087955567e-05, "loss": 0.0222, "step": 35500 }, { "epoch": 2.48202192278154, "grad_norm": 0.1268267035484314, "learning_rate": 2.7990939133942563e-05, "loss": 0.0226, "step": 35550 }, { "epoch": 2.4855128115618235, "grad_norm": 0.12871848046779633, "learning_rate": 2.7952151179929563e-05, "loss": 0.0266, "step": 35600 }, { "epoch": 2.4890037003421073, "grad_norm": 0.0586019866168499, "learning_rate": 2.791336322591656e-05, "loss": 0.0184, "step": 35650 }, { "epoch": 2.4924945891223906, "grad_norm": 4.743653297424316, "learning_rate": 2.7874575271903557e-05, "loss": 0.0301, "step": 35700 }, { "epoch": 2.495985477902674, "grad_norm": 0.12895400822162628, "learning_rate": 2.7835787317890553e-05, "loss": 0.018, "step": 35750 }, { "epoch": 2.4994763666829574, "grad_norm": 0.08446650207042694, "learning_rate": 2.7796999363877557e-05, "loss": 0.0146, "step": 35800 }, { "epoch": 2.5029672554632407, "grad_norm": 0.2824804186820984, "learning_rate": 2.7758211409864554e-05, "loss": 0.0262, "step": 35850 }, { "epoch": 2.5064581442435245, "grad_norm": 0.1400439441204071, "learning_rate": 2.771942345585155e-05, "loss": 0.0156, "step": 35900 }, { "epoch": 2.509949033023808, "grad_norm": 0.005719928536564112, "learning_rate": 2.7680635501838547e-05, "loss": 0.0163, "step": 35950 }, { "epoch": 2.5134399218040913, "grad_norm": 6.512287616729736, "learning_rate": 2.764184754782555e-05, "loss": 0.0175, "step": 36000 }, { "epoch": 2.5134399218040913, "eval_f1": 0.9277177006260672, "eval_loss": 0.20059193670749664, "eval_runtime": 17.1425, "eval_samples_per_second": 213.504, "eval_steps_per_second": 6.708, "step": 36000 }, { "epoch": 2.5169308105843746, "grad_norm": 1.4769384860992432, "learning_rate": 2.7603059593812547e-05, "loss": 0.0345, "step": 36050 }, { "epoch": 2.5204216993646584, "grad_norm": 0.19504880905151367, "learning_rate": 2.7564271639799544e-05, "loss": 0.0226, "step": 36100 }, { "epoch": 2.523912588144942, "grad_norm": 0.03159239888191223, "learning_rate": 2.7525483685786547e-05, "loss": 0.0238, "step": 36150 }, { "epoch": 2.527403476925225, "grad_norm": 0.06540489941835403, "learning_rate": 2.7486695731773544e-05, "loss": 0.0253, "step": 36200 }, { "epoch": 2.5308943657055085, "grad_norm": 0.23230262100696564, "learning_rate": 2.744790777776054e-05, "loss": 0.0158, "step": 36250 }, { "epoch": 2.534385254485792, "grad_norm": 0.030431071296334267, "learning_rate": 2.7409119823747537e-05, "loss": 0.0284, "step": 36300 }, { "epoch": 2.5378761432660757, "grad_norm": 7.270301342010498, "learning_rate": 2.7370331869734537e-05, "loss": 0.0228, "step": 36350 }, { "epoch": 2.541367032046359, "grad_norm": 0.03877014294266701, "learning_rate": 2.7331543915721538e-05, "loss": 0.0149, "step": 36400 }, { "epoch": 2.5448579208266424, "grad_norm": 0.07648701220750809, "learning_rate": 2.7292755961708534e-05, "loss": 0.0195, "step": 36450 }, { "epoch": 2.548348809606926, "grad_norm": 0.25087717175483704, "learning_rate": 2.725396800769553e-05, "loss": 0.0101, "step": 36500 }, { "epoch": 2.5518396983872096, "grad_norm": 0.12214798480272293, "learning_rate": 2.721518005368253e-05, "loss": 0.0195, "step": 36550 }, { "epoch": 2.555330587167493, "grad_norm": 0.00542970048263669, "learning_rate": 2.7176392099669528e-05, "loss": 0.0201, "step": 36600 }, { "epoch": 2.5588214759477763, "grad_norm": 4.020162105560303, "learning_rate": 2.7137604145656524e-05, "loss": 0.0274, "step": 36650 }, { "epoch": 2.5623123647280597, "grad_norm": 0.08753184229135513, "learning_rate": 2.709881619164352e-05, "loss": 0.0106, "step": 36700 }, { "epoch": 2.565803253508343, "grad_norm": 3.764026165008545, "learning_rate": 2.7060028237630524e-05, "loss": 0.028, "step": 36750 }, { "epoch": 2.569294142288627, "grad_norm": 0.01225785631686449, "learning_rate": 2.702124028361752e-05, "loss": 0.0167, "step": 36800 }, { "epoch": 2.57278503106891, "grad_norm": 0.028353124856948853, "learning_rate": 2.6982452329604518e-05, "loss": 0.0252, "step": 36850 }, { "epoch": 2.5762759198491936, "grad_norm": 0.026566436514258385, "learning_rate": 2.6943664375591515e-05, "loss": 0.0178, "step": 36900 }, { "epoch": 2.579766808629477, "grad_norm": 0.2791537344455719, "learning_rate": 2.6904876421578518e-05, "loss": 0.0289, "step": 36950 }, { "epoch": 2.5832576974097607, "grad_norm": 0.7140030860900879, "learning_rate": 2.6866088467565515e-05, "loss": 0.0155, "step": 37000 }, { "epoch": 2.5832576974097607, "eval_f1": 0.879188441438672, "eval_loss": 0.26918989419937134, "eval_runtime": 27.051, "eval_samples_per_second": 135.3, "eval_steps_per_second": 4.251, "step": 37000 }, { "epoch": 2.586748586190044, "grad_norm": 0.007281581871211529, "learning_rate": 2.682730051355251e-05, "loss": 0.0112, "step": 37050 }, { "epoch": 2.5902394749703275, "grad_norm": 0.3017968535423279, "learning_rate": 2.6788512559539508e-05, "loss": 0.0345, "step": 37100 }, { "epoch": 2.593730363750611, "grad_norm": 0.06797858327627182, "learning_rate": 2.674972460552651e-05, "loss": 0.01, "step": 37150 }, { "epoch": 2.597221252530894, "grad_norm": 0.012963666580617428, "learning_rate": 2.6710936651513508e-05, "loss": 0.0152, "step": 37200 }, { "epoch": 2.6007121413111776, "grad_norm": 0.07589650899171829, "learning_rate": 2.6672148697500505e-05, "loss": 0.0245, "step": 37250 }, { "epoch": 2.6042030300914614, "grad_norm": 0.028516370803117752, "learning_rate": 2.6633360743487505e-05, "loss": 0.0132, "step": 37300 }, { "epoch": 2.6076939188717447, "grad_norm": 0.24892716109752655, "learning_rate": 2.6594572789474505e-05, "loss": 0.0161, "step": 37350 }, { "epoch": 2.611184807652028, "grad_norm": 0.04786977171897888, "learning_rate": 2.6555784835461502e-05, "loss": 0.014, "step": 37400 }, { "epoch": 2.614675696432312, "grad_norm": 0.12335788458585739, "learning_rate": 2.65169968814485e-05, "loss": 0.029, "step": 37450 }, { "epoch": 2.6181665852125953, "grad_norm": 0.18296684324741364, "learning_rate": 2.64782089274355e-05, "loss": 0.0216, "step": 37500 }, { "epoch": 2.6216574739928786, "grad_norm": 0.020824022591114044, "learning_rate": 2.6439420973422495e-05, "loss": 0.0098, "step": 37550 }, { "epoch": 2.625148362773162, "grad_norm": 0.3721611797809601, "learning_rate": 2.6400633019409492e-05, "loss": 0.0104, "step": 37600 }, { "epoch": 2.6286392515534454, "grad_norm": 0.0648498460650444, "learning_rate": 2.636184506539649e-05, "loss": 0.029, "step": 37650 }, { "epoch": 2.6321301403337287, "grad_norm": 0.013140054419636726, "learning_rate": 2.6323057111383492e-05, "loss": 0.0175, "step": 37700 }, { "epoch": 2.6356210291140125, "grad_norm": 0.3959859609603882, "learning_rate": 2.628426915737049e-05, "loss": 0.0287, "step": 37750 }, { "epoch": 2.639111917894296, "grad_norm": 1.1734418869018555, "learning_rate": 2.6245481203357485e-05, "loss": 0.0185, "step": 37800 }, { "epoch": 2.6426028066745793, "grad_norm": 13.356185913085938, "learning_rate": 2.6206693249344482e-05, "loss": 0.009, "step": 37850 }, { "epoch": 2.646093695454863, "grad_norm": 6.345465183258057, "learning_rate": 2.6167905295331486e-05, "loss": 0.0185, "step": 37900 }, { "epoch": 2.6495845842351464, "grad_norm": 0.3218725025653839, "learning_rate": 2.6129117341318482e-05, "loss": 0.0325, "step": 37950 }, { "epoch": 2.65307547301543, "grad_norm": 0.013729876838624477, "learning_rate": 2.609032938730548e-05, "loss": 0.0147, "step": 38000 }, { "epoch": 2.65307547301543, "eval_f1": 0.9022556390977443, "eval_loss": 0.2909112870693207, "eval_runtime": 23.2176, "eval_samples_per_second": 157.639, "eval_steps_per_second": 4.953, "step": 38000 }, { "epoch": 2.656566361795713, "grad_norm": 0.33415481448173523, "learning_rate": 2.6051541433292476e-05, "loss": 0.0259, "step": 38050 }, { "epoch": 2.6600572505759965, "grad_norm": 0.009586818516254425, "learning_rate": 2.601275347927948e-05, "loss": 0.0227, "step": 38100 }, { "epoch": 2.66354813935628, "grad_norm": 0.024410735815763474, "learning_rate": 2.5973965525266476e-05, "loss": 0.0259, "step": 38150 }, { "epoch": 2.6670390281365637, "grad_norm": 0.03022683411836624, "learning_rate": 2.5935177571253472e-05, "loss": 0.0122, "step": 38200 }, { "epoch": 2.670529916916847, "grad_norm": 3.5583574771881104, "learning_rate": 2.589638961724047e-05, "loss": 0.0323, "step": 38250 }, { "epoch": 2.6740208056971304, "grad_norm": 0.07743709534406662, "learning_rate": 2.5857601663227473e-05, "loss": 0.0185, "step": 38300 }, { "epoch": 2.677511694477414, "grad_norm": 0.18533405661582947, "learning_rate": 2.581881370921447e-05, "loss": 0.0277, "step": 38350 }, { "epoch": 2.6810025832576976, "grad_norm": 0.05966994911432266, "learning_rate": 2.5780025755201466e-05, "loss": 0.0199, "step": 38400 }, { "epoch": 2.684493472037981, "grad_norm": 0.028219763189554214, "learning_rate": 2.5741237801188463e-05, "loss": 0.0184, "step": 38450 }, { "epoch": 2.6879843608182643, "grad_norm": 0.026486072689294815, "learning_rate": 2.5702449847175463e-05, "loss": 0.0152, "step": 38500 }, { "epoch": 2.6914752495985477, "grad_norm": 0.4646790623664856, "learning_rate": 2.566366189316246e-05, "loss": 0.0113, "step": 38550 }, { "epoch": 2.694966138378831, "grad_norm": 0.016069358214735985, "learning_rate": 2.5624873939149456e-05, "loss": 0.0272, "step": 38600 }, { "epoch": 2.698457027159115, "grad_norm": 0.1055779829621315, "learning_rate": 2.558608598513646e-05, "loss": 0.0372, "step": 38650 }, { "epoch": 2.701947915939398, "grad_norm": 0.08721558004617691, "learning_rate": 2.5547298031123456e-05, "loss": 0.0299, "step": 38700 }, { "epoch": 2.7054388047196816, "grad_norm": 4.360162258148193, "learning_rate": 2.5508510077110453e-05, "loss": 0.0159, "step": 38750 }, { "epoch": 2.708929693499965, "grad_norm": 0.04794733598828316, "learning_rate": 2.546972212309745e-05, "loss": 0.0152, "step": 38800 }, { "epoch": 2.7124205822802487, "grad_norm": 1.7673709392547607, "learning_rate": 2.5430934169084453e-05, "loss": 0.0284, "step": 38850 }, { "epoch": 2.715911471060532, "grad_norm": 0.17031490802764893, "learning_rate": 2.539214621507145e-05, "loss": 0.0154, "step": 38900 }, { "epoch": 2.7194023598408155, "grad_norm": 0.010257457382977009, "learning_rate": 2.5353358261058446e-05, "loss": 0.0092, "step": 38950 }, { "epoch": 2.722893248621099, "grad_norm": 0.011103369295597076, "learning_rate": 2.5314570307045443e-05, "loss": 0.017, "step": 39000 }, { "epoch": 2.722893248621099, "eval_f1": 0.9230318993268949, "eval_loss": 0.2264816164970398, "eval_runtime": 23.7732, "eval_samples_per_second": 153.955, "eval_steps_per_second": 4.837, "step": 39000 }, { "epoch": 2.726384137401382, "grad_norm": 0.024749957025051117, "learning_rate": 2.5275782353032447e-05, "loss": 0.025, "step": 39050 }, { "epoch": 2.729875026181666, "grad_norm": 0.016311727464199066, "learning_rate": 2.5236994399019443e-05, "loss": 0.019, "step": 39100 }, { "epoch": 2.7333659149619494, "grad_norm": 2.8876538276672363, "learning_rate": 2.519820644500644e-05, "loss": 0.0152, "step": 39150 }, { "epoch": 2.7368568037422327, "grad_norm": 0.019060427322983742, "learning_rate": 2.5159418490993437e-05, "loss": 0.0122, "step": 39200 }, { "epoch": 2.740347692522516, "grad_norm": 0.011108928360044956, "learning_rate": 2.5121406296060695e-05, "loss": 0.017, "step": 39250 }, { "epoch": 2.7438385813028, "grad_norm": 0.09083431214094162, "learning_rate": 2.508261834204769e-05, "loss": 0.0295, "step": 39300 }, { "epoch": 2.7473294700830833, "grad_norm": 0.005036790389567614, "learning_rate": 2.5043830388034695e-05, "loss": 0.0068, "step": 39350 }, { "epoch": 2.7508203588633666, "grad_norm": 0.047138676047325134, "learning_rate": 2.500504243402169e-05, "loss": 0.0169, "step": 39400 }, { "epoch": 2.75431124764365, "grad_norm": 0.696245551109314, "learning_rate": 2.4966254480008688e-05, "loss": 0.03, "step": 39450 }, { "epoch": 2.7578021364239333, "grad_norm": 2.528618097305298, "learning_rate": 2.4927466525995688e-05, "loss": 0.0273, "step": 39500 }, { "epoch": 2.761293025204217, "grad_norm": 0.01160145178437233, "learning_rate": 2.488867857198269e-05, "loss": 0.0171, "step": 39550 }, { "epoch": 2.7647839139845005, "grad_norm": 0.00931859016418457, "learning_rate": 2.4849890617969685e-05, "loss": 0.019, "step": 39600 }, { "epoch": 2.768274802764784, "grad_norm": 0.011263793334364891, "learning_rate": 2.4811102663956685e-05, "loss": 0.0227, "step": 39650 }, { "epoch": 2.7717656915450672, "grad_norm": 0.01920177787542343, "learning_rate": 2.4772314709943682e-05, "loss": 0.0195, "step": 39700 }, { "epoch": 2.775256580325351, "grad_norm": 0.0142729626968503, "learning_rate": 2.4733526755930682e-05, "loss": 0.0232, "step": 39750 }, { "epoch": 2.7787474691056344, "grad_norm": 0.10750947892665863, "learning_rate": 2.469473880191768e-05, "loss": 0.0152, "step": 39800 }, { "epoch": 2.782238357885918, "grad_norm": 0.018952248618006706, "learning_rate": 2.4655950847904675e-05, "loss": 0.0113, "step": 39850 }, { "epoch": 2.785729246666201, "grad_norm": 0.019632501527667046, "learning_rate": 2.4617162893891675e-05, "loss": 0.021, "step": 39900 }, { "epoch": 2.7892201354464845, "grad_norm": 0.0779823437333107, "learning_rate": 2.4578374939878672e-05, "loss": 0.025, "step": 39950 }, { "epoch": 2.792711024226768, "grad_norm": 0.02033323608338833, "learning_rate": 2.453958698586567e-05, "loss": 0.0235, "step": 40000 }, { "epoch": 2.792711024226768, "eval_f1": 0.9247813411078717, "eval_loss": 0.19202525913715363, "eval_runtime": 24.1493, "eval_samples_per_second": 151.557, "eval_steps_per_second": 4.762, "step": 40000 }, { "epoch": 2.7962019130070517, "grad_norm": 0.711127758026123, "learning_rate": 2.450079903185267e-05, "loss": 0.0281, "step": 40050 }, { "epoch": 2.799692801787335, "grad_norm": 0.013284056447446346, "learning_rate": 2.4462011077839665e-05, "loss": 0.0338, "step": 40100 }, { "epoch": 2.8031836905676184, "grad_norm": 0.11050266772508621, "learning_rate": 2.4423223123826666e-05, "loss": 0.0202, "step": 40150 }, { "epoch": 2.806674579347902, "grad_norm": 0.056580349802970886, "learning_rate": 2.4384435169813662e-05, "loss": 0.0235, "step": 40200 }, { "epoch": 2.8101654681281856, "grad_norm": 0.03578386455774307, "learning_rate": 2.4346422974880924e-05, "loss": 0.0262, "step": 40250 }, { "epoch": 2.813656356908469, "grad_norm": 0.02802601084113121, "learning_rate": 2.430763502086792e-05, "loss": 0.0245, "step": 40300 }, { "epoch": 2.8171472456887523, "grad_norm": 1.0409111976623535, "learning_rate": 2.426884706685492e-05, "loss": 0.0199, "step": 40350 }, { "epoch": 2.8206381344690357, "grad_norm": 0.031876832246780396, "learning_rate": 2.4230059112841917e-05, "loss": 0.0271, "step": 40400 }, { "epoch": 2.824129023249319, "grad_norm": 0.05730342119932175, "learning_rate": 2.4191271158828914e-05, "loss": 0.0203, "step": 40450 }, { "epoch": 2.827619912029603, "grad_norm": 4.108118534088135, "learning_rate": 2.4152483204815914e-05, "loss": 0.023, "step": 40500 }, { "epoch": 2.831110800809886, "grad_norm": 0.18447378277778625, "learning_rate": 2.411369525080291e-05, "loss": 0.0159, "step": 40550 }, { "epoch": 2.8346016895901696, "grad_norm": 0.10659290850162506, "learning_rate": 2.407490729678991e-05, "loss": 0.016, "step": 40600 }, { "epoch": 2.8380925783704534, "grad_norm": 0.01589849404990673, "learning_rate": 2.4036119342776907e-05, "loss": 0.0209, "step": 40650 }, { "epoch": 2.8415834671507367, "grad_norm": 0.14199930429458618, "learning_rate": 2.3997331388763907e-05, "loss": 0.0162, "step": 40700 }, { "epoch": 2.84507435593102, "grad_norm": 3.8175432682037354, "learning_rate": 2.3958543434750904e-05, "loss": 0.0339, "step": 40750 }, { "epoch": 2.8485652447113035, "grad_norm": 4.6632537841796875, "learning_rate": 2.3919755480737904e-05, "loss": 0.0371, "step": 40800 }, { "epoch": 2.852056133491587, "grad_norm": 3.703428268432617, "learning_rate": 2.38809675267249e-05, "loss": 0.013, "step": 40850 }, { "epoch": 2.85554702227187, "grad_norm": 0.14800503849983215, "learning_rate": 2.38421795727119e-05, "loss": 0.0126, "step": 40900 }, { "epoch": 2.859037911052154, "grad_norm": 6.090341567993164, "learning_rate": 2.3803391618698898e-05, "loss": 0.0239, "step": 40950 }, { "epoch": 2.8625287998324374, "grad_norm": 0.4346800744533539, "learning_rate": 2.3764603664685898e-05, "loss": 0.0238, "step": 41000 }, { "epoch": 2.8625287998324374, "eval_f1": 0.922173274596182, "eval_loss": 0.2260962426662445, "eval_runtime": 26.009, "eval_samples_per_second": 140.721, "eval_steps_per_second": 4.422, "step": 41000 }, { "epoch": 2.8660196886127207, "grad_norm": 3.542853593826294, "learning_rate": 2.3725815710672894e-05, "loss": 0.014, "step": 41050 }, { "epoch": 2.8695105773930045, "grad_norm": 4.3590779304504395, "learning_rate": 2.3687027756659894e-05, "loss": 0.0104, "step": 41100 }, { "epoch": 2.873001466173288, "grad_norm": 0.4096594452857971, "learning_rate": 2.364823980264689e-05, "loss": 0.0247, "step": 41150 }, { "epoch": 2.8764923549535713, "grad_norm": 0.01314621139317751, "learning_rate": 2.360945184863389e-05, "loss": 0.0285, "step": 41200 }, { "epoch": 2.8799832437338546, "grad_norm": 0.2991047501564026, "learning_rate": 2.3570663894620888e-05, "loss": 0.0205, "step": 41250 }, { "epoch": 2.883474132514138, "grad_norm": 5.858856678009033, "learning_rate": 2.3531875940607888e-05, "loss": 0.0201, "step": 41300 }, { "epoch": 2.8869650212944213, "grad_norm": 0.10207907110452652, "learning_rate": 2.3493087986594885e-05, "loss": 0.0238, "step": 41350 }, { "epoch": 2.890455910074705, "grad_norm": 0.19928042590618134, "learning_rate": 2.345430003258188e-05, "loss": 0.0167, "step": 41400 }, { "epoch": 2.8939467988549885, "grad_norm": 0.028837937861680984, "learning_rate": 2.341551207856888e-05, "loss": 0.0185, "step": 41450 }, { "epoch": 2.897437687635272, "grad_norm": 0.019361231476068497, "learning_rate": 2.3376724124555878e-05, "loss": 0.0158, "step": 41500 }, { "epoch": 2.9009285764155552, "grad_norm": 4.604653358459473, "learning_rate": 2.3337936170542875e-05, "loss": 0.0186, "step": 41550 }, { "epoch": 2.904419465195839, "grad_norm": 1.2114059925079346, "learning_rate": 2.3299148216529875e-05, "loss": 0.0311, "step": 41600 }, { "epoch": 2.9079103539761224, "grad_norm": 0.021827546879649162, "learning_rate": 2.326036026251687e-05, "loss": 0.0109, "step": 41650 }, { "epoch": 2.9114012427564058, "grad_norm": 2.5706710815429688, "learning_rate": 2.322157230850387e-05, "loss": 0.0106, "step": 41700 }, { "epoch": 2.914892131536689, "grad_norm": 0.039485763758420944, "learning_rate": 2.3182784354490868e-05, "loss": 0.0163, "step": 41750 }, { "epoch": 2.9183830203169725, "grad_norm": 0.16371645033359528, "learning_rate": 2.314399640047787e-05, "loss": 0.0101, "step": 41800 }, { "epoch": 2.9218739090972563, "grad_norm": 1.083182692527771, "learning_rate": 2.310520844646487e-05, "loss": 0.0193, "step": 41850 }, { "epoch": 2.9253647978775397, "grad_norm": 0.038459427654743195, "learning_rate": 2.3066420492451865e-05, "loss": 0.0191, "step": 41900 }, { "epoch": 2.928855686657823, "grad_norm": 0.054930657148361206, "learning_rate": 2.3027632538438865e-05, "loss": 0.0258, "step": 41950 }, { "epoch": 2.9323465754381064, "grad_norm": 0.01878916658461094, "learning_rate": 2.2988844584425862e-05, "loss": 0.0291, "step": 42000 }, { "epoch": 2.9323465754381064, "eval_f1": 0.9320612893899971, "eval_loss": 0.17577876150608063, "eval_runtime": 28.3368, "eval_samples_per_second": 129.161, "eval_steps_per_second": 4.058, "step": 42000 }, { "epoch": 2.93583746421839, "grad_norm": 1.7324750423431396, "learning_rate": 2.2950056630412862e-05, "loss": 0.0294, "step": 42050 }, { "epoch": 2.9393283529986736, "grad_norm": 0.13416598737239838, "learning_rate": 2.291126867639986e-05, "loss": 0.032, "step": 42100 }, { "epoch": 2.942819241778957, "grad_norm": 7.673727035522461, "learning_rate": 2.287248072238686e-05, "loss": 0.0158, "step": 42150 }, { "epoch": 2.9463101305592403, "grad_norm": 0.4483250677585602, "learning_rate": 2.2833692768373855e-05, "loss": 0.0229, "step": 42200 }, { "epoch": 2.9498010193395237, "grad_norm": 0.10611013323068619, "learning_rate": 2.2794904814360855e-05, "loss": 0.0146, "step": 42250 }, { "epoch": 2.9532919081198075, "grad_norm": 6.6183600425720215, "learning_rate": 2.2756116860347852e-05, "loss": 0.006, "step": 42300 }, { "epoch": 2.956782796900091, "grad_norm": 0.008068676106631756, "learning_rate": 2.271732890633485e-05, "loss": 0.0208, "step": 42350 }, { "epoch": 2.960273685680374, "grad_norm": 0.01248723454773426, "learning_rate": 2.267854095232185e-05, "loss": 0.0412, "step": 42400 }, { "epoch": 2.9637645744606576, "grad_norm": 1.5417126417160034, "learning_rate": 2.2639752998308846e-05, "loss": 0.0363, "step": 42450 }, { "epoch": 2.9672554632409414, "grad_norm": 0.12593162059783936, "learning_rate": 2.2600965044295842e-05, "loss": 0.0107, "step": 42500 }, { "epoch": 2.9707463520212247, "grad_norm": 2.7952985763549805, "learning_rate": 2.2562177090282842e-05, "loss": 0.0231, "step": 42550 }, { "epoch": 2.974237240801508, "grad_norm": 0.021743256598711014, "learning_rate": 2.252338913626984e-05, "loss": 0.0238, "step": 42600 }, { "epoch": 2.9777281295817914, "grad_norm": 0.01719530299305916, "learning_rate": 2.248460118225684e-05, "loss": 0.0104, "step": 42650 }, { "epoch": 2.981219018362075, "grad_norm": 0.009256424382328987, "learning_rate": 2.2445813228243836e-05, "loss": 0.01, "step": 42700 }, { "epoch": 2.984709907142358, "grad_norm": 0.010182956233620644, "learning_rate": 2.2407025274230836e-05, "loss": 0.0162, "step": 42750 }, { "epoch": 2.988200795922642, "grad_norm": 0.46795710921287537, "learning_rate": 2.2368237320217833e-05, "loss": 0.035, "step": 42800 }, { "epoch": 2.9916916847029253, "grad_norm": 0.037175968289375305, "learning_rate": 2.2329449366204833e-05, "loss": 0.0212, "step": 42850 }, { "epoch": 2.9951825734832087, "grad_norm": 0.2701772451400757, "learning_rate": 2.229066141219183e-05, "loss": 0.0215, "step": 42900 }, { "epoch": 2.9986734622634925, "grad_norm": 0.3241135776042938, "learning_rate": 2.225187345817883e-05, "loss": 0.0211, "step": 42950 }, { "epoch": 3.002164351043776, "grad_norm": 0.25792551040649414, "learning_rate": 2.221308550416583e-05, "loss": 0.018, "step": 43000 }, { "epoch": 3.002164351043776, "eval_f1": 0.9154761904761904, "eval_loss": 0.2519144117832184, "eval_runtime": 24.3041, "eval_samples_per_second": 150.592, "eval_steps_per_second": 4.732, "step": 43000 }, { "epoch": 3.0056552398240592, "grad_norm": 0.2332383692264557, "learning_rate": 2.2174297550152826e-05, "loss": 0.0236, "step": 43050 }, { "epoch": 3.0091461286043426, "grad_norm": 0.01284001674503088, "learning_rate": 2.2135509596139826e-05, "loss": 0.0154, "step": 43100 }, { "epoch": 3.012637017384626, "grad_norm": 0.05416431277990341, "learning_rate": 2.2096721642126823e-05, "loss": 0.0115, "step": 43150 }, { "epoch": 3.0161279061649098, "grad_norm": 0.12467977404594421, "learning_rate": 2.2057933688113823e-05, "loss": 0.0176, "step": 43200 }, { "epoch": 3.019618794945193, "grad_norm": 0.5407397150993347, "learning_rate": 2.201914573410082e-05, "loss": 0.0206, "step": 43250 }, { "epoch": 3.0231096837254765, "grad_norm": 0.13258038461208344, "learning_rate": 2.1980357780087816e-05, "loss": 0.0112, "step": 43300 }, { "epoch": 3.02660057250576, "grad_norm": 0.05076462775468826, "learning_rate": 2.1941569826074816e-05, "loss": 0.0146, "step": 43350 }, { "epoch": 3.0300914612860432, "grad_norm": 0.05165479704737663, "learning_rate": 2.1902781872061813e-05, "loss": 0.011, "step": 43400 }, { "epoch": 3.033582350066327, "grad_norm": 0.04711933061480522, "learning_rate": 2.186399391804881e-05, "loss": 0.0207, "step": 43450 }, { "epoch": 3.0370732388466104, "grad_norm": 0.08267899602651596, "learning_rate": 2.182520596403581e-05, "loss": 0.0277, "step": 43500 }, { "epoch": 3.0405641276268938, "grad_norm": 0.05017954856157303, "learning_rate": 2.1786418010022807e-05, "loss": 0.0162, "step": 43550 }, { "epoch": 3.044055016407177, "grad_norm": 0.008251439779996872, "learning_rate": 2.1747630056009807e-05, "loss": 0.0148, "step": 43600 }, { "epoch": 3.047545905187461, "grad_norm": 0.03382747620344162, "learning_rate": 2.1708842101996803e-05, "loss": 0.0128, "step": 43650 }, { "epoch": 3.0510367939677443, "grad_norm": 0.008427140302956104, "learning_rate": 2.1670054147983803e-05, "loss": 0.0173, "step": 43700 }, { "epoch": 3.0545276827480277, "grad_norm": 11.583622932434082, "learning_rate": 2.16312661939708e-05, "loss": 0.0258, "step": 43750 }, { "epoch": 3.058018571528311, "grad_norm": 0.026552023366093636, "learning_rate": 2.15924782399578e-05, "loss": 0.0326, "step": 43800 }, { "epoch": 3.0615094603085944, "grad_norm": 0.08250391483306885, "learning_rate": 2.1553690285944797e-05, "loss": 0.0064, "step": 43850 }, { "epoch": 3.065000349088878, "grad_norm": 0.06599225103855133, "learning_rate": 2.1514902331931797e-05, "loss": 0.0316, "step": 43900 }, { "epoch": 3.0684912378691616, "grad_norm": 0.062200937420129776, "learning_rate": 2.1476114377918794e-05, "loss": 0.0192, "step": 43950 }, { "epoch": 3.071982126649445, "grad_norm": 7.155509948730469, "learning_rate": 2.1437326423905794e-05, "loss": 0.0201, "step": 44000 }, { "epoch": 3.071982126649445, "eval_f1": 0.9021739130434783, "eval_loss": 0.27857547998428345, "eval_runtime": 25.994, "eval_samples_per_second": 140.802, "eval_steps_per_second": 4.424, "step": 44000 }, { "epoch": 3.0754730154297283, "grad_norm": 0.014235005713999271, "learning_rate": 2.139853846989279e-05, "loss": 0.022, "step": 44050 }, { "epoch": 3.0789639042100116, "grad_norm": 0.05011603608727455, "learning_rate": 2.135975051587979e-05, "loss": 0.0107, "step": 44100 }, { "epoch": 3.0824547929902955, "grad_norm": 1.0543460845947266, "learning_rate": 2.1320962561866787e-05, "loss": 0.0196, "step": 44150 }, { "epoch": 3.085945681770579, "grad_norm": 0.09093031287193298, "learning_rate": 2.1282174607853787e-05, "loss": 0.0145, "step": 44200 }, { "epoch": 3.089436570550862, "grad_norm": 0.04965496063232422, "learning_rate": 2.1243386653840784e-05, "loss": 0.0161, "step": 44250 }, { "epoch": 3.0929274593311455, "grad_norm": 2.1972270011901855, "learning_rate": 2.1204598699827784e-05, "loss": 0.0289, "step": 44300 }, { "epoch": 3.0964183481114294, "grad_norm": 0.02106853947043419, "learning_rate": 2.116581074581478e-05, "loss": 0.0303, "step": 44350 }, { "epoch": 3.0999092368917127, "grad_norm": 2.8582568168640137, "learning_rate": 2.1127022791801777e-05, "loss": 0.0177, "step": 44400 }, { "epoch": 3.103400125671996, "grad_norm": 0.010883578099310398, "learning_rate": 2.1088234837788777e-05, "loss": 0.0216, "step": 44450 }, { "epoch": 3.1068910144522794, "grad_norm": 0.20962642133235931, "learning_rate": 2.1049446883775774e-05, "loss": 0.0093, "step": 44500 }, { "epoch": 3.110381903232563, "grad_norm": 2.5480289459228516, "learning_rate": 2.1010658929762774e-05, "loss": 0.0104, "step": 44550 }, { "epoch": 3.1138727920128466, "grad_norm": 0.05912920832633972, "learning_rate": 2.097187097574977e-05, "loss": 0.0089, "step": 44600 }, { "epoch": 3.11736368079313, "grad_norm": 0.006363342050462961, "learning_rate": 2.0933858780817032e-05, "loss": 0.0145, "step": 44650 }, { "epoch": 3.1208545695734133, "grad_norm": 0.09209737926721573, "learning_rate": 2.0895846585884287e-05, "loss": 0.0143, "step": 44700 }, { "epoch": 3.1243454583536967, "grad_norm": 0.08498357236385345, "learning_rate": 2.0857058631871287e-05, "loss": 0.0236, "step": 44750 }, { "epoch": 3.1278363471339805, "grad_norm": 0.20750215649604797, "learning_rate": 2.0818270677858284e-05, "loss": 0.0222, "step": 44800 }, { "epoch": 3.131327235914264, "grad_norm": 1.05506432056427, "learning_rate": 2.0779482723845284e-05, "loss": 0.019, "step": 44850 }, { "epoch": 3.1348181246945472, "grad_norm": 5.7602763175964355, "learning_rate": 2.074069476983228e-05, "loss": 0.0324, "step": 44900 }, { "epoch": 3.1383090134748306, "grad_norm": 3.679661989212036, "learning_rate": 2.070190681581928e-05, "loss": 0.0235, "step": 44950 }, { "epoch": 3.141799902255114, "grad_norm": 0.003911898471415043, "learning_rate": 2.0663118861806277e-05, "loss": 0.0212, "step": 45000 }, { "epoch": 3.141799902255114, "eval_f1": 0.9351320321469575, "eval_loss": 0.1995823234319687, "eval_runtime": 24.8557, "eval_samples_per_second": 147.25, "eval_steps_per_second": 4.627, "step": 45000 }, { "epoch": 3.1452907910353978, "grad_norm": 0.02615140751004219, "learning_rate": 2.0624330907793277e-05, "loss": 0.0125, "step": 45050 }, { "epoch": 3.148781679815681, "grad_norm": 0.01764095388352871, "learning_rate": 2.0585542953780277e-05, "loss": 0.0036, "step": 45100 }, { "epoch": 3.1522725685959645, "grad_norm": 0.04237894341349602, "learning_rate": 2.0546754999767274e-05, "loss": 0.0299, "step": 45150 }, { "epoch": 3.155763457376248, "grad_norm": 0.008063646033406258, "learning_rate": 2.0507967045754274e-05, "loss": 0.022, "step": 45200 }, { "epoch": 3.1592543461565317, "grad_norm": 0.016073619946837425, "learning_rate": 2.046917909174127e-05, "loss": 0.0081, "step": 45250 }, { "epoch": 3.162745234936815, "grad_norm": 1.8636724948883057, "learning_rate": 2.0430391137728268e-05, "loss": 0.0213, "step": 45300 }, { "epoch": 3.1662361237170984, "grad_norm": 0.012820547446608543, "learning_rate": 2.0391603183715268e-05, "loss": 0.0296, "step": 45350 }, { "epoch": 3.1697270124973818, "grad_norm": 0.02494175173342228, "learning_rate": 2.0352815229702264e-05, "loss": 0.0114, "step": 45400 }, { "epoch": 3.173217901277665, "grad_norm": 0.0929480642080307, "learning_rate": 2.031402727568926e-05, "loss": 0.0297, "step": 45450 }, { "epoch": 3.176708790057949, "grad_norm": 0.06418604403734207, "learning_rate": 2.027523932167626e-05, "loss": 0.0181, "step": 45500 }, { "epoch": 3.1801996788382323, "grad_norm": 0.13698844611644745, "learning_rate": 2.0236451367663258e-05, "loss": 0.0223, "step": 45550 }, { "epoch": 3.1836905676185157, "grad_norm": 0.06596588343381882, "learning_rate": 2.0197663413650258e-05, "loss": 0.0213, "step": 45600 }, { "epoch": 3.187181456398799, "grad_norm": 1.1904255151748657, "learning_rate": 2.0158875459637254e-05, "loss": 0.0287, "step": 45650 }, { "epoch": 3.190672345179083, "grad_norm": 0.04259861633181572, "learning_rate": 2.0120087505624255e-05, "loss": 0.0409, "step": 45700 }, { "epoch": 3.194163233959366, "grad_norm": 0.02005760930478573, "learning_rate": 2.008129955161125e-05, "loss": 0.0159, "step": 45750 }, { "epoch": 3.1976541227396496, "grad_norm": 0.023527618497610092, "learning_rate": 2.004251159759825e-05, "loss": 0.011, "step": 45800 }, { "epoch": 3.201145011519933, "grad_norm": 0.054994259029626846, "learning_rate": 2.0003723643585248e-05, "loss": 0.0174, "step": 45850 }, { "epoch": 3.2046359003002163, "grad_norm": 3.069230556488037, "learning_rate": 1.9964935689572248e-05, "loss": 0.0333, "step": 45900 }, { "epoch": 3.2081267890805, "grad_norm": 0.053326524794101715, "learning_rate": 1.9926147735559245e-05, "loss": 0.0156, "step": 45950 }, { "epoch": 3.2116176778607834, "grad_norm": 0.0816662609577179, "learning_rate": 1.9887359781546245e-05, "loss": 0.0186, "step": 46000 }, { "epoch": 3.2116176778607834, "eval_f1": 0.9357903829542182, "eval_loss": 0.19323907792568207, "eval_runtime": 26.5504, "eval_samples_per_second": 137.851, "eval_steps_per_second": 4.331, "step": 46000 }, { "epoch": 3.215108566641067, "grad_norm": 4.011587619781494, "learning_rate": 1.984857182753324e-05, "loss": 0.0168, "step": 46050 }, { "epoch": 3.21859945542135, "grad_norm": 0.3233940899372101, "learning_rate": 1.980978387352024e-05, "loss": 0.0223, "step": 46100 }, { "epoch": 3.2220903442016335, "grad_norm": 0.6042135953903198, "learning_rate": 1.9770995919507238e-05, "loss": 0.0151, "step": 46150 }, { "epoch": 3.2255812329819173, "grad_norm": 0.35782158374786377, "learning_rate": 1.973220796549424e-05, "loss": 0.0112, "step": 46200 }, { "epoch": 3.2290721217622007, "grad_norm": 0.007998952642083168, "learning_rate": 1.9693420011481235e-05, "loss": 0.0123, "step": 46250 }, { "epoch": 3.232563010542484, "grad_norm": 0.012579442001879215, "learning_rate": 1.9654632057468235e-05, "loss": 0.0185, "step": 46300 }, { "epoch": 3.2360538993227674, "grad_norm": 0.025501690804958344, "learning_rate": 1.9615844103455232e-05, "loss": 0.0146, "step": 46350 }, { "epoch": 3.2395447881030512, "grad_norm": 0.026744967326521873, "learning_rate": 1.957705614944223e-05, "loss": 0.029, "step": 46400 }, { "epoch": 3.2430356768833346, "grad_norm": 0.45471131801605225, "learning_rate": 1.953826819542923e-05, "loss": 0.0241, "step": 46450 }, { "epoch": 3.246526565663618, "grad_norm": 0.0749790370464325, "learning_rate": 1.9499480241416225e-05, "loss": 0.0145, "step": 46500 }, { "epoch": 3.2500174544439013, "grad_norm": 0.06732578575611115, "learning_rate": 1.9460692287403225e-05, "loss": 0.0172, "step": 46550 }, { "epoch": 3.2535083432241847, "grad_norm": 0.022160114720463753, "learning_rate": 1.9421904333390222e-05, "loss": 0.0233, "step": 46600 }, { "epoch": 3.2569992320044685, "grad_norm": 0.010378474369645119, "learning_rate": 1.9383116379377222e-05, "loss": 0.0159, "step": 46650 }, { "epoch": 3.260490120784752, "grad_norm": 0.018741408362984657, "learning_rate": 1.934432842536422e-05, "loss": 0.0185, "step": 46700 }, { "epoch": 3.2639810095650352, "grad_norm": 7.819186687469482, "learning_rate": 1.930554047135122e-05, "loss": 0.0191, "step": 46750 }, { "epoch": 3.2674718983453186, "grad_norm": 0.05615635961294174, "learning_rate": 1.9266752517338216e-05, "loss": 0.0144, "step": 46800 }, { "epoch": 3.270962787125602, "grad_norm": 0.03543762490153313, "learning_rate": 1.9227964563325216e-05, "loss": 0.0117, "step": 46850 }, { "epoch": 3.2744536759058858, "grad_norm": 0.02200508303940296, "learning_rate": 1.9189176609312212e-05, "loss": 0.0123, "step": 46900 }, { "epoch": 3.277944564686169, "grad_norm": 0.06120089441537857, "learning_rate": 1.9150388655299212e-05, "loss": 0.0185, "step": 46950 }, { "epoch": 3.2814354534664525, "grad_norm": 0.012565530836582184, "learning_rate": 1.911160070128621e-05, "loss": 0.007, "step": 47000 }, { "epoch": 3.2814354534664525, "eval_f1": 0.9401757867876382, "eval_loss": 0.1720879226922989, "eval_runtime": 40.054, "eval_samples_per_second": 91.377, "eval_steps_per_second": 2.871, "step": 47000 }, { "epoch": 3.284926342246736, "grad_norm": 0.01494626235216856, "learning_rate": 1.907281274727321e-05, "loss": 0.0246, "step": 47050 }, { "epoch": 3.2884172310270197, "grad_norm": 0.023222725838422775, "learning_rate": 1.9034024793260206e-05, "loss": 0.0107, "step": 47100 }, { "epoch": 3.291908119807303, "grad_norm": 0.13517247140407562, "learning_rate": 1.8995236839247206e-05, "loss": 0.0241, "step": 47150 }, { "epoch": 3.2953990085875864, "grad_norm": 3.621380567550659, "learning_rate": 1.895722464431446e-05, "loss": 0.0354, "step": 47200 }, { "epoch": 3.2988898973678698, "grad_norm": 9.058627128601074, "learning_rate": 1.891843669030146e-05, "loss": 0.0428, "step": 47250 }, { "epoch": 3.302380786148153, "grad_norm": 0.03342050686478615, "learning_rate": 1.887964873628846e-05, "loss": 0.0189, "step": 47300 }, { "epoch": 3.305871674928437, "grad_norm": 0.00948148313909769, "learning_rate": 1.8840860782275457e-05, "loss": 0.0102, "step": 47350 }, { "epoch": 3.3093625637087203, "grad_norm": 0.10229407250881195, "learning_rate": 1.8802072828262457e-05, "loss": 0.0138, "step": 47400 }, { "epoch": 3.3128534524890036, "grad_norm": 5.57817268371582, "learning_rate": 1.8763284874249454e-05, "loss": 0.0327, "step": 47450 }, { "epoch": 3.316344341269287, "grad_norm": 0.01957140862941742, "learning_rate": 1.8724496920236454e-05, "loss": 0.0172, "step": 47500 }, { "epoch": 3.319835230049571, "grad_norm": 6.440301418304443, "learning_rate": 1.868570896622345e-05, "loss": 0.0186, "step": 47550 }, { "epoch": 3.323326118829854, "grad_norm": 0.012590606696903706, "learning_rate": 1.864692101221045e-05, "loss": 0.0159, "step": 47600 }, { "epoch": 3.3268170076101375, "grad_norm": 6.431277751922607, "learning_rate": 1.8608133058197448e-05, "loss": 0.0154, "step": 47650 }, { "epoch": 3.330307896390421, "grad_norm": 0.14014244079589844, "learning_rate": 1.8569345104184448e-05, "loss": 0.0149, "step": 47700 }, { "epoch": 3.3337987851707043, "grad_norm": 0.02405048906803131, "learning_rate": 1.8530557150171444e-05, "loss": 0.0219, "step": 47750 }, { "epoch": 3.337289673950988, "grad_norm": 4.332936763763428, "learning_rate": 1.849176919615844e-05, "loss": 0.026, "step": 47800 }, { "epoch": 3.3407805627312714, "grad_norm": 3.507913827896118, "learning_rate": 1.845298124214544e-05, "loss": 0.022, "step": 47850 }, { "epoch": 3.344271451511555, "grad_norm": 0.006527060177177191, "learning_rate": 1.8414193288132438e-05, "loss": 0.0186, "step": 47900 }, { "epoch": 3.347762340291838, "grad_norm": 0.1505638211965561, "learning_rate": 1.8375405334119435e-05, "loss": 0.0127, "step": 47950 }, { "epoch": 3.351253229072122, "grad_norm": 2.856977701187134, "learning_rate": 1.8336617380106435e-05, "loss": 0.0207, "step": 48000 }, { "epoch": 3.351253229072122, "eval_f1": 0.9406025824964132, "eval_loss": 0.18208463490009308, "eval_runtime": 20.8293, "eval_samples_per_second": 175.714, "eval_steps_per_second": 5.521, "step": 48000 }, { "epoch": 3.3547441178524053, "grad_norm": 0.006940176710486412, "learning_rate": 1.829782942609343e-05, "loss": 0.0085, "step": 48050 }, { "epoch": 3.3582350066326887, "grad_norm": 0.6002283096313477, "learning_rate": 1.825904147208043e-05, "loss": 0.0147, "step": 48100 }, { "epoch": 3.361725895412972, "grad_norm": 0.01274219062179327, "learning_rate": 1.8220253518067428e-05, "loss": 0.0151, "step": 48150 }, { "epoch": 3.3652167841932554, "grad_norm": 0.007482277695089579, "learning_rate": 1.8181465564054428e-05, "loss": 0.0181, "step": 48200 }, { "epoch": 3.3687076729735392, "grad_norm": 0.03832433000206947, "learning_rate": 1.8142677610041425e-05, "loss": 0.0308, "step": 48250 }, { "epoch": 3.3721985617538226, "grad_norm": 0.04535474628210068, "learning_rate": 1.8103889656028425e-05, "loss": 0.023, "step": 48300 }, { "epoch": 3.375689450534106, "grad_norm": 0.016268501058220863, "learning_rate": 1.806510170201542e-05, "loss": 0.0218, "step": 48350 }, { "epoch": 3.3791803393143893, "grad_norm": 5.86330509185791, "learning_rate": 1.802631374800242e-05, "loss": 0.029, "step": 48400 }, { "epoch": 3.382671228094673, "grad_norm": 0.035588301718235016, "learning_rate": 1.798752579398942e-05, "loss": 0.0349, "step": 48450 }, { "epoch": 3.3861621168749565, "grad_norm": 0.04482724890112877, "learning_rate": 1.794873783997642e-05, "loss": 0.0198, "step": 48500 }, { "epoch": 3.38965300565524, "grad_norm": 1.3957189321517944, "learning_rate": 1.790994988596342e-05, "loss": 0.0153, "step": 48550 }, { "epoch": 3.3931438944355232, "grad_norm": 0.030972935259342194, "learning_rate": 1.7871161931950415e-05, "loss": 0.014, "step": 48600 }, { "epoch": 3.3966347832158066, "grad_norm": 0.024676023051142693, "learning_rate": 1.7832373977937415e-05, "loss": 0.0158, "step": 48650 }, { "epoch": 3.4001256719960904, "grad_norm": 0.020046940073370934, "learning_rate": 1.7793586023924412e-05, "loss": 0.0226, "step": 48700 }, { "epoch": 3.4036165607763738, "grad_norm": 0.11487656086683273, "learning_rate": 1.775479806991141e-05, "loss": 0.019, "step": 48750 }, { "epoch": 3.407107449556657, "grad_norm": 0.07113959640264511, "learning_rate": 1.771601011589841e-05, "loss": 0.0116, "step": 48800 }, { "epoch": 3.4105983383369405, "grad_norm": 0.020373065024614334, "learning_rate": 1.7677222161885405e-05, "loss": 0.0234, "step": 48850 }, { "epoch": 3.4140892271172243, "grad_norm": 0.2534826099872589, "learning_rate": 1.7638434207872402e-05, "loss": 0.0127, "step": 48900 }, { "epoch": 3.4175801158975077, "grad_norm": 0.00617890153080225, "learning_rate": 1.7599646253859402e-05, "loss": 0.0093, "step": 48950 }, { "epoch": 3.421071004677791, "grad_norm": 0.12724170088768005, "learning_rate": 1.75608582998464e-05, "loss": 0.0136, "step": 49000 }, { "epoch": 3.421071004677791, "eval_f1": 0.9272142648348436, "eval_loss": 0.24522946774959564, "eval_runtime": 19.1642, "eval_samples_per_second": 190.981, "eval_steps_per_second": 6.001, "step": 49000 }, { "epoch": 3.4245618934580744, "grad_norm": 0.012064763344824314, "learning_rate": 1.75220703458334e-05, "loss": 0.0289, "step": 49050 }, { "epoch": 3.4280527822383577, "grad_norm": 0.21019980311393738, "learning_rate": 1.7483282391820396e-05, "loss": 0.0275, "step": 49100 }, { "epoch": 3.431543671018641, "grad_norm": 0.009219990111887455, "learning_rate": 1.7444494437807396e-05, "loss": 0.0265, "step": 49150 }, { "epoch": 3.435034559798925, "grad_norm": 0.44924938678741455, "learning_rate": 1.7405706483794392e-05, "loss": 0.0222, "step": 49200 }, { "epoch": 3.4385254485792083, "grad_norm": 0.12933841347694397, "learning_rate": 1.7366918529781392e-05, "loss": 0.0102, "step": 49250 }, { "epoch": 3.4420163373594916, "grad_norm": 0.07747853547334671, "learning_rate": 1.732813057576839e-05, "loss": 0.0122, "step": 49300 }, { "epoch": 3.445507226139775, "grad_norm": 2.5203640460968018, "learning_rate": 1.728934262175539e-05, "loss": 0.0183, "step": 49350 }, { "epoch": 3.448998114920059, "grad_norm": 0.3809491693973541, "learning_rate": 1.7250554667742386e-05, "loss": 0.0099, "step": 49400 }, { "epoch": 3.452489003700342, "grad_norm": 1.1953871250152588, "learning_rate": 1.7211766713729386e-05, "loss": 0.0173, "step": 49450 }, { "epoch": 3.4559798924806255, "grad_norm": 0.01619006134569645, "learning_rate": 1.7172978759716383e-05, "loss": 0.0372, "step": 49500 }, { "epoch": 3.459470781260909, "grad_norm": 0.032744113355875015, "learning_rate": 1.7134190805703383e-05, "loss": 0.0192, "step": 49550 }, { "epoch": 3.4629616700411923, "grad_norm": 0.015268692746758461, "learning_rate": 1.709540285169038e-05, "loss": 0.0201, "step": 49600 }, { "epoch": 3.466452558821476, "grad_norm": 0.4338730573654175, "learning_rate": 1.7057390656757637e-05, "loss": 0.0157, "step": 49650 }, { "epoch": 3.4699434476017594, "grad_norm": 0.1488252431154251, "learning_rate": 1.7018602702744638e-05, "loss": 0.0326, "step": 49700 }, { "epoch": 3.473434336382043, "grad_norm": 0.35929903388023376, "learning_rate": 1.6979814748731634e-05, "loss": 0.0084, "step": 49750 }, { "epoch": 3.476925225162326, "grad_norm": 0.09212347865104675, "learning_rate": 1.6941026794718634e-05, "loss": 0.0172, "step": 49800 }, { "epoch": 3.48041611394261, "grad_norm": 6.833462238311768, "learning_rate": 1.690223884070563e-05, "loss": 0.011, "step": 49850 }, { "epoch": 3.4839070027228933, "grad_norm": 0.06177880987524986, "learning_rate": 1.686345088669263e-05, "loss": 0.022, "step": 49900 }, { "epoch": 3.4873978915031767, "grad_norm": 0.015737321227788925, "learning_rate": 1.6824662932679628e-05, "loss": 0.0159, "step": 49950 }, { "epoch": 3.49088878028346, "grad_norm": 1.1112961769104004, "learning_rate": 1.6785874978666628e-05, "loss": 0.0113, "step": 50000 }, { "epoch": 3.49088878028346, "eval_f1": 0.9358381502890173, "eval_loss": 0.1911371946334839, "eval_runtime": 20.4886, "eval_samples_per_second": 178.636, "eval_steps_per_second": 5.613, "step": 50000 }, { "epoch": 3.4943796690637434, "grad_norm": 0.009405792690813541, "learning_rate": 1.6747087024653624e-05, "loss": 0.0201, "step": 50050 }, { "epoch": 3.4978705578440272, "grad_norm": 0.03674100339412689, "learning_rate": 1.6708299070640625e-05, "loss": 0.0269, "step": 50100 }, { "epoch": 3.5013614466243106, "grad_norm": 2.9473278522491455, "learning_rate": 1.666951111662762e-05, "loss": 0.0197, "step": 50150 }, { "epoch": 3.504852335404594, "grad_norm": 0.025787200778722763, "learning_rate": 1.663072316261462e-05, "loss": 0.0177, "step": 50200 }, { "epoch": 3.5083432241848773, "grad_norm": 0.025555545464158058, "learning_rate": 1.6591935208601618e-05, "loss": 0.0172, "step": 50250 }, { "epoch": 3.511834112965161, "grad_norm": 0.033668939024209976, "learning_rate": 1.6553147254588615e-05, "loss": 0.0139, "step": 50300 }, { "epoch": 3.5153250017454445, "grad_norm": 0.03965609520673752, "learning_rate": 1.6514359300575615e-05, "loss": 0.0163, "step": 50350 }, { "epoch": 3.518815890525728, "grad_norm": 0.02201533503830433, "learning_rate": 1.647557134656261e-05, "loss": 0.0183, "step": 50400 }, { "epoch": 3.522306779306011, "grad_norm": 0.02458702027797699, "learning_rate": 1.6436783392549608e-05, "loss": 0.0033, "step": 50450 }, { "epoch": 3.5257976680862946, "grad_norm": 0.03175359591841698, "learning_rate": 1.6397995438536608e-05, "loss": 0.023, "step": 50500 }, { "epoch": 3.5292885568665784, "grad_norm": 0.12282973527908325, "learning_rate": 1.6359207484523605e-05, "loss": 0.0231, "step": 50550 }, { "epoch": 3.5327794456468617, "grad_norm": 0.09555836021900177, "learning_rate": 1.6320419530510605e-05, "loss": 0.0225, "step": 50600 }, { "epoch": 3.536270334427145, "grad_norm": 0.03125094249844551, "learning_rate": 1.62816315764976e-05, "loss": 0.0057, "step": 50650 }, { "epoch": 3.5397612232074285, "grad_norm": 0.42508402466773987, "learning_rate": 1.6242843622484602e-05, "loss": 0.0121, "step": 50700 }, { "epoch": 3.5432521119877123, "grad_norm": 0.004737542010843754, "learning_rate": 1.6204055668471602e-05, "loss": 0.0266, "step": 50750 }, { "epoch": 3.5467430007679956, "grad_norm": 0.009997726418077946, "learning_rate": 1.61652677144586e-05, "loss": 0.0094, "step": 50800 }, { "epoch": 3.550233889548279, "grad_norm": 0.10814603418111801, "learning_rate": 1.61264797604456e-05, "loss": 0.009, "step": 50850 }, { "epoch": 3.5537247783285624, "grad_norm": 0.015030721202492714, "learning_rate": 1.6087691806432595e-05, "loss": 0.0059, "step": 50900 }, { "epoch": 3.5572156671088457, "grad_norm": 3.3664567470550537, "learning_rate": 1.6048903852419595e-05, "loss": 0.0265, "step": 50950 }, { "epoch": 3.560706555889129, "grad_norm": 0.037877023220062256, "learning_rate": 1.6010115898406592e-05, "loss": 0.0128, "step": 51000 }, { "epoch": 3.560706555889129, "eval_f1": 0.9240691879214307, "eval_loss": 0.2226642370223999, "eval_runtime": 19.3417, "eval_samples_per_second": 189.229, "eval_steps_per_second": 5.946, "step": 51000 }, { "epoch": 3.564197444669413, "grad_norm": 0.18292999267578125, "learning_rate": 1.5971327944393592e-05, "loss": 0.028, "step": 51050 }, { "epoch": 3.5676883334496963, "grad_norm": 0.03300943225622177, "learning_rate": 1.593253999038059e-05, "loss": 0.0178, "step": 51100 }, { "epoch": 3.5711792222299796, "grad_norm": 0.26565834879875183, "learning_rate": 1.589375203636759e-05, "loss": 0.0156, "step": 51150 }, { "epoch": 3.5746701110102634, "grad_norm": 0.006750601809471846, "learning_rate": 1.5854964082354586e-05, "loss": 0.0112, "step": 51200 }, { "epoch": 3.578160999790547, "grad_norm": 0.09060013294219971, "learning_rate": 1.5816176128341582e-05, "loss": 0.0323, "step": 51250 }, { "epoch": 3.58165188857083, "grad_norm": 2.0851974487304688, "learning_rate": 1.5777388174328582e-05, "loss": 0.0206, "step": 51300 }, { "epoch": 3.5851427773511135, "grad_norm": 0.012320557609200478, "learning_rate": 1.573860022031558e-05, "loss": 0.0157, "step": 51350 }, { "epoch": 3.588633666131397, "grad_norm": 19.57718276977539, "learning_rate": 1.5699812266302576e-05, "loss": 0.016, "step": 51400 }, { "epoch": 3.5921245549116803, "grad_norm": 0.014298966154456139, "learning_rate": 1.5661024312289576e-05, "loss": 0.0107, "step": 51450 }, { "epoch": 3.595615443691964, "grad_norm": 5.5826849937438965, "learning_rate": 1.5622236358276572e-05, "loss": 0.0175, "step": 51500 }, { "epoch": 3.5991063324722474, "grad_norm": 0.011979132890701294, "learning_rate": 1.5583448404263573e-05, "loss": 0.0246, "step": 51550 }, { "epoch": 3.602597221252531, "grad_norm": 0.021864114329218864, "learning_rate": 1.554466045025057e-05, "loss": 0.0199, "step": 51600 }, { "epoch": 3.6060881100328146, "grad_norm": 0.02277781441807747, "learning_rate": 1.550587249623757e-05, "loss": 0.0091, "step": 51650 }, { "epoch": 3.609578998813098, "grad_norm": 0.013841581530869007, "learning_rate": 1.5467084542224566e-05, "loss": 0.0059, "step": 51700 }, { "epoch": 3.6130698875933813, "grad_norm": 0.012927171774208546, "learning_rate": 1.5429072347291827e-05, "loss": 0.0201, "step": 51750 }, { "epoch": 3.6165607763736647, "grad_norm": 7.824684143066406, "learning_rate": 1.5390284393278824e-05, "loss": 0.0176, "step": 51800 }, { "epoch": 3.620051665153948, "grad_norm": 0.16372781991958618, "learning_rate": 1.535149643926582e-05, "loss": 0.0313, "step": 51850 }, { "epoch": 3.6235425539342314, "grad_norm": 0.11191274225711823, "learning_rate": 1.531270848525282e-05, "loss": 0.0192, "step": 51900 }, { "epoch": 3.627033442714515, "grad_norm": 0.09052783995866776, "learning_rate": 1.5273920531239818e-05, "loss": 0.015, "step": 51950 }, { "epoch": 3.6305243314947986, "grad_norm": 3.989870071411133, "learning_rate": 1.5235132577226818e-05, "loss": 0.0173, "step": 52000 }, { "epoch": 3.6305243314947986, "eval_f1": 0.9422750424448217, "eval_loss": 0.1699497252702713, "eval_runtime": 18.403, "eval_samples_per_second": 198.881, "eval_steps_per_second": 6.249, "step": 52000 }, { "epoch": 3.634015220275082, "grad_norm": 0.004953749943524599, "learning_rate": 1.5196344623213816e-05, "loss": 0.0073, "step": 52050 }, { "epoch": 3.6375061090553658, "grad_norm": 6.198831558227539, "learning_rate": 1.5157556669200814e-05, "loss": 0.026, "step": 52100 }, { "epoch": 3.640996997835649, "grad_norm": 0.018628481775522232, "learning_rate": 1.5118768715187811e-05, "loss": 0.0173, "step": 52150 }, { "epoch": 3.6444878866159325, "grad_norm": 0.025905922055244446, "learning_rate": 1.5079980761174811e-05, "loss": 0.0094, "step": 52200 }, { "epoch": 3.647978775396216, "grad_norm": 0.00478333467617631, "learning_rate": 1.5041192807161808e-05, "loss": 0.0182, "step": 52250 }, { "epoch": 3.651469664176499, "grad_norm": 3.3075673580169678, "learning_rate": 1.5002404853148808e-05, "loss": 0.0198, "step": 52300 }, { "epoch": 3.6549605529567826, "grad_norm": 0.05998103320598602, "learning_rate": 1.4963616899135805e-05, "loss": 0.0096, "step": 52350 }, { "epoch": 3.6584514417370664, "grad_norm": 4.651539325714111, "learning_rate": 1.4924828945122805e-05, "loss": 0.0248, "step": 52400 }, { "epoch": 3.6619423305173497, "grad_norm": 0.6373411417007446, "learning_rate": 1.4886040991109801e-05, "loss": 0.0338, "step": 52450 }, { "epoch": 3.665433219297633, "grad_norm": 1.2010835409164429, "learning_rate": 1.48472530370968e-05, "loss": 0.0103, "step": 52500 }, { "epoch": 3.668924108077917, "grad_norm": 0.018328318372368813, "learning_rate": 1.4808465083083798e-05, "loss": 0.0085, "step": 52550 }, { "epoch": 3.6724149968582003, "grad_norm": 0.03623636066913605, "learning_rate": 1.4769677129070796e-05, "loss": 0.0119, "step": 52600 }, { "epoch": 3.6759058856384836, "grad_norm": 0.05136825889348984, "learning_rate": 1.4730889175057793e-05, "loss": 0.0118, "step": 52650 }, { "epoch": 3.679396774418767, "grad_norm": 0.025166302919387817, "learning_rate": 1.4692101221044793e-05, "loss": 0.0148, "step": 52700 }, { "epoch": 3.6828876631990504, "grad_norm": 0.2804352045059204, "learning_rate": 1.465331326703179e-05, "loss": 0.0291, "step": 52750 }, { "epoch": 3.6863785519793337, "grad_norm": 0.01500253938138485, "learning_rate": 1.461452531301879e-05, "loss": 0.0157, "step": 52800 }, { "epoch": 3.6898694407596175, "grad_norm": 0.012079290114343166, "learning_rate": 1.4575737359005787e-05, "loss": 0.0143, "step": 52850 }, { "epoch": 3.693360329539901, "grad_norm": 0.09183007478713989, "learning_rate": 1.4536949404992787e-05, "loss": 0.0203, "step": 52900 }, { "epoch": 3.6968512183201843, "grad_norm": 0.020523011684417725, "learning_rate": 1.4498161450979783e-05, "loss": 0.0208, "step": 52950 }, { "epoch": 3.7003421071004676, "grad_norm": 2.2191498279571533, "learning_rate": 1.4459373496966783e-05, "loss": 0.0115, "step": 53000 }, { "epoch": 3.7003421071004676, "eval_f1": 0.9403368541250356, "eval_loss": 0.17281506955623627, "eval_runtime": 18.7663, "eval_samples_per_second": 195.031, "eval_steps_per_second": 6.128, "step": 53000 }, { "epoch": 3.7038329958807514, "grad_norm": 0.11821743845939636, "learning_rate": 1.4420585542953782e-05, "loss": 0.0144, "step": 53050 }, { "epoch": 3.707323884661035, "grad_norm": 0.015602991916239262, "learning_rate": 1.4381797588940779e-05, "loss": 0.0281, "step": 53100 }, { "epoch": 3.710814773441318, "grad_norm": 0.405454158782959, "learning_rate": 1.4343009634927779e-05, "loss": 0.0161, "step": 53150 }, { "epoch": 3.7143056622216015, "grad_norm": 0.7658965587615967, "learning_rate": 1.4304221680914775e-05, "loss": 0.0292, "step": 53200 }, { "epoch": 3.717796551001885, "grad_norm": 3.5675363540649414, "learning_rate": 1.4265433726901775e-05, "loss": 0.0109, "step": 53250 }, { "epoch": 3.7212874397821687, "grad_norm": 0.032014861702919006, "learning_rate": 1.4226645772888772e-05, "loss": 0.0223, "step": 53300 }, { "epoch": 3.724778328562452, "grad_norm": 0.12666158378124237, "learning_rate": 1.4187857818875772e-05, "loss": 0.0254, "step": 53350 }, { "epoch": 3.7282692173427354, "grad_norm": 0.029635407030582428, "learning_rate": 1.4149069864862769e-05, "loss": 0.0045, "step": 53400 }, { "epoch": 3.731760106123019, "grad_norm": 2.7315502166748047, "learning_rate": 1.4110281910849767e-05, "loss": 0.0421, "step": 53450 }, { "epoch": 3.7352509949033026, "grad_norm": 0.5258963704109192, "learning_rate": 1.4071493956836766e-05, "loss": 0.0206, "step": 53500 }, { "epoch": 3.738741883683586, "grad_norm": 0.05045127496123314, "learning_rate": 1.4032706002823764e-05, "loss": 0.0122, "step": 53550 }, { "epoch": 3.7422327724638693, "grad_norm": 0.008424119092524052, "learning_rate": 1.399391804881076e-05, "loss": 0.008, "step": 53600 }, { "epoch": 3.7457236612441527, "grad_norm": 0.026001902297139168, "learning_rate": 1.395513009479776e-05, "loss": 0.0204, "step": 53650 }, { "epoch": 3.749214550024436, "grad_norm": 0.009944004938006401, "learning_rate": 1.3916342140784757e-05, "loss": 0.0109, "step": 53700 }, { "epoch": 3.7527054388047194, "grad_norm": 0.48391270637512207, "learning_rate": 1.3877554186771758e-05, "loss": 0.0228, "step": 53750 }, { "epoch": 3.756196327585003, "grad_norm": 6.483001708984375, "learning_rate": 1.3838766232758754e-05, "loss": 0.0245, "step": 53800 }, { "epoch": 3.7596872163652866, "grad_norm": 0.23561638593673706, "learning_rate": 1.3799978278745754e-05, "loss": 0.0072, "step": 53850 }, { "epoch": 3.76317810514557, "grad_norm": 2.020230770111084, "learning_rate": 1.3761190324732751e-05, "loss": 0.0209, "step": 53900 }, { "epoch": 3.7666689939258537, "grad_norm": 2.9172303676605225, "learning_rate": 1.3722402370719751e-05, "loss": 0.0229, "step": 53950 }, { "epoch": 3.770159882706137, "grad_norm": 0.08300653100013733, "learning_rate": 1.3683614416706748e-05, "loss": 0.0226, "step": 54000 }, { "epoch": 3.770159882706137, "eval_f1": 0.9282156460591855, "eval_loss": 0.2161129117012024, "eval_runtime": 27.4237, "eval_samples_per_second": 133.461, "eval_steps_per_second": 4.193, "step": 54000 }, { "epoch": 3.7736507714864205, "grad_norm": 0.051579758524894714, "learning_rate": 1.3644826462693746e-05, "loss": 0.0128, "step": 54050 }, { "epoch": 3.777141660266704, "grad_norm": 0.029366178438067436, "learning_rate": 1.3606038508680743e-05, "loss": 0.0116, "step": 54100 }, { "epoch": 3.780632549046987, "grad_norm": 0.03355459123849869, "learning_rate": 1.3567250554667743e-05, "loss": 0.0171, "step": 54150 }, { "epoch": 3.7841234378272706, "grad_norm": 13.578819274902344, "learning_rate": 1.3528462600654743e-05, "loss": 0.0133, "step": 54200 }, { "epoch": 3.7876143266075544, "grad_norm": 0.4114447236061096, "learning_rate": 1.348967464664174e-05, "loss": 0.0133, "step": 54250 }, { "epoch": 3.7911052153878377, "grad_norm": 2.2274863719940186, "learning_rate": 1.345088669262874e-05, "loss": 0.0135, "step": 54300 }, { "epoch": 3.794596104168121, "grad_norm": 0.016799284145236015, "learning_rate": 1.3412098738615736e-05, "loss": 0.0103, "step": 54350 }, { "epoch": 3.798086992948405, "grad_norm": 0.021271638572216034, "learning_rate": 1.3373310784602735e-05, "loss": 0.0144, "step": 54400 }, { "epoch": 3.8015778817286883, "grad_norm": 0.40475618839263916, "learning_rate": 1.3334522830589733e-05, "loss": 0.0157, "step": 54450 }, { "epoch": 3.8050687705089716, "grad_norm": 0.014505240134894848, "learning_rate": 1.3295734876576732e-05, "loss": 0.0214, "step": 54500 }, { "epoch": 3.808559659289255, "grad_norm": 1.2870405912399292, "learning_rate": 1.3256946922563728e-05, "loss": 0.0199, "step": 54550 }, { "epoch": 3.8120505480695384, "grad_norm": 0.11923400312662125, "learning_rate": 1.3218158968550728e-05, "loss": 0.019, "step": 54600 }, { "epoch": 3.8155414368498217, "grad_norm": 6.796934127807617, "learning_rate": 1.3180146773617985e-05, "loss": 0.0189, "step": 54650 }, { "epoch": 3.8190323256301055, "grad_norm": 0.2147059142589569, "learning_rate": 1.3141358819604985e-05, "loss": 0.0148, "step": 54700 }, { "epoch": 3.822523214410389, "grad_norm": 0.009229235351085663, "learning_rate": 1.3102570865591981e-05, "loss": 0.0125, "step": 54750 }, { "epoch": 3.8260141031906723, "grad_norm": 0.012158773839473724, "learning_rate": 1.3063782911578981e-05, "loss": 0.0181, "step": 54800 }, { "epoch": 3.829504991970956, "grad_norm": 0.9341710209846497, "learning_rate": 1.3024994957565978e-05, "loss": 0.0118, "step": 54850 }, { "epoch": 3.8329958807512394, "grad_norm": 0.1444963812828064, "learning_rate": 1.2986207003552978e-05, "loss": 0.0219, "step": 54900 }, { "epoch": 3.836486769531523, "grad_norm": 0.2622317969799042, "learning_rate": 1.2947419049539975e-05, "loss": 0.0239, "step": 54950 }, { "epoch": 3.839977658311806, "grad_norm": 2.9175851345062256, "learning_rate": 1.2908631095526973e-05, "loss": 0.0229, "step": 55000 }, { "epoch": 3.839977658311806, "eval_f1": 0.9377182770663562, "eval_loss": 0.18816331028938293, "eval_runtime": 26.4818, "eval_samples_per_second": 138.208, "eval_steps_per_second": 4.343, "step": 55000 }, { "epoch": 3.8434685470920895, "grad_norm": 0.01847333274781704, "learning_rate": 1.2869843141513972e-05, "loss": 0.0123, "step": 55050 }, { "epoch": 3.846959435872373, "grad_norm": 0.14379319548606873, "learning_rate": 1.283105518750097e-05, "loss": 0.0135, "step": 55100 }, { "epoch": 3.8504503246526567, "grad_norm": 0.017605192959308624, "learning_rate": 1.2792267233487967e-05, "loss": 0.0051, "step": 55150 }, { "epoch": 3.85394121343294, "grad_norm": 0.00873847771435976, "learning_rate": 1.2753479279474967e-05, "loss": 0.0123, "step": 55200 }, { "epoch": 3.8574321022132234, "grad_norm": 0.006974588148295879, "learning_rate": 1.2714691325461967e-05, "loss": 0.0054, "step": 55250 }, { "epoch": 3.860922990993507, "grad_norm": 0.008401262573897839, "learning_rate": 1.2675903371448964e-05, "loss": 0.0115, "step": 55300 }, { "epoch": 3.8644138797737906, "grad_norm": 0.004291973542422056, "learning_rate": 1.2637115417435964e-05, "loss": 0.0233, "step": 55350 }, { "epoch": 3.867904768554074, "grad_norm": 0.039114680141210556, "learning_rate": 1.259832746342296e-05, "loss": 0.0301, "step": 55400 }, { "epoch": 3.8713956573343573, "grad_norm": 0.02999526634812355, "learning_rate": 1.2559539509409959e-05, "loss": 0.021, "step": 55450 }, { "epoch": 3.8748865461146407, "grad_norm": 0.14493048191070557, "learning_rate": 1.2520751555396955e-05, "loss": 0.0155, "step": 55500 }, { "epoch": 3.878377434894924, "grad_norm": 0.028276991099119186, "learning_rate": 1.2481963601383954e-05, "loss": 0.0093, "step": 55550 }, { "epoch": 3.881868323675208, "grad_norm": 0.984074056148529, "learning_rate": 1.2443175647370952e-05, "loss": 0.0121, "step": 55600 }, { "epoch": 3.885359212455491, "grad_norm": 0.006328861694782972, "learning_rate": 1.240438769335795e-05, "loss": 0.0144, "step": 55650 }, { "epoch": 3.8888501012357746, "grad_norm": 0.013225522823631763, "learning_rate": 1.2365599739344949e-05, "loss": 0.0112, "step": 55700 }, { "epoch": 3.892340990016058, "grad_norm": 0.16436105966567993, "learning_rate": 1.2326811785331947e-05, "loss": 0.0507, "step": 55750 }, { "epoch": 3.8958318787963417, "grad_norm": 0.026833226904273033, "learning_rate": 1.2288023831318946e-05, "loss": 0.0139, "step": 55800 }, { "epoch": 3.899322767576625, "grad_norm": 0.4603937268257141, "learning_rate": 1.2249235877305946e-05, "loss": 0.0146, "step": 55850 }, { "epoch": 3.9028136563569085, "grad_norm": 0.2762994170188904, "learning_rate": 1.2210447923292944e-05, "loss": 0.0118, "step": 55900 }, { "epoch": 3.906304545137192, "grad_norm": 0.040168821811676025, "learning_rate": 1.217165996927994e-05, "loss": 0.0176, "step": 55950 }, { "epoch": 3.909795433917475, "grad_norm": 0.010930743999779224, "learning_rate": 1.213287201526694e-05, "loss": 0.0073, "step": 56000 }, { "epoch": 3.909795433917475, "eval_f1": 0.9395429563205091, "eval_loss": 0.1781720668077469, "eval_runtime": 28.9085, "eval_samples_per_second": 126.607, "eval_steps_per_second": 3.978, "step": 56000 }, { "epoch": 3.913286322697759, "grad_norm": 0.014746199361979961, "learning_rate": 1.2094084061253938e-05, "loss": 0.0127, "step": 56050 }, { "epoch": 3.9167772114780424, "grad_norm": 0.014964771457016468, "learning_rate": 1.2055296107240936e-05, "loss": 0.0213, "step": 56100 }, { "epoch": 3.9202681002583257, "grad_norm": 1.6473639011383057, "learning_rate": 1.2016508153227934e-05, "loss": 0.0254, "step": 56150 }, { "epoch": 3.923758989038609, "grad_norm": 0.1399172842502594, "learning_rate": 1.1977720199214933e-05, "loss": 0.016, "step": 56200 }, { "epoch": 3.927249877818893, "grad_norm": 0.01769440248608589, "learning_rate": 1.1938932245201931e-05, "loss": 0.0232, "step": 56250 }, { "epoch": 3.9307407665991763, "grad_norm": 0.012674226425588131, "learning_rate": 1.190014429118893e-05, "loss": 0.0211, "step": 56300 }, { "epoch": 3.9342316553794596, "grad_norm": 6.944097518920898, "learning_rate": 1.1861356337175928e-05, "loss": 0.016, "step": 56350 }, { "epoch": 3.937722544159743, "grad_norm": 0.009510631673038006, "learning_rate": 1.1822568383162926e-05, "loss": 0.0236, "step": 56400 }, { "epoch": 3.9412134329400264, "grad_norm": 0.016936538740992546, "learning_rate": 1.1783780429149923e-05, "loss": 0.0301, "step": 56450 }, { "epoch": 3.9447043217203097, "grad_norm": 0.010942582041025162, "learning_rate": 1.1744992475136921e-05, "loss": 0.0082, "step": 56500 }, { "epoch": 3.9481952105005935, "grad_norm": 0.21423903107643127, "learning_rate": 1.170620452112392e-05, "loss": 0.0122, "step": 56550 }, { "epoch": 3.951686099280877, "grad_norm": 0.1346980482339859, "learning_rate": 1.1667416567110918e-05, "loss": 0.0156, "step": 56600 }, { "epoch": 3.9551769880611602, "grad_norm": 0.027757730334997177, "learning_rate": 1.1628628613097916e-05, "loss": 0.02, "step": 56650 }, { "epoch": 3.958667876841444, "grad_norm": 0.05131814628839493, "learning_rate": 1.1589840659084915e-05, "loss": 0.0097, "step": 56700 }, { "epoch": 3.9621587656217274, "grad_norm": 3.791733741760254, "learning_rate": 1.1551052705071913e-05, "loss": 0.0229, "step": 56750 }, { "epoch": 3.965649654402011, "grad_norm": 2.3265886306762695, "learning_rate": 1.1512264751058912e-05, "loss": 0.0137, "step": 56800 }, { "epoch": 3.969140543182294, "grad_norm": 2.308258533477783, "learning_rate": 1.147347679704591e-05, "loss": 0.0223, "step": 56850 }, { "epoch": 3.9726314319625775, "grad_norm": 0.004564803559333086, "learning_rate": 1.1434688843032908e-05, "loss": 0.0229, "step": 56900 }, { "epoch": 3.976122320742861, "grad_norm": 0.06186622008681297, "learning_rate": 1.1395900889019907e-05, "loss": 0.0118, "step": 56950 }, { "epoch": 3.9796132095231447, "grad_norm": 0.010163922794163227, "learning_rate": 1.1357888694087165e-05, "loss": 0.0116, "step": 57000 }, { "epoch": 3.9796132095231447, "eval_f1": 0.9382573571840739, "eval_loss": 0.17844714224338531, "eval_runtime": 26.9732, "eval_samples_per_second": 135.69, "eval_steps_per_second": 4.263, "step": 57000 }, { "epoch": 3.983104098303428, "grad_norm": 0.06274252384901047, "learning_rate": 1.1319100740074163e-05, "loss": 0.0119, "step": 57050 }, { "epoch": 3.9865949870837114, "grad_norm": 0.014810669235885143, "learning_rate": 1.1280312786061162e-05, "loss": 0.0028, "step": 57100 }, { "epoch": 3.990085875863995, "grad_norm": 0.0769757330417633, "learning_rate": 1.124152483204816e-05, "loss": 0.0133, "step": 57150 }, { "epoch": 3.9935767646442786, "grad_norm": 1.5741026401519775, "learning_rate": 1.1202736878035158e-05, "loss": 0.0324, "step": 57200 }, { "epoch": 3.997067653424562, "grad_norm": 1.7274805307388306, "learning_rate": 1.1163948924022157e-05, "loss": 0.0128, "step": 57250 }, { "epoch": 4.000558542204845, "grad_norm": 0.023459715768694878, "learning_rate": 1.1125160970009155e-05, "loss": 0.0336, "step": 57300 }, { "epoch": 4.004049430985129, "grad_norm": 0.21544323861598969, "learning_rate": 1.1086373015996153e-05, "loss": 0.0184, "step": 57350 }, { "epoch": 4.007540319765412, "grad_norm": 5.495270252227783, "learning_rate": 1.1047585061983152e-05, "loss": 0.0217, "step": 57400 }, { "epoch": 4.011031208545695, "grad_norm": 0.1595708429813385, "learning_rate": 1.100879710797015e-05, "loss": 0.02, "step": 57450 }, { "epoch": 4.014522097325979, "grad_norm": 0.32755452394485474, "learning_rate": 1.0970009153957147e-05, "loss": 0.0073, "step": 57500 }, { "epoch": 4.018012986106263, "grad_norm": 0.03040587343275547, "learning_rate": 1.0931221199944145e-05, "loss": 0.0189, "step": 57550 }, { "epoch": 4.021503874886546, "grad_norm": 0.006620501633733511, "learning_rate": 1.0892433245931144e-05, "loss": 0.0187, "step": 57600 }, { "epoch": 4.02499476366683, "grad_norm": 0.015734750777482986, "learning_rate": 1.0853645291918142e-05, "loss": 0.0181, "step": 57650 }, { "epoch": 4.028485652447113, "grad_norm": 0.08926963061094284, "learning_rate": 1.081485733790514e-05, "loss": 0.0199, "step": 57700 }, { "epoch": 4.0319765412273965, "grad_norm": 0.19889496266841888, "learning_rate": 1.0776069383892139e-05, "loss": 0.0104, "step": 57750 }, { "epoch": 4.03546743000768, "grad_norm": 0.008089990355074406, "learning_rate": 1.0737281429879137e-05, "loss": 0.0139, "step": 57800 }, { "epoch": 4.038958318787963, "grad_norm": 0.006437818054109812, "learning_rate": 1.0698493475866136e-05, "loss": 0.0173, "step": 57850 }, { "epoch": 4.0424492075682465, "grad_norm": 0.008485854603350163, "learning_rate": 1.0659705521853134e-05, "loss": 0.0193, "step": 57900 }, { "epoch": 4.04594009634853, "grad_norm": 0.01528970431536436, "learning_rate": 1.0620917567840132e-05, "loss": 0.0204, "step": 57950 }, { "epoch": 4.049430985128814, "grad_norm": 0.018422499299049377, "learning_rate": 1.0582129613827129e-05, "loss": 0.0101, "step": 58000 }, { "epoch": 4.049430985128814, "eval_f1": 0.9438202247191011, "eval_loss": 0.17257019877433777, "eval_runtime": 19.6793, "eval_samples_per_second": 185.983, "eval_steps_per_second": 5.844, "step": 58000 }, { "epoch": 4.0529218739090975, "grad_norm": 19.834749221801758, "learning_rate": 1.0543341659814127e-05, "loss": 0.0299, "step": 58050 }, { "epoch": 4.056412762689381, "grad_norm": 0.035838037729263306, "learning_rate": 1.0504553705801127e-05, "loss": 0.0132, "step": 58100 }, { "epoch": 4.059903651469664, "grad_norm": 0.014196299947798252, "learning_rate": 1.0465765751788126e-05, "loss": 0.0191, "step": 58150 }, { "epoch": 4.063394540249948, "grad_norm": 0.10412344336509705, "learning_rate": 1.0426977797775124e-05, "loss": 0.0193, "step": 58200 }, { "epoch": 4.066885429030231, "grad_norm": 0.012938785366714, "learning_rate": 1.0388189843762123e-05, "loss": 0.0074, "step": 58250 }, { "epoch": 4.070376317810514, "grad_norm": 0.07328232377767563, "learning_rate": 1.0349401889749121e-05, "loss": 0.0224, "step": 58300 }, { "epoch": 4.073867206590798, "grad_norm": 0.268594890832901, "learning_rate": 1.031061393573612e-05, "loss": 0.0156, "step": 58350 }, { "epoch": 4.077358095371081, "grad_norm": 0.05627800524234772, "learning_rate": 1.0271825981723118e-05, "loss": 0.0172, "step": 58400 }, { "epoch": 4.080848984151365, "grad_norm": 0.01628001220524311, "learning_rate": 1.0233038027710114e-05, "loss": 0.0092, "step": 58450 }, { "epoch": 4.084339872931649, "grad_norm": 0.15032631158828735, "learning_rate": 1.0194250073697113e-05, "loss": 0.0127, "step": 58500 }, { "epoch": 4.087830761711932, "grad_norm": 0.03321278095245361, "learning_rate": 1.0155462119684111e-05, "loss": 0.0071, "step": 58550 }, { "epoch": 4.091321650492215, "grad_norm": 1.3011609315872192, "learning_rate": 1.011667416567111e-05, "loss": 0.0193, "step": 58600 }, { "epoch": 4.094812539272499, "grad_norm": 0.009231241419911385, "learning_rate": 1.0077886211658108e-05, "loss": 0.0142, "step": 58650 }, { "epoch": 4.098303428052782, "grad_norm": 0.009372848086059093, "learning_rate": 1.0039098257645106e-05, "loss": 0.0047, "step": 58700 }, { "epoch": 4.1017943168330655, "grad_norm": 0.04598918929696083, "learning_rate": 1.0000310303632105e-05, "loss": 0.021, "step": 58750 }, { "epoch": 4.105285205613349, "grad_norm": 2.4926247596740723, "learning_rate": 9.961522349619103e-06, "loss": 0.0113, "step": 58800 }, { "epoch": 4.108776094393632, "grad_norm": 0.025690525770187378, "learning_rate": 9.922734395606101e-06, "loss": 0.0069, "step": 58850 }, { "epoch": 4.1122669831739165, "grad_norm": 0.009157177992165089, "learning_rate": 9.8839464415931e-06, "loss": 0.0059, "step": 58900 }, { "epoch": 4.1157578719542, "grad_norm": 0.015483526512980461, "learning_rate": 9.845158487580097e-06, "loss": 0.0243, "step": 58950 }, { "epoch": 4.119248760734483, "grad_norm": 0.03802795335650444, "learning_rate": 9.806370533567095e-06, "loss": 0.016, "step": 59000 }, { "epoch": 4.119248760734483, "eval_f1": 0.9442208165612421, "eval_loss": 0.170488640666008, "eval_runtime": 30.507, "eval_samples_per_second": 119.972, "eval_steps_per_second": 3.77, "step": 59000 }, { "epoch": 4.122739649514767, "grad_norm": 0.01446942612528801, "learning_rate": 9.767582579554093e-06, "loss": 0.0289, "step": 59050 }, { "epoch": 4.12623053829505, "grad_norm": 0.09928736090660095, "learning_rate": 9.728794625541092e-06, "loss": 0.0372, "step": 59100 }, { "epoch": 4.129721427075333, "grad_norm": 7.637205600738525, "learning_rate": 9.69000667152809e-06, "loss": 0.0091, "step": 59150 }, { "epoch": 4.133212315855617, "grad_norm": 0.10061842203140259, "learning_rate": 9.651218717515088e-06, "loss": 0.0267, "step": 59200 }, { "epoch": 4.1367032046359, "grad_norm": 0.00964583270251751, "learning_rate": 9.612430763502087e-06, "loss": 0.0095, "step": 59250 }, { "epoch": 4.140194093416183, "grad_norm": 2.4882123470306396, "learning_rate": 9.573642809489087e-06, "loss": 0.0095, "step": 59300 }, { "epoch": 4.143684982196468, "grad_norm": 0.07580289989709854, "learning_rate": 9.534854855476085e-06, "loss": 0.0095, "step": 59350 }, { "epoch": 4.147175870976751, "grad_norm": 1.8626923561096191, "learning_rate": 9.496066901463082e-06, "loss": 0.0148, "step": 59400 }, { "epoch": 4.150666759757034, "grad_norm": 0.015414468944072723, "learning_rate": 9.45727894745008e-06, "loss": 0.0292, "step": 59450 }, { "epoch": 4.154157648537318, "grad_norm": 0.10861571878194809, "learning_rate": 9.419266752517338e-06, "loss": 0.0146, "step": 59500 }, { "epoch": 4.157648537317601, "grad_norm": 0.01463202852755785, "learning_rate": 9.380478798504337e-06, "loss": 0.0114, "step": 59550 }, { "epoch": 4.1611394260978845, "grad_norm": 0.3748888075351715, "learning_rate": 9.341690844491335e-06, "loss": 0.0326, "step": 59600 }, { "epoch": 4.164630314878168, "grad_norm": 2.7507495880126953, "learning_rate": 9.302902890478333e-06, "loss": 0.0217, "step": 59650 }, { "epoch": 4.168121203658451, "grad_norm": 5.160195350646973, "learning_rate": 9.264114936465332e-06, "loss": 0.027, "step": 59700 }, { "epoch": 4.1716120924387345, "grad_norm": 0.021508311852812767, "learning_rate": 9.22532698245233e-06, "loss": 0.0141, "step": 59750 }, { "epoch": 4.175102981219018, "grad_norm": 3.139760732650757, "learning_rate": 9.186539028439329e-06, "loss": 0.0162, "step": 59800 }, { "epoch": 4.178593869999302, "grad_norm": 0.026574354618787766, "learning_rate": 9.147751074426327e-06, "loss": 0.0068, "step": 59850 }, { "epoch": 4.1820847587795855, "grad_norm": 0.01623447611927986, "learning_rate": 9.108963120413325e-06, "loss": 0.0115, "step": 59900 }, { "epoch": 4.185575647559869, "grad_norm": 0.012585906311869621, "learning_rate": 9.070175166400322e-06, "loss": 0.0123, "step": 59950 }, { "epoch": 4.189066536340152, "grad_norm": 0.12533676624298096, "learning_rate": 9.03138721238732e-06, "loss": 0.0073, "step": 60000 }, { "epoch": 4.189066536340152, "eval_f1": 0.9414816950129721, "eval_loss": 0.18086545169353485, "eval_runtime": 25.9786, "eval_samples_per_second": 140.885, "eval_steps_per_second": 4.427, "step": 60000 }, { "epoch": 4.192557425120436, "grad_norm": 0.9304799437522888, "learning_rate": 8.992599258374319e-06, "loss": 0.0074, "step": 60050 }, { "epoch": 4.196048313900719, "grad_norm": 0.00669199600815773, "learning_rate": 8.953811304361317e-06, "loss": 0.0107, "step": 60100 }, { "epoch": 4.199539202681002, "grad_norm": 4.687979698181152, "learning_rate": 8.915023350348316e-06, "loss": 0.0206, "step": 60150 }, { "epoch": 4.203030091461286, "grad_norm": 0.18070493638515472, "learning_rate": 8.876235396335314e-06, "loss": 0.0043, "step": 60200 }, { "epoch": 4.206520980241569, "grad_norm": 0.11669691652059555, "learning_rate": 8.837447442322312e-06, "loss": 0.0136, "step": 60250 }, { "epoch": 4.210011869021853, "grad_norm": 0.027609361335635185, "learning_rate": 8.79865948830931e-06, "loss": 0.0131, "step": 60300 }, { "epoch": 4.213502757802137, "grad_norm": 0.019433321431279182, "learning_rate": 8.75987153429631e-06, "loss": 0.0278, "step": 60350 }, { "epoch": 4.21699364658242, "grad_norm": 0.34241268038749695, "learning_rate": 8.721083580283308e-06, "loss": 0.0071, "step": 60400 }, { "epoch": 4.220484535362703, "grad_norm": 0.09645307064056396, "learning_rate": 8.682295626270306e-06, "loss": 0.0148, "step": 60450 }, { "epoch": 4.223975424142987, "grad_norm": 0.4123315215110779, "learning_rate": 8.643507672257304e-06, "loss": 0.0176, "step": 60500 }, { "epoch": 4.22746631292327, "grad_norm": 0.06280361115932465, "learning_rate": 8.604719718244303e-06, "loss": 0.0122, "step": 60550 }, { "epoch": 4.2309572017035535, "grad_norm": 0.013686880469322205, "learning_rate": 8.565931764231301e-06, "loss": 0.0036, "step": 60600 }, { "epoch": 4.234448090483837, "grad_norm": 0.013660120777785778, "learning_rate": 8.5271438102183e-06, "loss": 0.0157, "step": 60650 }, { "epoch": 4.23793897926412, "grad_norm": 0.07857243716716766, "learning_rate": 8.488355856205298e-06, "loss": 0.0147, "step": 60700 }, { "epoch": 4.2414298680444045, "grad_norm": 0.15520472824573517, "learning_rate": 8.449567902192296e-06, "loss": 0.0187, "step": 60750 }, { "epoch": 4.244920756824688, "grad_norm": 0.006114164367318153, "learning_rate": 8.410779948179295e-06, "loss": 0.0184, "step": 60800 }, { "epoch": 4.248411645604971, "grad_norm": 0.10630781948566437, "learning_rate": 8.371991994166293e-06, "loss": 0.0061, "step": 60850 }, { "epoch": 4.251902534385255, "grad_norm": 0.01164723839610815, "learning_rate": 8.33320404015329e-06, "loss": 0.0184, "step": 60900 }, { "epoch": 4.255393423165538, "grad_norm": 0.025141581892967224, "learning_rate": 8.294416086140288e-06, "loss": 0.0093, "step": 60950 }, { "epoch": 4.258884311945821, "grad_norm": 0.009035998955368996, "learning_rate": 8.255628132127286e-06, "loss": 0.0147, "step": 61000 }, { "epoch": 4.258884311945821, "eval_f1": 0.9440599769319492, "eval_loss": 0.1847628951072693, "eval_runtime": 25.1303, "eval_samples_per_second": 145.641, "eval_steps_per_second": 4.576, "step": 61000 }, { "epoch": 4.262375200726105, "grad_norm": 0.1899830847978592, "learning_rate": 8.216840178114285e-06, "loss": 0.0194, "step": 61050 }, { "epoch": 4.265866089506388, "grad_norm": 0.034703437238931656, "learning_rate": 8.178052224101283e-06, "loss": 0.0101, "step": 61100 }, { "epoch": 4.269356978286671, "grad_norm": 0.1625027358531952, "learning_rate": 8.139264270088282e-06, "loss": 0.0224, "step": 61150 }, { "epoch": 4.272847867066956, "grad_norm": 4.016665935516357, "learning_rate": 8.10047631607528e-06, "loss": 0.0082, "step": 61200 }, { "epoch": 4.276338755847239, "grad_norm": 0.006044914945960045, "learning_rate": 8.061688362062278e-06, "loss": 0.0081, "step": 61250 }, { "epoch": 4.279829644627522, "grad_norm": 6.473517894744873, "learning_rate": 8.022900408049277e-06, "loss": 0.0205, "step": 61300 }, { "epoch": 4.283320533407806, "grad_norm": 0.00561263645067811, "learning_rate": 7.984112454036275e-06, "loss": 0.029, "step": 61350 }, { "epoch": 4.286811422188089, "grad_norm": 0.012270646169781685, "learning_rate": 7.945324500023273e-06, "loss": 0.0207, "step": 61400 }, { "epoch": 4.2903023109683724, "grad_norm": 0.057795651257038116, "learning_rate": 7.90653654601027e-06, "loss": 0.0226, "step": 61450 }, { "epoch": 4.293793199748656, "grad_norm": 0.005627433769404888, "learning_rate": 7.867748591997268e-06, "loss": 0.0125, "step": 61500 }, { "epoch": 4.297284088528939, "grad_norm": 0.021934116259217262, "learning_rate": 7.828960637984269e-06, "loss": 0.0105, "step": 61550 }, { "epoch": 4.3007749773092225, "grad_norm": 6.289938926696777, "learning_rate": 7.790172683971267e-06, "loss": 0.0145, "step": 61600 }, { "epoch": 4.304265866089507, "grad_norm": 0.055504728108644485, "learning_rate": 7.751384729958265e-06, "loss": 0.0116, "step": 61650 }, { "epoch": 4.30775675486979, "grad_norm": 0.23014897108078003, "learning_rate": 7.712596775945264e-06, "loss": 0.0099, "step": 61700 }, { "epoch": 4.3112476436500735, "grad_norm": 0.048537444323301315, "learning_rate": 7.673808821932262e-06, "loss": 0.0141, "step": 61750 }, { "epoch": 4.314738532430357, "grad_norm": 0.0228910893201828, "learning_rate": 7.63502086791926e-06, "loss": 0.009, "step": 61800 }, { "epoch": 4.31822942121064, "grad_norm": 0.04024737700819969, "learning_rate": 7.596232913906258e-06, "loss": 0.0081, "step": 61850 }, { "epoch": 4.321720309990924, "grad_norm": 0.29908424615859985, "learning_rate": 7.557444959893256e-06, "loss": 0.021, "step": 61900 }, { "epoch": 4.325211198771207, "grad_norm": 0.006798740942031145, "learning_rate": 7.518657005880255e-06, "loss": 0.0186, "step": 61950 }, { "epoch": 4.32870208755149, "grad_norm": 0.05872134864330292, "learning_rate": 7.479869051867252e-06, "loss": 0.0273, "step": 62000 }, { "epoch": 4.32870208755149, "eval_f1": 0.9387518142235123, "eval_loss": 0.19446897506713867, "eval_runtime": 26.5965, "eval_samples_per_second": 137.612, "eval_steps_per_second": 4.324, "step": 62000 }, { "epoch": 4.332192976331774, "grad_norm": 0.012227371335029602, "learning_rate": 7.441081097854251e-06, "loss": 0.0096, "step": 62050 }, { "epoch": 4.335683865112058, "grad_norm": 0.06303926557302475, "learning_rate": 7.402293143841249e-06, "loss": 0.0146, "step": 62100 }, { "epoch": 4.339174753892341, "grad_norm": 0.23254437744617462, "learning_rate": 7.363505189828247e-06, "loss": 0.0144, "step": 62150 }, { "epoch": 4.342665642672625, "grad_norm": 0.0949549674987793, "learning_rate": 7.324717235815246e-06, "loss": 0.0051, "step": 62200 }, { "epoch": 4.346156531452908, "grad_norm": 0.016313236206769943, "learning_rate": 7.285929281802243e-06, "loss": 0.0346, "step": 62250 }, { "epoch": 4.349647420233191, "grad_norm": 0.381818950176239, "learning_rate": 7.247141327789242e-06, "loss": 0.0144, "step": 62300 }, { "epoch": 4.353138309013475, "grad_norm": 0.020029937848448753, "learning_rate": 7.20835337377624e-06, "loss": 0.0121, "step": 62350 }, { "epoch": 4.356629197793758, "grad_norm": 0.10468722879886627, "learning_rate": 7.170341178843498e-06, "loss": 0.0144, "step": 62400 }, { "epoch": 4.3601200865740415, "grad_norm": 0.0396365262567997, "learning_rate": 7.1315532248304965e-06, "loss": 0.0188, "step": 62450 }, { "epoch": 4.363610975354325, "grad_norm": 0.23057244718074799, "learning_rate": 7.092765270817495e-06, "loss": 0.0159, "step": 62500 }, { "epoch": 4.367101864134609, "grad_norm": 0.07276740670204163, "learning_rate": 7.053977316804493e-06, "loss": 0.0106, "step": 62550 }, { "epoch": 4.3705927529148925, "grad_norm": 0.016734370961785316, "learning_rate": 7.0151893627914925e-06, "loss": 0.0208, "step": 62600 }, { "epoch": 4.374083641695176, "grad_norm": 0.009831580333411694, "learning_rate": 6.976401408778491e-06, "loss": 0.0222, "step": 62650 }, { "epoch": 4.377574530475459, "grad_norm": 0.17393887042999268, "learning_rate": 6.937613454765489e-06, "loss": 0.0104, "step": 62700 }, { "epoch": 4.3810654192557426, "grad_norm": 0.10413938015699387, "learning_rate": 6.898825500752487e-06, "loss": 0.0107, "step": 62750 }, { "epoch": 4.384556308036026, "grad_norm": 6.596822261810303, "learning_rate": 6.860037546739485e-06, "loss": 0.0124, "step": 62800 }, { "epoch": 4.388047196816309, "grad_norm": 0.0255569014698267, "learning_rate": 6.8212495927264836e-06, "loss": 0.0131, "step": 62850 }, { "epoch": 4.391538085596593, "grad_norm": 0.005997411906719208, "learning_rate": 6.782461638713482e-06, "loss": 0.0194, "step": 62900 }, { "epoch": 4.395028974376876, "grad_norm": 0.031105129048228264, "learning_rate": 6.74367368470048e-06, "loss": 0.0153, "step": 62950 }, { "epoch": 4.39851986315716, "grad_norm": 0.02738456428050995, "learning_rate": 6.704885730687478e-06, "loss": 0.0191, "step": 63000 }, { "epoch": 4.39851986315716, "eval_f1": 0.9386806160999709, "eval_loss": 0.1916884183883667, "eval_runtime": 27.6313, "eval_samples_per_second": 132.459, "eval_steps_per_second": 4.162, "step": 63000 }, { "epoch": 4.402010751937444, "grad_norm": 0.1203995794057846, "learning_rate": 6.666097776674476e-06, "loss": 0.0173, "step": 63050 }, { "epoch": 4.405501640717727, "grad_norm": 12.34914493560791, "learning_rate": 6.627309822661475e-06, "loss": 0.0146, "step": 63100 }, { "epoch": 4.40899252949801, "grad_norm": 10.822372436523438, "learning_rate": 6.588521868648473e-06, "loss": 0.0147, "step": 63150 }, { "epoch": 4.412483418278294, "grad_norm": 0.018758252263069153, "learning_rate": 6.549733914635471e-06, "loss": 0.0136, "step": 63200 }, { "epoch": 4.415974307058577, "grad_norm": 0.007578504737466574, "learning_rate": 6.510945960622469e-06, "loss": 0.0065, "step": 63250 }, { "epoch": 4.41946519583886, "grad_norm": 0.05588122084736824, "learning_rate": 6.472158006609467e-06, "loss": 0.0052, "step": 63300 }, { "epoch": 4.422956084619144, "grad_norm": 0.007861234247684479, "learning_rate": 6.433370052596466e-06, "loss": 0.0196, "step": 63350 }, { "epoch": 4.426446973399427, "grad_norm": 1.294003963470459, "learning_rate": 6.394582098583464e-06, "loss": 0.0164, "step": 63400 }, { "epoch": 4.429937862179711, "grad_norm": 0.416669100522995, "learning_rate": 6.3557941445704624e-06, "loss": 0.0211, "step": 63450 }, { "epoch": 4.433428750959995, "grad_norm": 11.46147346496582, "learning_rate": 6.317006190557461e-06, "loss": 0.0134, "step": 63500 }, { "epoch": 4.436919639740278, "grad_norm": 0.03336512669920921, "learning_rate": 6.278218236544458e-06, "loss": 0.0092, "step": 63550 }, { "epoch": 4.4404105285205615, "grad_norm": 0.0666455328464508, "learning_rate": 6.2394302825314576e-06, "loss": 0.0207, "step": 63600 }, { "epoch": 4.443901417300845, "grad_norm": 0.02921397238969803, "learning_rate": 6.200642328518456e-06, "loss": 0.0189, "step": 63650 }, { "epoch": 4.447392306081128, "grad_norm": 0.02691628783941269, "learning_rate": 6.161854374505454e-06, "loss": 0.0055, "step": 63700 }, { "epoch": 4.450883194861412, "grad_norm": 0.008060934953391552, "learning_rate": 6.123066420492452e-06, "loss": 0.0221, "step": 63750 }, { "epoch": 4.454374083641695, "grad_norm": 0.03594188764691353, "learning_rate": 6.08427846647945e-06, "loss": 0.0089, "step": 63800 }, { "epoch": 4.457864972421978, "grad_norm": 0.6915578246116638, "learning_rate": 6.045490512466449e-06, "loss": 0.0213, "step": 63850 }, { "epoch": 4.461355861202262, "grad_norm": 0.01729670725762844, "learning_rate": 6.006702558453447e-06, "loss": 0.0102, "step": 63900 }, { "epoch": 4.464846749982546, "grad_norm": 0.0669393390417099, "learning_rate": 5.967914604440445e-06, "loss": 0.0253, "step": 63950 }, { "epoch": 4.468337638762829, "grad_norm": 0.006938809063285589, "learning_rate": 5.929126650427443e-06, "loss": 0.0126, "step": 64000 }, { "epoch": 4.468337638762829, "eval_f1": 0.9565217391304348, "eval_loss": 0.13514484465122223, "eval_runtime": 25.6105, "eval_samples_per_second": 142.91, "eval_steps_per_second": 4.49, "step": 64000 }, { "epoch": 4.471828527543113, "grad_norm": 0.051233816891908646, "learning_rate": 5.890338696414441e-06, "loss": 0.0111, "step": 64050 }, { "epoch": 4.475319416323396, "grad_norm": 0.02876574732363224, "learning_rate": 5.8515507424014405e-06, "loss": 0.0217, "step": 64100 }, { "epoch": 4.478810305103679, "grad_norm": 5.748332500457764, "learning_rate": 5.812762788388439e-06, "loss": 0.0104, "step": 64150 }, { "epoch": 4.482301193883963, "grad_norm": 0.004587912000715733, "learning_rate": 5.7739748343754364e-06, "loss": 0.0156, "step": 64200 }, { "epoch": 4.485792082664246, "grad_norm": 0.04000851511955261, "learning_rate": 5.735186880362435e-06, "loss": 0.0202, "step": 64250 }, { "epoch": 4.4892829714445295, "grad_norm": 3.99192476272583, "learning_rate": 5.696398926349433e-06, "loss": 0.0132, "step": 64300 }, { "epoch": 4.492773860224813, "grad_norm": 0.005991567857563496, "learning_rate": 5.6576109723364316e-06, "loss": 0.0149, "step": 64350 }, { "epoch": 4.496264749005097, "grad_norm": 0.05888019874691963, "learning_rate": 5.61882301832343e-06, "loss": 0.0162, "step": 64400 }, { "epoch": 4.4997556377853805, "grad_norm": 0.013234168291091919, "learning_rate": 5.580035064310428e-06, "loss": 0.0192, "step": 64450 }, { "epoch": 4.503246526565664, "grad_norm": 0.00775251816958189, "learning_rate": 5.542022869377686e-06, "loss": 0.0259, "step": 64500 }, { "epoch": 4.506737415345947, "grad_norm": 3.50691556930542, "learning_rate": 5.503234915364685e-06, "loss": 0.0176, "step": 64550 }, { "epoch": 4.5102283041262305, "grad_norm": 0.35396555066108704, "learning_rate": 5.464446961351683e-06, "loss": 0.0148, "step": 64600 }, { "epoch": 4.513719192906514, "grad_norm": 0.01665789633989334, "learning_rate": 5.4256590073386815e-06, "loss": 0.0074, "step": 64650 }, { "epoch": 4.517210081686797, "grad_norm": 0.05178999900817871, "learning_rate": 5.38687105332568e-06, "loss": 0.0095, "step": 64700 }, { "epoch": 4.520700970467081, "grad_norm": 0.1286119818687439, "learning_rate": 5.3480830993126774e-06, "loss": 0.0054, "step": 64750 }, { "epoch": 4.524191859247364, "grad_norm": 17.01955795288086, "learning_rate": 5.309295145299676e-06, "loss": 0.0178, "step": 64800 }, { "epoch": 4.527682748027647, "grad_norm": 0.08323328197002411, "learning_rate": 5.270507191286674e-06, "loss": 0.0115, "step": 64850 }, { "epoch": 4.531173636807932, "grad_norm": 4.3498640060424805, "learning_rate": 5.2317192372736726e-06, "loss": 0.0137, "step": 64900 }, { "epoch": 4.534664525588215, "grad_norm": 4.796928405761719, "learning_rate": 5.192931283260671e-06, "loss": 0.012, "step": 64950 }, { "epoch": 4.538155414368498, "grad_norm": 0.011186789721250534, "learning_rate": 5.1541433292476685e-06, "loss": 0.0096, "step": 65000 }, { "epoch": 4.538155414368498, "eval_f1": 0.9540295119182747, "eval_loss": 0.14705610275268555, "eval_runtime": 22.5602, "eval_samples_per_second": 162.232, "eval_steps_per_second": 5.097, "step": 65000 }, { "epoch": 4.541646303148782, "grad_norm": 0.11445566266775131, "learning_rate": 5.115355375234667e-06, "loss": 0.0255, "step": 65050 }, { "epoch": 4.545137191929065, "grad_norm": 0.2920547425746918, "learning_rate": 5.076567421221665e-06, "loss": 0.0126, "step": 65100 }, { "epoch": 4.548628080709348, "grad_norm": 0.010884485207498074, "learning_rate": 5.0377794672086645e-06, "loss": 0.0084, "step": 65150 }, { "epoch": 4.552118969489632, "grad_norm": 0.004941530991345644, "learning_rate": 4.998991513195663e-06, "loss": 0.0138, "step": 65200 }, { "epoch": 4.555609858269915, "grad_norm": 0.03210439160466194, "learning_rate": 4.96020355918266e-06, "loss": 0.0137, "step": 65250 }, { "epoch": 4.5591007470501985, "grad_norm": 0.16372257471084595, "learning_rate": 4.921415605169659e-06, "loss": 0.009, "step": 65300 }, { "epoch": 4.562591635830483, "grad_norm": 0.01041162759065628, "learning_rate": 4.882627651156657e-06, "loss": 0.0111, "step": 65350 }, { "epoch": 4.566082524610766, "grad_norm": 0.010886666364967823, "learning_rate": 4.8438396971436555e-06, "loss": 0.0065, "step": 65400 }, { "epoch": 4.5695734133910495, "grad_norm": 5.038321495056152, "learning_rate": 4.805051743130654e-06, "loss": 0.0123, "step": 65450 }, { "epoch": 4.573064302171333, "grad_norm": 0.14299964904785156, "learning_rate": 4.7662637891176514e-06, "loss": 0.0104, "step": 65500 }, { "epoch": 4.576555190951616, "grad_norm": 0.07817520946264267, "learning_rate": 4.72747583510465e-06, "loss": 0.0153, "step": 65550 }, { "epoch": 4.5800460797319, "grad_norm": 0.02830136939883232, "learning_rate": 4.688687881091648e-06, "loss": 0.0212, "step": 65600 }, { "epoch": 4.583536968512183, "grad_norm": 0.006670476868748665, "learning_rate": 4.649899927078647e-06, "loss": 0.0134, "step": 65650 }, { "epoch": 4.587027857292466, "grad_norm": 12.660615921020508, "learning_rate": 4.611111973065645e-06, "loss": 0.0125, "step": 65700 }, { "epoch": 4.59051874607275, "grad_norm": 0.030848925933241844, "learning_rate": 4.572324019052643e-06, "loss": 0.0122, "step": 65750 }, { "epoch": 4.594009634853034, "grad_norm": 0.028387153521180153, "learning_rate": 4.533536065039642e-06, "loss": 0.0137, "step": 65800 }, { "epoch": 4.597500523633317, "grad_norm": 0.05481830611824989, "learning_rate": 4.49474811102664e-06, "loss": 0.0243, "step": 65850 }, { "epoch": 4.600991412413601, "grad_norm": 0.021391669288277626, "learning_rate": 4.4559601570136385e-06, "loss": 0.0206, "step": 65900 }, { "epoch": 4.604482301193884, "grad_norm": 0.06175484508275986, "learning_rate": 4.417172203000636e-06, "loss": 0.0182, "step": 65950 }, { "epoch": 4.607973189974167, "grad_norm": 0.10749427229166031, "learning_rate": 4.378384248987634e-06, "loss": 0.0187, "step": 66000 }, { "epoch": 4.607973189974167, "eval_f1": 0.9402205455600696, "eval_loss": 0.2095240354537964, "eval_runtime": 23.1199, "eval_samples_per_second": 158.305, "eval_steps_per_second": 4.974, "step": 66000 }, { "epoch": 4.611464078754451, "grad_norm": 0.009061275981366634, "learning_rate": 4.339596294974633e-06, "loss": 0.0278, "step": 66050 }, { "epoch": 4.614954967534734, "grad_norm": 0.003557862713932991, "learning_rate": 4.300808340961631e-06, "loss": 0.0101, "step": 66100 }, { "epoch": 4.6184458563150175, "grad_norm": 0.05570525676012039, "learning_rate": 4.2620203869486295e-06, "loss": 0.0148, "step": 66150 }, { "epoch": 4.621936745095301, "grad_norm": 26.183717727661133, "learning_rate": 4.223232432935628e-06, "loss": 0.0293, "step": 66200 }, { "epoch": 4.625427633875585, "grad_norm": 0.01294758915901184, "learning_rate": 4.1844444789226254e-06, "loss": 0.014, "step": 66250 }, { "epoch": 4.6289185226558685, "grad_norm": 0.005879008211195469, "learning_rate": 4.146432283989884e-06, "loss": 0.0293, "step": 66300 }, { "epoch": 4.632409411436152, "grad_norm": 0.06964797526597977, "learning_rate": 4.107644329976883e-06, "loss": 0.0088, "step": 66350 }, { "epoch": 4.635900300216435, "grad_norm": 0.07942630350589752, "learning_rate": 4.068856375963881e-06, "loss": 0.0174, "step": 66400 }, { "epoch": 4.6393911889967185, "grad_norm": 0.12276989966630936, "learning_rate": 4.0300684219508795e-06, "loss": 0.0176, "step": 66450 }, { "epoch": 4.642882077777002, "grad_norm": 0.07290726155042648, "learning_rate": 3.991280467937877e-06, "loss": 0.0048, "step": 66500 }, { "epoch": 4.646372966557285, "grad_norm": 5.691318511962891, "learning_rate": 3.952492513924875e-06, "loss": 0.0185, "step": 66550 }, { "epoch": 4.649863855337569, "grad_norm": 0.01649297960102558, "learning_rate": 3.913704559911874e-06, "loss": 0.01, "step": 66600 }, { "epoch": 4.653354744117852, "grad_norm": 0.08620315790176392, "learning_rate": 3.874916605898872e-06, "loss": 0.0143, "step": 66650 }, { "epoch": 4.656845632898136, "grad_norm": 0.26478084921836853, "learning_rate": 3.8361286518858705e-06, "loss": 0.014, "step": 66700 }, { "epoch": 4.66033652167842, "grad_norm": 0.10022763907909393, "learning_rate": 3.7973406978728685e-06, "loss": 0.0032, "step": 66750 }, { "epoch": 4.663827410458703, "grad_norm": 6.073363780975342, "learning_rate": 3.7585527438598673e-06, "loss": 0.0149, "step": 66800 }, { "epoch": 4.667318299238986, "grad_norm": 0.014798264019191265, "learning_rate": 3.7197647898468657e-06, "loss": 0.0086, "step": 66850 }, { "epoch": 4.67080918801927, "grad_norm": 0.0035229383502155542, "learning_rate": 3.6809768358338636e-06, "loss": 0.0184, "step": 66900 }, { "epoch": 4.674300076799553, "grad_norm": 0.008204997517168522, "learning_rate": 3.642188881820862e-06, "loss": 0.0132, "step": 66950 }, { "epoch": 4.677790965579836, "grad_norm": 0.26443013548851013, "learning_rate": 3.6034009278078604e-06, "loss": 0.0175, "step": 67000 }, { "epoch": 4.677790965579836, "eval_f1": 0.9542743538767395, "eval_loss": 0.1465238481760025, "eval_runtime": 19.3418, "eval_samples_per_second": 189.227, "eval_steps_per_second": 5.946, "step": 67000 }, { "epoch": 4.68128185436012, "grad_norm": 0.03244328871369362, "learning_rate": 3.5646129737948583e-06, "loss": 0.0144, "step": 67050 }, { "epoch": 4.684772743140403, "grad_norm": 0.006314130965620279, "learning_rate": 3.5258250197818567e-06, "loss": 0.0162, "step": 67100 }, { "epoch": 4.688263631920687, "grad_norm": 3.670078992843628, "learning_rate": 3.4870370657688547e-06, "loss": 0.0102, "step": 67150 }, { "epoch": 4.691754520700971, "grad_norm": 0.016808710992336273, "learning_rate": 3.448249111755853e-06, "loss": 0.0114, "step": 67200 }, { "epoch": 4.695245409481254, "grad_norm": 7.48440408706665, "learning_rate": 3.4094611577428514e-06, "loss": 0.0159, "step": 67250 }, { "epoch": 4.6987362982615375, "grad_norm": 0.22316499054431915, "learning_rate": 3.3706732037298494e-06, "loss": 0.0077, "step": 67300 }, { "epoch": 4.702227187041821, "grad_norm": 6.541471004486084, "learning_rate": 3.3318852497168478e-06, "loss": 0.015, "step": 67350 }, { "epoch": 4.705718075822104, "grad_norm": 0.038210611790418625, "learning_rate": 3.2930972957038466e-06, "loss": 0.0139, "step": 67400 }, { "epoch": 4.709208964602388, "grad_norm": 0.008333321660757065, "learning_rate": 3.254309341690845e-06, "loss": 0.0192, "step": 67450 }, { "epoch": 4.712699853382671, "grad_norm": 0.031475260853767395, "learning_rate": 3.215521387677843e-06, "loss": 0.0059, "step": 67500 }, { "epoch": 4.716190742162954, "grad_norm": 0.0056376708671450615, "learning_rate": 3.1767334336648413e-06, "loss": 0.0081, "step": 67550 }, { "epoch": 4.719681630943239, "grad_norm": 0.013743482530117035, "learning_rate": 3.1379454796518397e-06, "loss": 0.0144, "step": 67600 }, { "epoch": 4.723172519723522, "grad_norm": 0.009420888498425484, "learning_rate": 3.0991575256388376e-06, "loss": 0.0115, "step": 67650 }, { "epoch": 4.726663408503805, "grad_norm": 0.007564714178442955, "learning_rate": 3.060369571625836e-06, "loss": 0.0119, "step": 67700 }, { "epoch": 4.730154297284089, "grad_norm": 0.13455595076084137, "learning_rate": 3.0215816176128344e-06, "loss": 0.0183, "step": 67750 }, { "epoch": 4.733645186064372, "grad_norm": 0.011826273053884506, "learning_rate": 2.9827936635998324e-06, "loss": 0.0192, "step": 67800 }, { "epoch": 4.737136074844655, "grad_norm": 0.02266635373234749, "learning_rate": 2.944005709586831e-06, "loss": 0.0083, "step": 67850 }, { "epoch": 4.740626963624939, "grad_norm": 2.2667205333709717, "learning_rate": 2.905217755573829e-06, "loss": 0.0208, "step": 67900 }, { "epoch": 4.744117852405222, "grad_norm": 0.08782637864351273, "learning_rate": 2.8664298015608275e-06, "loss": 0.0114, "step": 67950 }, { "epoch": 4.7476087411855055, "grad_norm": 0.013912571594119072, "learning_rate": 2.8276418475478255e-06, "loss": 0.0209, "step": 68000 }, { "epoch": 4.7476087411855055, "eval_f1": 0.9542223485925505, "eval_loss": 0.14947552978992462, "eval_runtime": 20.6326, "eval_samples_per_second": 177.389, "eval_steps_per_second": 5.574, "step": 68000 }, { "epoch": 4.75109962996579, "grad_norm": 0.07238892465829849, "learning_rate": 2.788853893534824e-06, "loss": 0.0115, "step": 68050 }, { "epoch": 4.754590518746073, "grad_norm": 0.007459067273885012, "learning_rate": 2.750065939521822e-06, "loss": 0.0148, "step": 68100 }, { "epoch": 4.758081407526356, "grad_norm": 9.393803596496582, "learning_rate": 2.7112779855088206e-06, "loss": 0.0145, "step": 68150 }, { "epoch": 4.76157229630664, "grad_norm": 0.1944788694381714, "learning_rate": 2.672490031495819e-06, "loss": 0.0166, "step": 68200 }, { "epoch": 4.765063185086923, "grad_norm": 0.086297906935215, "learning_rate": 2.633702077482817e-06, "loss": 0.0128, "step": 68250 }, { "epoch": 4.7685540738672065, "grad_norm": 0.007248733192682266, "learning_rate": 2.5949141234698153e-06, "loss": 0.0138, "step": 68300 }, { "epoch": 4.77204496264749, "grad_norm": 0.06873763352632523, "learning_rate": 2.5561261694568137e-06, "loss": 0.0196, "step": 68350 }, { "epoch": 4.775535851427773, "grad_norm": 0.011037365533411503, "learning_rate": 2.5173382154438116e-06, "loss": 0.0111, "step": 68400 }, { "epoch": 4.779026740208057, "grad_norm": 0.3673991858959198, "learning_rate": 2.4785502614308104e-06, "loss": 0.0147, "step": 68450 }, { "epoch": 4.782517628988341, "grad_norm": 0.014958728104829788, "learning_rate": 2.4397623074178084e-06, "loss": 0.0139, "step": 68500 }, { "epoch": 4.786008517768624, "grad_norm": 0.07658286392688751, "learning_rate": 2.4009743534048068e-06, "loss": 0.0091, "step": 68550 }, { "epoch": 4.789499406548908, "grad_norm": 0.09001337736845016, "learning_rate": 2.362186399391805e-06, "loss": 0.0095, "step": 68600 }, { "epoch": 4.792990295329191, "grad_norm": 0.0031005863565951586, "learning_rate": 2.323398445378803e-06, "loss": 0.0174, "step": 68650 }, { "epoch": 4.796481184109474, "grad_norm": 0.0415862575173378, "learning_rate": 2.2846104913658015e-06, "loss": 0.0038, "step": 68700 }, { "epoch": 4.799972072889758, "grad_norm": 4.582136631011963, "learning_rate": 2.2458225373528e-06, "loss": 0.0197, "step": 68750 }, { "epoch": 4.803462961670041, "grad_norm": 0.02210802398622036, "learning_rate": 2.2070345833397983e-06, "loss": 0.012, "step": 68800 }, { "epoch": 4.806953850450324, "grad_norm": 0.01928221434354782, "learning_rate": 2.1682466293267962e-06, "loss": 0.0144, "step": 68850 }, { "epoch": 4.810444739230608, "grad_norm": 0.016591308638453484, "learning_rate": 2.1294586753137946e-06, "loss": 0.0041, "step": 68900 }, { "epoch": 4.813935628010892, "grad_norm": 5.927182197570801, "learning_rate": 2.090670721300793e-06, "loss": 0.015, "step": 68950 }, { "epoch": 4.817426516791175, "grad_norm": 0.07412146776914597, "learning_rate": 2.051882767287791e-06, "loss": 0.0202, "step": 69000 }, { "epoch": 4.817426516791175, "eval_f1": 0.958063608218407, "eval_loss": 0.13213428854942322, "eval_runtime": 23.7215, "eval_samples_per_second": 154.29, "eval_steps_per_second": 4.848, "step": 69000 }, { "epoch": 4.820917405571459, "grad_norm": 0.011551358737051487, "learning_rate": 2.0130948132747897e-06, "loss": 0.0106, "step": 69050 }, { "epoch": 4.824408294351742, "grad_norm": 0.041600409895181656, "learning_rate": 1.9743068592617877e-06, "loss": 0.028, "step": 69100 }, { "epoch": 4.8278991831320255, "grad_norm": 0.011347083374857903, "learning_rate": 1.935518905248786e-06, "loss": 0.0197, "step": 69150 }, { "epoch": 4.831390071912309, "grad_norm": 0.1340709775686264, "learning_rate": 1.8967309512357842e-06, "loss": 0.0147, "step": 69200 }, { "epoch": 4.834880960692592, "grad_norm": 0.013690965250134468, "learning_rate": 1.8579429972227824e-06, "loss": 0.0118, "step": 69250 }, { "epoch": 4.838371849472876, "grad_norm": 7.221401214599609, "learning_rate": 1.8191550432097808e-06, "loss": 0.0314, "step": 69300 }, { "epoch": 4.841862738253159, "grad_norm": 0.0063622863963246346, "learning_rate": 1.7803670891967792e-06, "loss": 0.011, "step": 69350 }, { "epoch": 4.845353627033443, "grad_norm": 0.09204477816820145, "learning_rate": 1.7415791351837776e-06, "loss": 0.0089, "step": 69400 }, { "epoch": 4.8488445158137266, "grad_norm": 0.2040203958749771, "learning_rate": 1.7027911811707757e-06, "loss": 0.0019, "step": 69450 }, { "epoch": 4.85233540459401, "grad_norm": 0.009934721514582634, "learning_rate": 1.6640032271577739e-06, "loss": 0.0226, "step": 69500 }, { "epoch": 4.855826293374293, "grad_norm": 0.009200035594403744, "learning_rate": 1.625215273144772e-06, "loss": 0.0202, "step": 69550 }, { "epoch": 4.859317182154577, "grad_norm": 2.9772799015045166, "learning_rate": 1.5864273191317704e-06, "loss": 0.0267, "step": 69600 }, { "epoch": 4.86280807093486, "grad_norm": 0.046260032802820206, "learning_rate": 1.5476393651187688e-06, "loss": 0.0111, "step": 69650 }, { "epoch": 4.866298959715143, "grad_norm": 0.5659658312797546, "learning_rate": 1.509627170186027e-06, "loss": 0.024, "step": 69700 }, { "epoch": 4.869789848495427, "grad_norm": 0.029453273862600327, "learning_rate": 1.4708392161730255e-06, "loss": 0.0206, "step": 69750 }, { "epoch": 4.87328073727571, "grad_norm": 0.06688568741083145, "learning_rate": 1.4320512621600236e-06, "loss": 0.0081, "step": 69800 }, { "epoch": 4.876771626055994, "grad_norm": 0.09227997809648514, "learning_rate": 1.393263308147022e-06, "loss": 0.0158, "step": 69850 }, { "epoch": 4.880262514836277, "grad_norm": 0.01653113402426243, "learning_rate": 1.3544753541340202e-06, "loss": 0.012, "step": 69900 }, { "epoch": 4.883753403616561, "grad_norm": 0.1240030974149704, "learning_rate": 1.3156874001210183e-06, "loss": 0.0042, "step": 69950 }, { "epoch": 4.887244292396844, "grad_norm": 0.05169876664876938, "learning_rate": 1.2768994461080167e-06, "loss": 0.0181, "step": 70000 }, { "epoch": 4.887244292396844, "eval_f1": 0.9547652916073969, "eval_loss": 0.14620168507099152, "eval_runtime": 18.4218, "eval_samples_per_second": 198.678, "eval_steps_per_second": 6.243, "step": 70000 }, { "epoch": 4.890735181177128, "grad_norm": 0.005993202794343233, "learning_rate": 1.238111492095015e-06, "loss": 0.0088, "step": 70050 }, { "epoch": 4.894226069957411, "grad_norm": 0.014470288529992104, "learning_rate": 1.1993235380820133e-06, "loss": 0.0226, "step": 70100 }, { "epoch": 4.8977169587376945, "grad_norm": 4.7974162101745605, "learning_rate": 1.1605355840690116e-06, "loss": 0.0145, "step": 70150 }, { "epoch": 4.901207847517978, "grad_norm": 0.010994662530720234, "learning_rate": 1.1217476300560098e-06, "loss": 0.0174, "step": 70200 }, { "epoch": 4.904698736298261, "grad_norm": 0.006302454508841038, "learning_rate": 1.082959676043008e-06, "loss": 0.0128, "step": 70250 }, { "epoch": 4.908189625078545, "grad_norm": 0.1383901685476303, "learning_rate": 1.0441717220300066e-06, "loss": 0.016, "step": 70300 }, { "epoch": 4.911680513858828, "grad_norm": 0.06532856822013855, "learning_rate": 1.0053837680170047e-06, "loss": 0.0118, "step": 70350 }, { "epoch": 4.915171402639112, "grad_norm": 0.010423216037452221, "learning_rate": 9.66595814004003e-07, "loss": 0.0085, "step": 70400 }, { "epoch": 4.918662291419396, "grad_norm": 0.030837245285511017, "learning_rate": 9.278078599910013e-07, "loss": 0.015, "step": 70450 }, { "epoch": 4.922153180199679, "grad_norm": 0.025385385379195213, "learning_rate": 8.890199059779996e-07, "loss": 0.005, "step": 70500 }, { "epoch": 4.925644068979962, "grad_norm": 4.04787540435791, "learning_rate": 8.502319519649977e-07, "loss": 0.0135, "step": 70550 }, { "epoch": 4.929134957760246, "grad_norm": 0.018447715789079666, "learning_rate": 8.114439979519961e-07, "loss": 0.0159, "step": 70600 }, { "epoch": 4.932625846540529, "grad_norm": 0.5685107707977295, "learning_rate": 7.726560439389944e-07, "loss": 0.0022, "step": 70650 }, { "epoch": 4.936116735320812, "grad_norm": 0.05199820548295975, "learning_rate": 7.338680899259926e-07, "loss": 0.0176, "step": 70700 }, { "epoch": 4.939607624101096, "grad_norm": 0.18712732195854187, "learning_rate": 6.950801359129908e-07, "loss": 0.0236, "step": 70750 }, { "epoch": 4.943098512881379, "grad_norm": 0.009691745974123478, "learning_rate": 6.562921818999892e-07, "loss": 0.0238, "step": 70800 }, { "epoch": 4.946589401661663, "grad_norm": 0.0077186268754303455, "learning_rate": 6.175042278869874e-07, "loss": 0.0196, "step": 70850 }, { "epoch": 4.950080290441947, "grad_norm": 0.015092890709638596, "learning_rate": 5.787162738739857e-07, "loss": 0.0166, "step": 70900 }, { "epoch": 4.95357117922223, "grad_norm": 0.01764654368162155, "learning_rate": 5.39928319860984e-07, "loss": 0.0084, "step": 70950 }, { "epoch": 4.9570620680025135, "grad_norm": 0.030939076095819473, "learning_rate": 5.011403658479822e-07, "loss": 0.0067, "step": 71000 }, { "epoch": 4.9570620680025135, "eval_f1": 0.9526525955504849, "eval_loss": 0.150511234998703, "eval_runtime": 19.3329, "eval_samples_per_second": 189.315, "eval_steps_per_second": 5.948, "step": 71000 }, { "epoch": 4.960552956782797, "grad_norm": 0.016753925010561943, "learning_rate": 4.6235241183498053e-07, "loss": 0.0057, "step": 71050 }, { "epoch": 4.96404384556308, "grad_norm": 0.01155967079102993, "learning_rate": 4.2356445782197886e-07, "loss": 0.0089, "step": 71100 }, { "epoch": 4.967534734343364, "grad_norm": 0.04415968805551529, "learning_rate": 3.847765038089771e-07, "loss": 0.01, "step": 71150 }, { "epoch": 4.971025623123647, "grad_norm": 4.265117645263672, "learning_rate": 3.4598854979597536e-07, "loss": 0.017, "step": 71200 }, { "epoch": 4.97451651190393, "grad_norm": 0.02736731618642807, "learning_rate": 3.072005957829737e-07, "loss": 0.007, "step": 71250 }, { "epoch": 4.9780074006842145, "grad_norm": 0.029399313032627106, "learning_rate": 2.6841264176997196e-07, "loss": 0.0195, "step": 71300 }, { "epoch": 4.981498289464498, "grad_norm": 0.006107365246862173, "learning_rate": 2.296246877569702e-07, "loss": 0.0016, "step": 71350 }, { "epoch": 4.984989178244781, "grad_norm": 3.961512565612793, "learning_rate": 1.9083673374396848e-07, "loss": 0.0087, "step": 71400 }, { "epoch": 4.988480067025065, "grad_norm": 0.02385067753493786, "learning_rate": 1.5204877973096676e-07, "loss": 0.0261, "step": 71450 }, { "epoch": 4.991970955805348, "grad_norm": 0.15537860989570618, "learning_rate": 1.1326082571796503e-07, "loss": 0.0056, "step": 71500 }, { "epoch": 4.995461844585631, "grad_norm": 9.23886489868164, "learning_rate": 7.44728717049633e-08, "loss": 0.018, "step": 71550 }, { "epoch": 4.998952733365915, "grad_norm": 3.6488893032073975, "learning_rate": 3.5684917691961585e-08, "loss": 0.0126, "step": 71600 }, { "epoch": 5.0, "step": 71615, "total_flos": 1.2034503255440461e+20, "train_loss": 0.03158619465179086, "train_runtime": 20070.0469, "train_samples_per_second": 114.177, "train_steps_per_second": 3.568 } ], "logging_steps": 50, "max_steps": 71615, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2034503255440461e+20, "train_batch_size": 32, "trial_name": null, "trial_params": null }