diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,6 +1,6 @@ { - "best_metric": 0.7555555555555555, - "best_model_checkpoint": "CTMAE-P2-V4-S2/checkpoint-8091", + "best_metric": 0.7333333333333333, + "best_model_checkpoint": "CTMAE-P2-V4-S2/checkpoint-10440", "epoch": 49.02, "eval_steps": 500, "global_step": 13050, @@ -10,8232 +10,8232 @@ "log_history": [ { "epoch": 0.0007662835249042146, - "grad_norm": 5.652839660644531, + "grad_norm": 6.34381103515625, "learning_rate": 7.662835249042146e-08, - "loss": 0.6911, + "loss": 0.6932, "step": 10 }, { "epoch": 0.0015325670498084292, - "grad_norm": 6.038414001464844, + "grad_norm": 5.316327095031738, "learning_rate": 1.5325670498084292e-07, - "loss": 0.686, + "loss": 0.6874, "step": 20 }, { "epoch": 0.0022988505747126436, - "grad_norm": 7.407590389251709, + "grad_norm": 5.252355098724365, "learning_rate": 2.2988505747126437e-07, - "loss": 0.6875, + "loss": 0.6947, "step": 30 }, { "epoch": 0.0030651340996168583, - "grad_norm": 5.204799175262451, + "grad_norm": 5.454947471618652, "learning_rate": 3.0651340996168583e-07, - "loss": 0.6794, + "loss": 0.6838, "step": 40 }, { "epoch": 0.0038314176245210726, - "grad_norm": 5.656806945800781, + "grad_norm": 4.9687113761901855, "learning_rate": 3.831417624521073e-07, - "loss": 0.6654, + "loss": 0.6756, "step": 50 }, { "epoch": 0.004597701149425287, - "grad_norm": 6.95879602432251, + "grad_norm": 5.919829845428467, "learning_rate": 4.5977011494252875e-07, - "loss": 0.6936, + "loss": 0.6861, "step": 60 }, { "epoch": 0.0053639846743295016, - "grad_norm": 6.637821197509766, + "grad_norm": 5.693767070770264, "learning_rate": 5.363984674329502e-07, - "loss": 0.6446, + "loss": 0.6608, "step": 70 }, { "epoch": 0.006130268199233717, - "grad_norm": 7.103973865509033, + "grad_norm": 4.888784885406494, "learning_rate": 6.130268199233717e-07, - "loss": 0.673, + "loss": 0.6744, "step": 80 }, { "epoch": 0.006896551724137931, - "grad_norm": 6.592593193054199, + "grad_norm": 4.942636489868164, "learning_rate": 6.896551724137931e-07, - "loss": 0.6082, + "loss": 0.6248, "step": 90 }, { "epoch": 0.007662835249042145, - "grad_norm": 11.525618553161621, + "grad_norm": 7.797171592712402, "learning_rate": 7.662835249042146e-07, - "loss": 0.7945, + "loss": 0.7431, "step": 100 }, { "epoch": 0.00842911877394636, - "grad_norm": 6.677394866943359, + "grad_norm": 4.698269367218018, "learning_rate": 8.429118773946361e-07, - "loss": 0.5628, + "loss": 0.6016, "step": 110 }, { "epoch": 0.009195402298850575, - "grad_norm": 16.042999267578125, + "grad_norm": 10.742891311645508, "learning_rate": 9.195402298850575e-07, - "loss": 0.5443, + "loss": 0.6037, "step": 120 }, { "epoch": 0.00996168582375479, - "grad_norm": 24.803476333618164, + "grad_norm": 4.607847213745117, "learning_rate": 9.96168582375479e-07, - "loss": 0.4339, + "loss": 0.4934, "step": 130 }, { "epoch": 0.010727969348659003, - "grad_norm": 6.741539478302002, + "grad_norm": 11.38501262664795, "learning_rate": 1.0727969348659004e-06, - "loss": 0.755, + "loss": 0.7138, "step": 140 }, { "epoch": 0.011494252873563218, - "grad_norm": 31.296611785888672, + "grad_norm": 21.715641021728516, "learning_rate": 1.1494252873563219e-06, - "loss": 0.4921, + "loss": 0.5165, "step": 150 }, { "epoch": 0.012260536398467433, - "grad_norm": 40.26383972167969, + "grad_norm": 39.267356872558594, "learning_rate": 1.2260536398467433e-06, - "loss": 0.5677, + "loss": 0.5534, "step": 160 }, { "epoch": 0.013026819923371647, - "grad_norm": 4.158061504364014, + "grad_norm": 8.496562957763672, "learning_rate": 1.3026819923371648e-06, - "loss": 0.6033, + "loss": 0.5268, "step": 170 }, { "epoch": 0.013793103448275862, - "grad_norm": 1.5700534582138062, + "grad_norm": 3.9400100708007812, "learning_rate": 1.3793103448275862e-06, - "loss": 0.6387, + "loss": 0.5656, "step": 180 }, { "epoch": 0.014559386973180077, - "grad_norm": 1.463417887687683, + "grad_norm": 5.03970193862915, "learning_rate": 1.455938697318008e-06, - "loss": 1.0985, + "loss": 0.9341, "step": 190 }, { "epoch": 0.01532567049808429, - "grad_norm": 1.8321679830551147, + "grad_norm": 6.012237548828125, "learning_rate": 1.5325670498084292e-06, - "loss": 1.1611, + "loss": 0.9896, "step": 200 }, { "epoch": 0.016091954022988506, - "grad_norm": 1.387742042541504, + "grad_norm": 4.443873882293701, "learning_rate": 1.6091954022988506e-06, - "loss": 1.1747, + "loss": 0.9845, "step": 210 }, { "epoch": 0.01685823754789272, - "grad_norm": 1.9803731441497803, + "grad_norm": 5.772376537322998, "learning_rate": 1.6858237547892723e-06, - "loss": 1.7274, + "loss": 1.5395, "step": 220 }, { "epoch": 0.017624521072796936, - "grad_norm": 0.4964846670627594, + "grad_norm": 2.1801087856292725, "learning_rate": 1.7624521072796935e-06, - "loss": 0.0131, + "loss": 0.0262, "step": 230 }, { "epoch": 0.01839080459770115, - "grad_norm": 0.2209441065788269, + "grad_norm": 0.44824352860450745, "learning_rate": 1.839080459770115e-06, - "loss": 0.4633, + "loss": 0.3738, "step": 240 }, { "epoch": 0.019157088122605363, - "grad_norm": 0.2584649622440338, + "grad_norm": 0.558213472366333, "learning_rate": 1.9157088122605367e-06, - "loss": 1.5895, + "loss": 1.4904, "step": 250 }, { "epoch": 0.01992337164750958, - "grad_norm": 0.6102164387702942, + "grad_norm": 0.43040409684181213, "learning_rate": 1.992337164750958e-06, - "loss": 1.5916, + "loss": 1.5797, "step": 260 }, { "epoch": 0.02, "eval_accuracy": 0.5555555555555556, - "eval_loss": 2.1703546047210693, - "eval_runtime": 19.5729, - "eval_samples_per_second": 2.299, - "eval_steps_per_second": 2.299, + "eval_loss": 2.3909504413604736, + "eval_runtime": 18.3899, + "eval_samples_per_second": 2.447, + "eval_steps_per_second": 2.447, "step": 261 }, { "epoch": 1.0006896551724138, - "grad_norm": 0.2907000184059143, + "grad_norm": 0.5485615134239197, "learning_rate": 2.0689655172413796e-06, - "loss": 1.075, + "loss": 1.1079, "step": 270 }, { "epoch": 1.001455938697318, - "grad_norm": 40.891265869140625, + "grad_norm": 87.26502227783203, "learning_rate": 2.145593869731801e-06, - "loss": 1.5405, + "loss": 1.6445, "step": 280 }, { "epoch": 1.0022222222222221, - "grad_norm": 0.36645081639289856, + "grad_norm": 0.6385524272918701, "learning_rate": 2.222222222222222e-06, - "loss": 1.0165, + "loss": 1.0985, "step": 290 }, { "epoch": 1.0029885057471264, - "grad_norm": 1.2434968948364258, + "grad_norm": 0.628416895866394, "learning_rate": 2.2988505747126437e-06, - "loss": 1.0131, + "loss": 1.0926, "step": 300 }, { "epoch": 1.0037547892720307, - "grad_norm": 0.46751731634140015, + "grad_norm": 0.703681468963623, "learning_rate": 2.3754789272030654e-06, - "loss": 2.4679, + "loss": 2.76, "step": 310 }, { "epoch": 1.004521072796935, - "grad_norm": 1.0821744203567505, + "grad_norm": 1.0094643831253052, "learning_rate": 2.4521072796934867e-06, - "loss": 1.3531, + "loss": 1.52, "step": 320 }, { "epoch": 1.0052873563218392, - "grad_norm": 9.356194496154785, + "grad_norm": 0.5250710248947144, "learning_rate": 2.5287356321839083e-06, - "loss": 1.7448, + "loss": 1.9831, "step": 330 }, { "epoch": 1.0060536398467432, - "grad_norm": 0.4335194230079651, + "grad_norm": 0.8657022714614868, "learning_rate": 2.6053639846743296e-06, - "loss": 0.9566, + "loss": 1.027, "step": 340 }, { "epoch": 1.0068199233716475, - "grad_norm": 14.799612998962402, + "grad_norm": 0.6573832631111145, "learning_rate": 2.6819923371647512e-06, - "loss": 1.8549, + "loss": 2.0993, "step": 350 }, { "epoch": 1.0075862068965518, - "grad_norm": 73.6395263671875, + "grad_norm": 73.20822143554688, "learning_rate": 2.7586206896551725e-06, - "loss": 1.3878, + "loss": 1.627, "step": 360 }, { "epoch": 1.008352490421456, - "grad_norm": 0.5049798488616943, + "grad_norm": 0.48767930269241333, "learning_rate": 2.835249042145594e-06, - "loss": 1.9485, + "loss": 2.0081, "step": 370 }, { "epoch": 1.00911877394636, - "grad_norm": 0.25192803144454956, + "grad_norm": 0.19195550680160522, "learning_rate": 2.911877394636016e-06, - "loss": 0.5163, + "loss": 0.5922, "step": 380 }, { "epoch": 1.0098850574712643, - "grad_norm": 55.14847183227539, + "grad_norm": 117.13427734375, "learning_rate": 2.988505747126437e-06, - "loss": 2.538, + "loss": 2.7542, "step": 390 }, { "epoch": 1.0106513409961686, - "grad_norm": 0.3945120871067047, + "grad_norm": 0.4038996696472168, "learning_rate": 3.0651340996168583e-06, - "loss": 0.4539, + "loss": 0.5338, "step": 400 }, { "epoch": 1.0114176245210729, - "grad_norm": 35.96253204345703, + "grad_norm": 79.3775405883789, "learning_rate": 3.14176245210728e-06, - "loss": 1.0977, + "loss": 1.0984, "step": 410 }, { "epoch": 1.012183908045977, - "grad_norm": 0.38254401087760925, + "grad_norm": 0.4100040793418884, "learning_rate": 3.2183908045977012e-06, - "loss": 1.5617, + "loss": 1.5991, "step": 420 }, { "epoch": 1.0129501915708812, - "grad_norm": 0.5673269033432007, + "grad_norm": 0.5613195896148682, "learning_rate": 3.295019157088123e-06, - "loss": 1.4325, + "loss": 1.5589, "step": 430 }, { "epoch": 1.0137164750957854, - "grad_norm": 0.4337630867958069, + "grad_norm": 0.281681627035141, "learning_rate": 3.3716475095785446e-06, - "loss": 1.4372, + "loss": 1.5447, "step": 440 }, { "epoch": 1.0144827586206897, - "grad_norm": 0.1940333992242813, + "grad_norm": 0.20483756065368652, "learning_rate": 3.448275862068966e-06, - "loss": 0.0075, + "loss": 0.0046, "step": 450 }, { "epoch": 1.015249042145594, - "grad_norm": 0.26588886976242065, + "grad_norm": 0.26011019945144653, "learning_rate": 3.524904214559387e-06, - "loss": 1.0735, + "loss": 1.2156, "step": 460 }, { "epoch": 1.016015325670498, - "grad_norm": 0.24096156656742096, + "grad_norm": 0.6013814210891724, "learning_rate": 3.6015325670498087e-06, - "loss": 1.6232, + "loss": 1.7069, "step": 470 }, { "epoch": 1.0167816091954023, - "grad_norm": 2.214542865753174, + "grad_norm": 0.8164659738540649, "learning_rate": 3.67816091954023e-06, - "loss": 1.9241, + "loss": 1.9502, "step": 480 }, { "epoch": 1.0175478927203065, - "grad_norm": 0.8996695876121521, + "grad_norm": 0.6838377118110657, "learning_rate": 3.7547892720306517e-06, - "loss": 1.7149, + "loss": 1.9397, "step": 490 }, { "epoch": 1.0183141762452108, - "grad_norm": 0.43199461698532104, + "grad_norm": 0.31821244955062866, "learning_rate": 3.831417624521073e-06, - "loss": 0.401, + "loss": 0.4382, "step": 500 }, { "epoch": 1.0190804597701149, - "grad_norm": 0.2488986998796463, + "grad_norm": 0.13592922687530518, "learning_rate": 3.908045977011495e-06, - "loss": 0.4625, + "loss": 0.5357, "step": 510 }, { "epoch": 1.0198467432950191, - "grad_norm": 0.297882080078125, + "grad_norm": 0.24643361568450928, "learning_rate": 3.984674329501916e-06, - "loss": 0.5716, + "loss": 0.6074, "step": 520 }, { "epoch": 1.02, "eval_accuracy": 0.5555555555555556, - "eval_loss": 2.521507978439331, - "eval_runtime": 18.5535, - "eval_samples_per_second": 2.425, - "eval_steps_per_second": 2.425, + "eval_loss": 2.6630847454071045, + "eval_runtime": 18.1273, + "eval_samples_per_second": 2.482, + "eval_steps_per_second": 2.482, "step": 522 }, { "epoch": 2.0006130268199236, - "grad_norm": 43.404659271240234, + "grad_norm": 63.87251281738281, "learning_rate": 4.0613026819923375e-06, - "loss": 1.6952, + "loss": 1.8009, "step": 530 }, { "epoch": 2.0013793103448276, - "grad_norm": 3.141453981399536, + "grad_norm": 5.836681842803955, "learning_rate": 4.137931034482759e-06, - "loss": 2.2189, + "loss": 2.2765, "step": 540 }, { "epoch": 2.0021455938697317, - "grad_norm": 40.19944763183594, + "grad_norm": 43.2210807800293, "learning_rate": 4.214559386973181e-06, - "loss": 0.7344, + "loss": 0.8407, "step": 550 }, { "epoch": 2.002911877394636, - "grad_norm": 43.29054260253906, + "grad_norm": 40.42451858520508, "learning_rate": 4.291187739463602e-06, - "loss": 2.1861, + "loss": 2.2198, "step": 560 }, { "epoch": 2.00367816091954, - "grad_norm": 1.5995831489562988, + "grad_norm": 0.9370375275611877, "learning_rate": 4.367816091954023e-06, - "loss": 1.4424, + "loss": 1.514, "step": 570 }, { "epoch": 2.0044444444444443, - "grad_norm": 0.28506144881248474, + "grad_norm": 0.25866493582725525, "learning_rate": 4.444444444444444e-06, - "loss": 1.2751, + "loss": 1.3263, "step": 580 }, { "epoch": 2.0052107279693487, - "grad_norm": 0.22614559531211853, + "grad_norm": 0.1699424386024475, "learning_rate": 4.521072796934866e-06, - "loss": 0.9596, + "loss": 0.9733, "step": 590 }, { "epoch": 2.005977011494253, - "grad_norm": 0.5259389877319336, + "grad_norm": 0.5139633417129517, "learning_rate": 4.5977011494252875e-06, - "loss": 2.0754, + "loss": 2.2573, "step": 600 }, { "epoch": 2.0067432950191573, - "grad_norm": 0.6116517186164856, + "grad_norm": 0.42055729031562805, "learning_rate": 4.674329501915709e-06, - "loss": 0.3457, + "loss": 0.3669, "step": 610 }, { "epoch": 2.0075095785440613, - "grad_norm": 0.12475229054689407, + "grad_norm": 0.19771786034107208, "learning_rate": 4.750957854406131e-06, - "loss": 0.9694, + "loss": 0.9309, "step": 620 }, { "epoch": 2.0082758620689654, - "grad_norm": 0.0920993834733963, + "grad_norm": 0.10925453156232834, "learning_rate": 4.8275862068965525e-06, - "loss": 0.0035, + "loss": 0.0036, "step": 630 }, { "epoch": 2.00904214559387, - "grad_norm": 38.45577621459961, + "grad_norm": 58.28190231323242, "learning_rate": 4.904214559386973e-06, - "loss": 2.3353, + "loss": 2.3747, "step": 640 }, { "epoch": 2.009808429118774, - "grad_norm": 35.26827621459961, + "grad_norm": 55.52565002441406, "learning_rate": 4.980842911877395e-06, - "loss": 0.9852, + "loss": 0.9456, "step": 650 }, { "epoch": 2.0105747126436784, - "grad_norm": 0.34033599495887756, + "grad_norm": 0.11590640991926193, "learning_rate": 5.057471264367817e-06, - "loss": 0.4001, + "loss": 0.4773, "step": 660 }, { "epoch": 2.0113409961685824, - "grad_norm": 33.48397445678711, + "grad_norm": 30.875436782836914, "learning_rate": 5.134099616858238e-06, - "loss": 0.9924, + "loss": 1.11, "step": 670 }, { "epoch": 2.0121072796934865, - "grad_norm": 44.040733337402344, + "grad_norm": 30.847183227539062, "learning_rate": 5.210727969348659e-06, - "loss": 1.8575, + "loss": 1.8591, "step": 680 }, { "epoch": 2.012873563218391, - "grad_norm": 0.6051804423332214, + "grad_norm": 0.4506951868534088, "learning_rate": 5.287356321839081e-06, - "loss": 0.4717, + "loss": 0.4323, "step": 690 }, { "epoch": 2.013639846743295, - "grad_norm": 34.99502182006836, + "grad_norm": 32.03398513793945, "learning_rate": 5.3639846743295025e-06, - "loss": 1.4708, + "loss": 1.4884, "step": 700 }, { "epoch": 2.014406130268199, - "grad_norm": 0.34523364901542664, + "grad_norm": 0.39289671182632446, "learning_rate": 5.440613026819924e-06, - "loss": 1.0658, + "loss": 1.0794, "step": 710 }, { "epoch": 2.0151724137931035, - "grad_norm": 0.25993669033050537, + "grad_norm": 0.3561428487300873, "learning_rate": 5.517241379310345e-06, - "loss": 1.9357, + "loss": 1.876, "step": 720 }, { "epoch": 2.0159386973180076, - "grad_norm": 73.50279998779297, + "grad_norm": 55.84814453125, "learning_rate": 5.593869731800766e-06, - "loss": 1.7523, + "loss": 1.5571, "step": 730 }, { "epoch": 2.016704980842912, - "grad_norm": 0.3800812363624573, + "grad_norm": 0.28972840309143066, "learning_rate": 5.670498084291188e-06, - "loss": 0.4206, + "loss": 0.4212, "step": 740 }, { "epoch": 2.017471264367816, - "grad_norm": 36.452369689941406, + "grad_norm": 30.838090896606445, "learning_rate": 5.747126436781609e-06, - "loss": 0.5412, + "loss": 0.5633, "step": 750 }, { "epoch": 2.01823754789272, - "grad_norm": 0.13314314186573029, + "grad_norm": 0.15926188230514526, "learning_rate": 5.823754789272032e-06, - "loss": 0.563, + "loss": 0.5576, "step": 760 }, { "epoch": 2.0190038314176246, - "grad_norm": 0.1593407392501831, + "grad_norm": 0.16359524428844452, "learning_rate": 5.9003831417624525e-06, - "loss": 1.0607, + "loss": 1.0708, "step": 770 }, { "epoch": 2.0197701149425287, - "grad_norm": 0.49019119143486023, + "grad_norm": 0.31254881620407104, "learning_rate": 5.977011494252874e-06, - "loss": 1.5226, + "loss": 1.5457, "step": 780 }, { "epoch": 2.02, "eval_accuracy": 0.5555555555555556, - "eval_loss": 1.977901577949524, - "eval_runtime": 17.5386, - "eval_samples_per_second": 2.566, - "eval_steps_per_second": 2.566, + "eval_loss": 2.003734588623047, + "eval_runtime": 17.2379, + "eval_samples_per_second": 2.611, + "eval_steps_per_second": 2.611, "step": 783 }, { "epoch": 3.000536398467433, - "grad_norm": 43.794677734375, + "grad_norm": 39.901973724365234, "learning_rate": 6.053639846743296e-06, - "loss": 1.2914, + "loss": 1.3281, "step": 790 }, { "epoch": 3.001302681992337, - "grad_norm": 0.5036290287971497, + "grad_norm": 0.21426962316036224, "learning_rate": 6.130268199233717e-06, - "loss": 0.5029, + "loss": 0.4621, "step": 800 }, { "epoch": 3.0020689655172412, - "grad_norm": 0.2458069622516632, + "grad_norm": 0.22403688728809357, "learning_rate": 6.206896551724138e-06, - "loss": 1.0423, + "loss": 1.0494, "step": 810 }, { "epoch": 3.0028352490421457, - "grad_norm": 0.1924199014902115, + "grad_norm": 0.14000019431114197, "learning_rate": 6.28352490421456e-06, - "loss": 1.02, + "loss": 1.0793, "step": 820 }, { "epoch": 3.0036015325670498, - "grad_norm": 0.5810611248016357, + "grad_norm": 0.4416393041610718, "learning_rate": 6.360153256704982e-06, - "loss": 1.3787, + "loss": 1.4922, "step": 830 }, { "epoch": 3.004367816091954, - "grad_norm": 41.79328155517578, + "grad_norm": 28.39811897277832, "learning_rate": 6.4367816091954025e-06, - "loss": 0.9023, + "loss": 0.9289, "step": 840 }, { "epoch": 3.0051340996168583, - "grad_norm": 0.6982182860374451, + "grad_norm": 0.5478395819664001, "learning_rate": 6.513409961685824e-06, - "loss": 2.0055, + "loss": 1.8741, "step": 850 }, { "epoch": 3.0059003831417623, - "grad_norm": 0.6013248562812805, + "grad_norm": 0.5628589987754822, "learning_rate": 6.590038314176246e-06, - "loss": 0.8278, + "loss": 0.7966, "step": 860 }, { "epoch": 3.006666666666667, - "grad_norm": 0.21289119124412537, + "grad_norm": 0.18930064141750336, "learning_rate": 6.666666666666667e-06, - "loss": 1.4199, + "loss": 1.3296, "step": 870 }, { "epoch": 3.007432950191571, - "grad_norm": 0.13905803859233856, + "grad_norm": 0.16223835945129395, "learning_rate": 6.743295019157089e-06, - "loss": 0.0079, + "loss": 0.0094, "step": 880 }, { "epoch": 3.008199233716475, - "grad_norm": 34.4211311340332, + "grad_norm": 28.012046813964844, "learning_rate": 6.81992337164751e-06, - "loss": 1.1715, + "loss": 1.1305, "step": 890 }, { "epoch": 3.0089655172413794, - "grad_norm": 0.5754991173744202, + "grad_norm": 0.38224461674690247, "learning_rate": 6.896551724137932e-06, - "loss": 1.5533, + "loss": 1.5438, "step": 900 }, { "epoch": 3.0097318007662834, - "grad_norm": 0.08667688071727753, + "grad_norm": 0.15222999453544617, "learning_rate": 6.973180076628353e-06, - "loss": 0.9377, + "loss": 0.9214, "step": 910 }, { "epoch": 3.010498084291188, - "grad_norm": 19.656044006347656, + "grad_norm": 27.904842376708984, "learning_rate": 7.049808429118774e-06, - "loss": 1.9566, + "loss": 2.3761, "step": 920 }, { "epoch": 3.011264367816092, - "grad_norm": 0.5167022943496704, + "grad_norm": 0.3715040981769562, "learning_rate": 7.126436781609196e-06, - "loss": 0.0934, + "loss": 0.0268, "step": 930 }, { "epoch": 3.012030651340996, - "grad_norm": 42.75983810424805, + "grad_norm": 28.796871185302734, "learning_rate": 7.2030651340996175e-06, - "loss": 1.8835, + "loss": 1.9196, "step": 940 }, { "epoch": 3.0127969348659005, - "grad_norm": 0.5700867176055908, + "grad_norm": 0.3207364082336426, "learning_rate": 7.279693486590039e-06, - "loss": 0.8627, + "loss": 0.9359, "step": 950 }, { "epoch": 3.0135632183908045, - "grad_norm": 29.320573806762695, + "grad_norm": 26.651212692260742, "learning_rate": 7.35632183908046e-06, - "loss": 1.0258, + "loss": 0.9335, "step": 960 }, { "epoch": 3.014329501915709, - "grad_norm": 29.65011215209961, + "grad_norm": 25.842222213745117, "learning_rate": 7.4329501915708825e-06, - "loss": 2.475, + "loss": 2.2809, "step": 970 }, { "epoch": 3.015095785440613, - "grad_norm": 0.621644139289856, + "grad_norm": 0.5549905896186829, "learning_rate": 7.509578544061303e-06, "loss": 0.404, "step": 980 }, { "epoch": 3.015862068965517, - "grad_norm": 0.15273216366767883, + "grad_norm": 0.18950757384300232, "learning_rate": 7.586206896551724e-06, - "loss": 1.2101, + "loss": 1.1746, "step": 990 }, { "epoch": 3.0166283524904216, - "grad_norm": 0.5874581336975098, + "grad_norm": 0.2075805813074112, "learning_rate": 7.662835249042147e-06, - "loss": 0.3958, + "loss": 0.4116, "step": 1000 }, { "epoch": 3.0173946360153256, - "grad_norm": 36.50336456298828, + "grad_norm": 26.857540130615234, "learning_rate": 7.739463601532567e-06, - "loss": 2.2839, + "loss": 2.4204, "step": 1010 }, { "epoch": 3.0181609195402297, - "grad_norm": 1.6061341762542725, + "grad_norm": 2.3038957118988037, "learning_rate": 7.81609195402299e-06, - "loss": 1.6815, + "loss": 1.7141, "step": 1020 }, { "epoch": 3.018927203065134, - "grad_norm": 0.616327702999115, + "grad_norm": 0.8607901334762573, "learning_rate": 7.89272030651341e-06, - "loss": 0.6702, + "loss": 0.6923, "step": 1030 }, { "epoch": 3.0196934865900382, - "grad_norm": 4.43576717376709, + "grad_norm": 4.493562698364258, "learning_rate": 7.969348659003832e-06, - "loss": 0.6566, + "loss": 0.7292, "step": 1040 }, { "epoch": 3.02, "eval_accuracy": 0.5555555555555556, - "eval_loss": 2.288022994995117, - "eval_runtime": 17.3752, - "eval_samples_per_second": 2.59, - "eval_steps_per_second": 2.59, + "eval_loss": 2.413928985595703, + "eval_runtime": 17.8935, + "eval_samples_per_second": 2.515, + "eval_steps_per_second": 2.515, "step": 1044 }, { "epoch": 4.000459770114943, - "grad_norm": 0.22421708703041077, + "grad_norm": 0.19549770653247833, "learning_rate": 8.045977011494253e-06, - "loss": 0.9757, + "loss": 1.0729, "step": 1050 }, { "epoch": 4.001226053639847, - "grad_norm": 57.68259811401367, + "grad_norm": 42.400352478027344, "learning_rate": 8.122605363984675e-06, - "loss": 0.5005, + "loss": 0.5297, "step": 1060 }, { "epoch": 4.001992337164751, - "grad_norm": 35.22530746459961, + "grad_norm": 36.23779296875, "learning_rate": 8.199233716475097e-06, - "loss": 1.1988, + "loss": 1.0799, "step": 1070 }, { "epoch": 4.002758620689655, - "grad_norm": 0.33660832047462463, + "grad_norm": 0.34192830324172974, "learning_rate": 8.275862068965518e-06, - "loss": 1.6764, + "loss": 1.8378, "step": 1080 }, { "epoch": 4.00352490421456, - "grad_norm": 3.44526743888855, + "grad_norm": 1.619736909866333, "learning_rate": 8.35249042145594e-06, - "loss": 0.8003, + "loss": 0.825, "step": 1090 }, { "epoch": 4.004291187739463, - "grad_norm": 40.93766784667969, + "grad_norm": 33.67726516723633, "learning_rate": 8.429118773946362e-06, - "loss": 1.5908, + "loss": 1.6032, "step": 1100 }, { "epoch": 4.005057471264368, - "grad_norm": 34.26191711425781, + "grad_norm": 28.808135986328125, "learning_rate": 8.505747126436782e-06, - "loss": 2.0754, + "loss": 2.019, "step": 1110 }, { "epoch": 4.005823754789272, - "grad_norm": 3.224503755569458, + "grad_norm": 1.7604376077651978, "learning_rate": 8.582375478927203e-06, - "loss": 1.207, + "loss": 1.2777, "step": 1120 }, { "epoch": 4.006590038314176, - "grad_norm": 1.6150684356689453, + "grad_norm": 0.5971145033836365, "learning_rate": 8.659003831417625e-06, - "loss": 0.7519, + "loss": 0.8842, "step": 1130 }, { "epoch": 4.00735632183908, - "grad_norm": 34.66046905517578, + "grad_norm": 26.123445510864258, "learning_rate": 8.735632183908047e-06, - "loss": 1.5112, + "loss": 1.5033, "step": 1140 }, { "epoch": 4.008122605363985, - "grad_norm": 0.5555245876312256, + "grad_norm": 0.4754658639431, "learning_rate": 8.812260536398468e-06, - "loss": 0.9574, + "loss": 0.9818, "step": 1150 }, { "epoch": 4.0088888888888885, - "grad_norm": 0.052726663649082184, + "grad_norm": 0.14061832427978516, "learning_rate": 8.888888888888888e-06, - "loss": 1.0397, + "loss": 0.8885, "step": 1160 }, { "epoch": 4.009655172413793, - "grad_norm": 0.5288287997245789, + "grad_norm": 0.6857254505157471, "learning_rate": 8.965517241379312e-06, - "loss": 1.248, + "loss": 1.2337, "step": 1170 }, { "epoch": 4.0104214559386975, - "grad_norm": 0.4055088460445404, + "grad_norm": 0.6307085752487183, "learning_rate": 9.042145593869732e-06, - "loss": 1.416, + "loss": 1.3891, "step": 1180 }, { "epoch": 4.011187739463602, - "grad_norm": 0.7204898595809937, + "grad_norm": 0.20993439853191376, "learning_rate": 9.118773946360155e-06, - "loss": 1.2203, + "loss": 1.3045, "step": 1190 }, { "epoch": 4.011954022988506, - "grad_norm": 0.13948437571525574, + "grad_norm": 0.19304411113262177, "learning_rate": 9.195402298850575e-06, - "loss": 1.1305, + "loss": 0.9091, "step": 1200 }, { "epoch": 4.01272030651341, - "grad_norm": 0.15905851125717163, + "grad_norm": 0.16440033912658691, "learning_rate": 9.272030651340997e-06, - "loss": 0.5537, + "loss": 0.5424, "step": 1210 }, { "epoch": 4.0134865900383145, - "grad_norm": 0.10506876558065414, + "grad_norm": 0.11204668134450912, "learning_rate": 9.348659003831418e-06, - "loss": 0.5343, + "loss": 0.5521, "step": 1220 }, { "epoch": 4.014252873563218, - "grad_norm": 0.6640226244926453, + "grad_norm": 0.7018865346908569, "learning_rate": 9.42528735632184e-06, - "loss": 2.0216, + "loss": 2.029, "step": 1230 }, { "epoch": 4.015019157088123, - "grad_norm": 1.129831075668335, + "grad_norm": 1.1758533716201782, "learning_rate": 9.501915708812262e-06, - "loss": 1.4206, + "loss": 1.4767, "step": 1240 }, { "epoch": 4.015785440613027, - "grad_norm": 32.93185806274414, + "grad_norm": 27.594398498535156, "learning_rate": 9.578544061302683e-06, - "loss": 1.1033, + "loss": 1.117, "step": 1250 }, { "epoch": 4.016551724137931, - "grad_norm": 0.09101611375808716, + "grad_norm": 0.13690224289894104, "learning_rate": 9.655172413793105e-06, - "loss": 0.5194, + "loss": 0.4822, "step": 1260 }, { "epoch": 4.017318007662835, - "grad_norm": 0.10182340443134308, + "grad_norm": 0.10060080885887146, "learning_rate": 9.731800766283525e-06, - "loss": 0.5991, + "loss": 0.5568, "step": 1270 }, { "epoch": 4.01808429118774, - "grad_norm": 0.10434069484472275, + "grad_norm": 0.08842390030622482, "learning_rate": 9.808429118773947e-06, - "loss": 1.1675, + "loss": 1.1275, "step": 1280 }, { "epoch": 4.018850574712643, - "grad_norm": 0.2632581293582916, + "grad_norm": 0.22402533888816833, "learning_rate": 9.885057471264368e-06, - "loss": 1.002, + "loss": 1.011, "step": 1290 }, { "epoch": 4.019616858237548, - "grad_norm": 23.97089958190918, + "grad_norm": 27.197467803955078, "learning_rate": 9.96168582375479e-06, - "loss": 1.4753, + "loss": 1.5417, "step": 1300 }, { "epoch": 4.02, "eval_accuracy": 0.5555555555555556, - "eval_loss": 1.9706525802612305, - "eval_runtime": 17.5025, - "eval_samples_per_second": 2.571, - "eval_steps_per_second": 2.571, + "eval_loss": 1.8114548921585083, + "eval_runtime": 18.1271, + "eval_samples_per_second": 2.482, + "eval_steps_per_second": 2.482, "step": 1305 }, { "epoch": 5.000383141762452, - "grad_norm": 0.13285315036773682, + "grad_norm": 0.11586648970842361, "learning_rate": 9.995742869306088e-06, - "loss": 0.8181, + "loss": 0.7757, "step": 1310 }, { "epoch": 5.001149425287356, - "grad_norm": 0.292896032333374, + "grad_norm": 0.25203320384025574, "learning_rate": 9.987228607918263e-06, - "loss": 1.5031, + "loss": 1.4339, "step": 1320 }, { "epoch": 5.001915708812261, - "grad_norm": 0.1625707596540451, + "grad_norm": 0.10990557819604874, "learning_rate": 9.97871434653044e-06, - "loss": 0.4523, + "loss": 0.5342, "step": 1330 }, { "epoch": 5.002681992337164, - "grad_norm": 0.4843929708003998, + "grad_norm": 0.18456988036632538, "learning_rate": 9.970200085142615e-06, - "loss": 1.2977, + "loss": 1.4733, "step": 1340 }, { "epoch": 5.003448275862069, - "grad_norm": 24.590654373168945, + "grad_norm": 33.10526657104492, "learning_rate": 9.96168582375479e-06, - "loss": 2.1938, + "loss": 2.0585, "step": 1350 }, { "epoch": 5.004214559386973, - "grad_norm": 45.145320892333984, + "grad_norm": 34.103981018066406, "learning_rate": 9.953171562366965e-06, - "loss": 1.0997, + "loss": 1.0572, "step": 1360 }, { "epoch": 5.004980842911878, - "grad_norm": 0.1326051950454712, + "grad_norm": 0.12564679980278015, "learning_rate": 9.944657300979142e-06, - "loss": 0.0109, + "loss": 0.0094, "step": 1370 }, { "epoch": 5.005747126436781, - "grad_norm": 0.17839914560317993, + "grad_norm": 0.2521078884601593, "learning_rate": 9.936143039591317e-06, - "loss": 1.6253, + "loss": 1.6226, "step": 1380 }, { "epoch": 5.006513409961686, - "grad_norm": 26.25552749633789, + "grad_norm": 31.14646339416504, "learning_rate": 9.927628778203492e-06, - "loss": 2.4458, + "loss": 2.1683, "step": 1390 }, { "epoch": 5.00727969348659, - "grad_norm": 0.19945690035820007, + "grad_norm": 0.14362873136997223, "learning_rate": 9.919114516815667e-06, - "loss": 0.4457, + "loss": 0.3346, "step": 1400 }, { "epoch": 5.008045977011494, - "grad_norm": 29.13455581665039, + "grad_norm": 26.62425994873047, "learning_rate": 9.910600255427842e-06, - "loss": 2.0107, + "loss": 2.2745, "step": 1410 }, { "epoch": 5.0088122605363985, - "grad_norm": 0.9910873770713806, + "grad_norm": 0.6855002641677856, "learning_rate": 9.902085994040018e-06, - "loss": 0.4578, + "loss": 0.4516, "step": 1420 }, { "epoch": 5.009578544061303, - "grad_norm": 28.005178451538086, + "grad_norm": 26.288606643676758, "learning_rate": 9.893571732652193e-06, - "loss": 1.4472, + "loss": 1.3007, "step": 1430 }, { "epoch": 5.010344827586207, - "grad_norm": 0.6784669160842896, + "grad_norm": 0.46419236063957214, "learning_rate": 9.885057471264368e-06, - "loss": 1.7133, + "loss": 1.4289, "step": 1440 }, { "epoch": 5.011111111111111, - "grad_norm": 0.2355167120695114, + "grad_norm": 0.1166498214006424, "learning_rate": 9.876543209876543e-06, - "loss": 0.7316, + "loss": 0.7529, "step": 1450 }, { "epoch": 5.011877394636016, - "grad_norm": 0.31805485486984253, + "grad_norm": 1.1704742908477783, "learning_rate": 9.86802894848872e-06, - "loss": 0.7807, + "loss": 0.7242, "step": 1460 }, { "epoch": 5.012643678160919, - "grad_norm": 0.13661302626132965, + "grad_norm": 0.12426697462797165, "learning_rate": 9.859514687100895e-06, - "loss": 0.9905, + "loss": 1.0984, "step": 1470 }, { "epoch": 5.013409961685824, - "grad_norm": 0.7544468641281128, + "grad_norm": 0.6996082067489624, "learning_rate": 9.85100042571307e-06, - "loss": 1.8386, + "loss": 1.9518, "step": 1480 }, { "epoch": 5.014176245210728, - "grad_norm": 0.268381267786026, + "grad_norm": 0.3130974769592285, "learning_rate": 9.842486164325245e-06, - "loss": 0.8431, + "loss": 0.8351, "step": 1490 }, { "epoch": 5.014942528735633, - "grad_norm": 0.1431567370891571, + "grad_norm": 0.20495007932186127, "learning_rate": 9.833971902937422e-06, - "loss": 1.2555, + "loss": 1.3027, "step": 1500 }, { "epoch": 5.015708812260536, - "grad_norm": 0.4003981351852417, + "grad_norm": 0.2900417745113373, "learning_rate": 9.825457641549597e-06, - "loss": 1.2832, + "loss": 1.209, "step": 1510 }, { "epoch": 5.016475095785441, - "grad_norm": 0.08528074622154236, + "grad_norm": 0.08843345940113068, "learning_rate": 9.816943380161772e-06, - "loss": 0.3511, + "loss": 0.4, "step": 1520 }, { "epoch": 5.017241379310345, - "grad_norm": 28.630849838256836, + "grad_norm": 28.542177200317383, "learning_rate": 9.808429118773947e-06, - "loss": 1.6434, + "loss": 1.6235, "step": 1530 }, { "epoch": 5.018007662835249, - "grad_norm": 28.206207275390625, + "grad_norm": 25.423433303833008, "learning_rate": 9.799914857386122e-06, - "loss": 1.7724, + "loss": 1.7544, "step": 1540 }, { "epoch": 5.018773946360153, - "grad_norm": 0.5953699946403503, + "grad_norm": 0.6223596334457397, "learning_rate": 9.791400595998298e-06, - "loss": 0.3764, + "loss": 0.4079, "step": 1550 }, { "epoch": 5.019540229885058, - "grad_norm": 0.2531318664550781, + "grad_norm": 0.1999908685684204, "learning_rate": 9.782886334610473e-06, - "loss": 0.9052, + "loss": 0.9259, "step": 1560 }, { "epoch": 5.02, "eval_accuracy": 0.5555555555555556, - "eval_loss": 2.002394676208496, - "eval_runtime": 17.5088, - "eval_samples_per_second": 2.57, - "eval_steps_per_second": 2.57, + "eval_loss": 1.990254521369934, + "eval_runtime": 15.8006, + "eval_samples_per_second": 2.848, + "eval_steps_per_second": 2.848, "step": 1566 }, { "epoch": 6.000306513409962, - "grad_norm": 29.369661331176758, + "grad_norm": 31.772960662841797, "learning_rate": 9.774372073222648e-06, - "loss": 0.8426, + "loss": 0.9787, "step": 1570 }, { "epoch": 6.001072796934866, - "grad_norm": 31.23344612121582, + "grad_norm": 47.60161209106445, "learning_rate": 9.765857811834825e-06, - "loss": 1.3308, + "loss": 1.2737, "step": 1580 }, { "epoch": 6.00183908045977, - "grad_norm": 0.38720428943634033, + "grad_norm": 0.39719128608703613, "learning_rate": 9.757343550447e-06, - "loss": 0.8827, + "loss": 0.8781, "step": 1590 }, { "epoch": 6.002605363984674, - "grad_norm": 0.05737161263823509, + "grad_norm": 0.06991539150476456, "learning_rate": 9.748829289059175e-06, - "loss": 0.6943, + "loss": 0.6305, "step": 1600 }, { "epoch": 6.003371647509579, - "grad_norm": 0.20373477041721344, + "grad_norm": 0.21721164882183075, "learning_rate": 9.74031502767135e-06, - "loss": 2.2305, + "loss": 2.1519, "step": 1610 }, { "epoch": 6.0041379310344825, - "grad_norm": 0.13082756102085114, + "grad_norm": 0.11582913994789124, "learning_rate": 9.731800766283525e-06, - "loss": 0.728, + "loss": 0.7254, "step": 1620 }, { "epoch": 6.004904214559387, - "grad_norm": 33.23627471923828, + "grad_norm": 38.99522018432617, "learning_rate": 9.723286504895702e-06, - "loss": 1.8183, + "loss": 1.7413, "step": 1630 }, { "epoch": 6.005670498084291, - "grad_norm": 0.06156496703624725, + "grad_norm": 0.06668917089700699, "learning_rate": 9.714772243507877e-06, - "loss": 0.0336, + "loss": 0.0446, "step": 1640 }, { "epoch": 6.006436781609195, - "grad_norm": 31.361791610717773, + "grad_norm": 35.88335418701172, "learning_rate": 9.706257982120052e-06, - "loss": 1.0731, + "loss": 0.8033, "step": 1650 }, { "epoch": 6.0072030651340995, - "grad_norm": 29.613847732543945, + "grad_norm": 34.246768951416016, "learning_rate": 9.697743720732228e-06, - "loss": 1.0388, + "loss": 0.9261, "step": 1660 }, { "epoch": 6.007969348659004, - "grad_norm": 30.859901428222656, + "grad_norm": 28.39165687561035, "learning_rate": 9.689229459344403e-06, - "loss": 1.8366, + "loss": 2.0242, "step": 1670 }, { "epoch": 6.008735632183908, - "grad_norm": 1.76474928855896, + "grad_norm": 1.0314298868179321, "learning_rate": 9.680715197956578e-06, - "loss": 0.6605, + "loss": 0.4954, "step": 1680 }, { "epoch": 6.009501915708812, - "grad_norm": 3.6797220706939697, + "grad_norm": 14.369938850402832, "learning_rate": 9.672200936568753e-06, - "loss": 0.834, + "loss": 0.8774, "step": 1690 }, { "epoch": 6.010268199233717, - "grad_norm": 33.129032135009766, + "grad_norm": 35.32308578491211, "learning_rate": 9.663686675180928e-06, - "loss": 1.853, + "loss": 1.733, "step": 1700 }, { "epoch": 6.011034482758621, - "grad_norm": 0.660237729549408, + "grad_norm": 0.4685197174549103, "learning_rate": 9.655172413793105e-06, - "loss": 1.3547, + "loss": 1.018, "step": 1710 }, { "epoch": 6.011800766283525, - "grad_norm": 1.2416914701461792, + "grad_norm": 2.036705255508423, "learning_rate": 9.64665815240528e-06, - "loss": 1.3361, + "loss": 1.1048, "step": 1720 }, { "epoch": 6.012567049808429, - "grad_norm": 0.062473446130752563, + "grad_norm": 0.09332600980997086, "learning_rate": 9.638143891017455e-06, - "loss": 0.0049, + "loss": 0.0062, "step": 1730 }, { "epoch": 6.013333333333334, - "grad_norm": 0.26628538966178894, + "grad_norm": 0.28798797726631165, "learning_rate": 9.62962962962963e-06, - "loss": 1.5327, + "loss": 1.7012, "step": 1740 }, { "epoch": 6.014099616858237, - "grad_norm": 34.06354522705078, + "grad_norm": 32.18275833129883, "learning_rate": 9.621115368241805e-06, - "loss": 1.4161, + "loss": 1.2207, "step": 1750 }, { "epoch": 6.014865900383142, - "grad_norm": 0.4737323820590973, + "grad_norm": 0.43422597646713257, "learning_rate": 9.612601106853982e-06, - "loss": 0.8003, + "loss": 0.8722, "step": 1760 }, { "epoch": 6.015632183908046, - "grad_norm": 0.034674499183893204, + "grad_norm": 0.03912891447544098, "learning_rate": 9.604086845466157e-06, - "loss": 0.3427, + "loss": 0.3705, "step": 1770 }, { "epoch": 6.01639846743295, - "grad_norm": 0.047134336084127426, + "grad_norm": 0.036081306636333466, "learning_rate": 9.595572584078332e-06, - "loss": 1.1304, + "loss": 1.2151, "step": 1780 }, { "epoch": 6.017164750957854, - "grad_norm": 38.024044036865234, + "grad_norm": 35.24870300292969, "learning_rate": 9.587058322690508e-06, - "loss": 2.2138, + "loss": 2.3234, "step": 1790 }, { "epoch": 6.017931034482759, - "grad_norm": 32.73537063598633, + "grad_norm": 36.206233978271484, "learning_rate": 9.578544061302683e-06, - "loss": 1.2603, + "loss": 1.2311, "step": 1800 }, { "epoch": 6.018697318007663, - "grad_norm": 0.12362480163574219, + "grad_norm": 0.14158888161182404, "learning_rate": 9.570029799914858e-06, - "loss": 0.3934, + "loss": 0.4018, "step": 1810 }, { "epoch": 6.019463601532567, - "grad_norm": 0.2425663322210312, + "grad_norm": 0.23180009424686432, "learning_rate": 9.561515538527033e-06, - "loss": 0.6047, + "loss": 0.7081, "step": 1820 }, { "epoch": 6.02, "eval_accuracy": 0.5555555555555556, - "eval_loss": 2.2993123531341553, - "eval_runtime": 18.0856, - "eval_samples_per_second": 2.488, - "eval_steps_per_second": 2.488, + "eval_loss": 2.2050774097442627, + "eval_runtime": 15.8252, + "eval_samples_per_second": 2.844, + "eval_steps_per_second": 2.844, "step": 1827 }, { "epoch": 7.000229885057471, - "grad_norm": 36.7726936340332, + "grad_norm": 32.795982360839844, "learning_rate": 9.553001277139208e-06, - "loss": 1.0355, + "loss": 1.0629, "step": 1830 }, { "epoch": 7.000996168582375, - "grad_norm": 48.24052047729492, + "grad_norm": 38.839500427246094, "learning_rate": 9.544487015751385e-06, - "loss": 1.8324, + "loss": 1.8354, "step": 1840 }, { "epoch": 7.00176245210728, - "grad_norm": 0.7771627902984619, + "grad_norm": 0.4783906638622284, "learning_rate": 9.53597275436356e-06, - "loss": 0.828, + "loss": 0.8643, "step": 1850 }, { "epoch": 7.0025287356321835, - "grad_norm": 29.158430099487305, + "grad_norm": 51.76274108886719, "learning_rate": 9.527458492975735e-06, - "loss": 1.1525, + "loss": 1.0796, "step": 1860 }, { "epoch": 7.003295019157088, - "grad_norm": 0.955651044845581, + "grad_norm": 0.5234166979789734, "learning_rate": 9.518944231587912e-06, - "loss": 1.0851, + "loss": 0.965, "step": 1870 }, { "epoch": 7.0040613026819925, - "grad_norm": 0.08242946863174438, + "grad_norm": 0.0872463658452034, "learning_rate": 9.510429970200085e-06, - "loss": 0.3983, + "loss": 0.3858, "step": 1880 }, { "epoch": 7.004827586206897, - "grad_norm": 46.56532287597656, + "grad_norm": 58.89110565185547, "learning_rate": 9.501915708812262e-06, - "loss": 1.047, + "loss": 1.036, "step": 1890 }, { "epoch": 7.0055938697318005, - "grad_norm": 25.511343002319336, + "grad_norm": 25.31493377685547, "learning_rate": 9.493401447424437e-06, - "loss": 1.5027, + "loss": 1.5178, "step": 1900 }, { "epoch": 7.006360153256705, - "grad_norm": 1.9284840822219849, + "grad_norm": 1.643489956855774, "learning_rate": 9.484887186036612e-06, - "loss": 1.0999, + "loss": 1.0528, "step": 1910 }, { "epoch": 7.0071264367816095, - "grad_norm": 0.3347567915916443, + "grad_norm": 0.1744176596403122, "learning_rate": 9.476372924648788e-06, - "loss": 1.0487, + "loss": 1.0241, "step": 1920 }, { "epoch": 7.007892720306513, - "grad_norm": 0.05186135694384575, + "grad_norm": 0.06867603212594986, "learning_rate": 9.467858663260963e-06, - "loss": 0.6258, + "loss": 0.704, "step": 1930 }, { "epoch": 7.008659003831418, - "grad_norm": 0.11341365426778793, + "grad_norm": 0.14667190611362457, "learning_rate": 9.459344401873138e-06, - "loss": 1.5365, + "loss": 1.6422, "step": 1940 }, { "epoch": 7.009425287356322, - "grad_norm": 40.456451416015625, + "grad_norm": 65.47974395751953, "learning_rate": 9.450830140485315e-06, - "loss": 0.7781, + "loss": 0.5664, "step": 1950 }, { "epoch": 7.010191570881226, - "grad_norm": 40.50346755981445, + "grad_norm": 34.11857223510742, "learning_rate": 9.442315879097488e-06, - "loss": 1.2897, + "loss": 1.4365, "step": 1960 }, { "epoch": 7.01095785440613, - "grad_norm": 25.086835861206055, + "grad_norm": 28.374338150024414, "learning_rate": 9.433801617709665e-06, - "loss": 1.2992, + "loss": 1.6464, "step": 1970 }, { "epoch": 7.011724137931035, - "grad_norm": 0.26018697023391724, + "grad_norm": 0.19604957103729248, "learning_rate": 9.42528735632184e-06, - "loss": 0.0815, + "loss": 0.0226, "step": 1980 }, { "epoch": 7.012490421455938, - "grad_norm": 36.31147003173828, + "grad_norm": 29.237640380859375, "learning_rate": 9.416773094934015e-06, - "loss": 1.0688, + "loss": 1.2042, "step": 1990 }, { "epoch": 7.013256704980843, - "grad_norm": 0.0778973400592804, + "grad_norm": 0.060450419783592224, "learning_rate": 9.408258833546192e-06, - "loss": 0.9173, + "loss": 1.0219, "step": 2000 }, { "epoch": 7.014022988505747, - "grad_norm": 0.14355182647705078, + "grad_norm": 0.17581497132778168, "learning_rate": 9.399744572158365e-06, - "loss": 0.9381, + "loss": 0.9612, "step": 2010 }, { "epoch": 7.014789272030652, - "grad_norm": 38.620948791503906, + "grad_norm": 38.61123275756836, "learning_rate": 9.391230310770542e-06, - "loss": 1.2304, + "loss": 1.4265, "step": 2020 }, { "epoch": 7.015555555555555, - "grad_norm": 0.22130434215068817, + "grad_norm": 0.29424068331718445, "learning_rate": 9.382716049382717e-06, - "loss": 1.0296, + "loss": 0.9533, "step": 2030 }, { "epoch": 7.01632183908046, - "grad_norm": 0.2928678095340729, + "grad_norm": 0.15735261142253876, "learning_rate": 9.374201787994892e-06, - "loss": 0.4804, + "loss": 0.4622, "step": 2040 }, { "epoch": 7.017088122605364, - "grad_norm": 1.1607459783554077, + "grad_norm": 0.45708709955215454, "learning_rate": 9.365687526607068e-06, - "loss": 0.9858, + "loss": 0.9547, "step": 2050 }, { "epoch": 7.017854406130268, - "grad_norm": 0.311290979385376, + "grad_norm": 0.3691258728504181, "learning_rate": 9.357173265219243e-06, - "loss": 2.2055, + "loss": 2.1669, "step": 2060 }, { "epoch": 7.018620689655172, - "grad_norm": 0.0509340837597847, + "grad_norm": 0.047502584755420685, "learning_rate": 9.348659003831418e-06, - "loss": 1.2935, + "loss": 1.2007, "step": 2070 }, { "epoch": 7.019386973180077, - "grad_norm": 28.45035171508789, + "grad_norm": 26.50173568725586, "learning_rate": 9.340144742443595e-06, - "loss": 0.9311, + "loss": 1.0459, "step": 2080 }, { "epoch": 7.02, - "eval_accuracy": 0.5555555555555556, - "eval_loss": 1.7414064407348633, - "eval_runtime": 18.5252, - "eval_samples_per_second": 2.429, - "eval_steps_per_second": 2.429, + "eval_accuracy": 0.5777777777777777, + "eval_loss": 1.667381763458252, + "eval_runtime": 15.8255, + "eval_samples_per_second": 2.844, + "eval_steps_per_second": 2.844, "step": 2088 }, { "epoch": 8.00015325670498, - "grad_norm": 0.17693942785263062, + "grad_norm": 0.26884081959724426, "learning_rate": 9.331630481055768e-06, - "loss": 1.1265, + "loss": 1.0929, "step": 2090 }, { "epoch": 8.000919540229885, - "grad_norm": 30.539339065551758, + "grad_norm": 44.09854507446289, "learning_rate": 9.323116219667945e-06, - "loss": 0.8481, + "loss": 0.9725, "step": 2100 }, { "epoch": 8.001685823754789, - "grad_norm": 1.394698143005371, + "grad_norm": 1.0260252952575684, "learning_rate": 9.31460195828012e-06, - "loss": 1.1256, + "loss": 1.0494, "step": 2110 }, { "epoch": 8.002452107279694, - "grad_norm": 0.030301367864012718, + "grad_norm": 0.03606044873595238, "learning_rate": 9.306087696892295e-06, - "loss": 0.6935, + "loss": 0.7965, "step": 2120 }, { "epoch": 8.003218390804598, - "grad_norm": 0.08991295099258423, + "grad_norm": 0.4968773126602173, "learning_rate": 9.297573435504472e-06, - "loss": 2.314, + "loss": 2.545, "step": 2130 }, { "epoch": 8.003984674329502, - "grad_norm": 0.37085434794425964, + "grad_norm": 0.4621148705482483, "learning_rate": 9.289059174116647e-06, - "loss": 0.0121, + "loss": 0.0122, "step": 2140 }, { "epoch": 8.004750957854407, - "grad_norm": 0.08312354981899261, + "grad_norm": 0.14638549089431763, "learning_rate": 9.280544912728822e-06, - "loss": 0.4283, + "loss": 0.1765, "step": 2150 }, { "epoch": 8.00551724137931, - "grad_norm": 32.25566101074219, + "grad_norm": 32.06683349609375, "learning_rate": 9.272030651340997e-06, - "loss": 0.6677, + "loss": 0.6421, "step": 2160 }, { "epoch": 8.006283524904214, - "grad_norm": 32.560298919677734, + "grad_norm": 42.689144134521484, "learning_rate": 9.263516389953172e-06, - "loss": 3.217, + "loss": 3.0156, "step": 2170 }, { "epoch": 8.00704980842912, - "grad_norm": 10.358272552490234, + "grad_norm": 34.9857063293457, "learning_rate": 9.255002128565348e-06, - "loss": 1.8115, + "loss": 1.8613, "step": 2180 }, { "epoch": 8.007816091954023, - "grad_norm": 45.08771896362305, + "grad_norm": 78.8707275390625, "learning_rate": 9.246487867177523e-06, - "loss": 0.7355, + "loss": 0.6685, "step": 2190 }, { "epoch": 8.008582375478927, - "grad_norm": 31.09504508972168, + "grad_norm": 72.47820281982422, "learning_rate": 9.237973605789698e-06, - "loss": 0.6319, + "loss": 0.8171, "step": 2200 }, { "epoch": 8.009348659003832, - "grad_norm": 2.917539358139038, + "grad_norm": 1.267533540725708, "learning_rate": 9.229459344401875e-06, - "loss": 1.2356, + "loss": 1.7606, "step": 2210 }, { "epoch": 8.010114942528736, - "grad_norm": 0.6580478549003601, + "grad_norm": 0.42801421880722046, "learning_rate": 9.220945083014048e-06, - "loss": 0.4803, + "loss": 0.6275, "step": 2220 }, { "epoch": 8.01088122605364, - "grad_norm": 0.1518530696630478, + "grad_norm": 0.41535690426826477, "learning_rate": 9.212430821626225e-06, - "loss": 0.2355, + "loss": 0.4323, "step": 2230 }, { "epoch": 8.011647509578545, - "grad_norm": 0.09981821477413177, + "grad_norm": 0.7334343194961548, "learning_rate": 9.2039165602384e-06, - "loss": 1.2565, + "loss": 1.017, "step": 2240 }, { "epoch": 8.012413793103448, - "grad_norm": 1.7428799867630005, + "grad_norm": 0.20332147181034088, "learning_rate": 9.195402298850575e-06, - "loss": 0.8159, + "loss": 0.9734, "step": 2250 }, { "epoch": 8.013180076628352, - "grad_norm": 0.164898082613945, + "grad_norm": 0.29250460863113403, "learning_rate": 9.186888037462752e-06, - "loss": 1.6338, + "loss": 1.4371, "step": 2260 }, { "epoch": 8.013946360153257, - "grad_norm": 27.460119247436523, + "grad_norm": 47.41959762573242, "learning_rate": 9.178373776074927e-06, - "loss": 1.3984, + "loss": 1.1216, "step": 2270 }, { "epoch": 8.01471264367816, - "grad_norm": 0.48447030782699585, + "grad_norm": 0.11371457576751709, "learning_rate": 9.169859514687102e-06, - "loss": 0.729, + "loss": 0.8474, "step": 2280 }, { "epoch": 8.015478927203064, - "grad_norm": 38.24510955810547, + "grad_norm": 60.40354919433594, "learning_rate": 9.161345253299277e-06, - "loss": 1.0315, + "loss": 0.741, "step": 2290 }, { "epoch": 8.01624521072797, - "grad_norm": 0.17362213134765625, + "grad_norm": 0.3201305568218231, "learning_rate": 9.152830991911452e-06, - "loss": 1.3995, + "loss": 1.5068, "step": 2300 }, { "epoch": 8.017011494252873, - "grad_norm": 0.16559770703315735, + "grad_norm": 0.1597777158021927, "learning_rate": 9.144316730523628e-06, - "loss": 0.8855, + "loss": 0.8057, "step": 2310 }, { "epoch": 8.017777777777777, - "grad_norm": 45.63298034667969, + "grad_norm": 30.17101287841797, "learning_rate": 9.135802469135803e-06, - "loss": 1.6324, + "loss": 1.5909, "step": 2320 }, { "epoch": 8.018544061302682, - "grad_norm": 0.5894901156425476, + "grad_norm": 0.1970360279083252, "learning_rate": 9.127288207747978e-06, - "loss": 0.2804, + "loss": 0.3166, "step": 2330 }, { "epoch": 8.019310344827586, - "grad_norm": 0.06097891554236412, + "grad_norm": 0.06318873167037964, "learning_rate": 9.118773946360155e-06, - "loss": 0.9805, + "loss": 0.9128, "step": 2340 }, { "epoch": 8.02, "eval_accuracy": 0.5555555555555556, - "eval_loss": 2.175300121307373, - "eval_runtime": 17.4474, - "eval_samples_per_second": 2.579, - "eval_steps_per_second": 2.579, + "eval_loss": 2.2127251625061035, + "eval_runtime": 16.0647, + "eval_samples_per_second": 2.801, + "eval_steps_per_second": 2.801, "step": 2349 }, { "epoch": 9.00007662835249, - "grad_norm": 0.5459292531013489, + "grad_norm": 0.07489515841007233, "learning_rate": 9.110259684972328e-06, - "loss": 1.0465, + "loss": 0.9024, "step": 2350 }, { "epoch": 9.000842911877395, - "grad_norm": 41.98398208618164, + "grad_norm": 43.35409164428711, "learning_rate": 9.101745423584505e-06, - "loss": 0.4853, + "loss": 0.1532, "step": 2360 }, { "epoch": 9.001609195402299, - "grad_norm": 57.37398910522461, + "grad_norm": 32.60152816772461, "learning_rate": 9.09323116219668e-06, - "loss": 1.6212, + "loss": 1.8436, "step": 2370 }, { "epoch": 9.002375478927203, - "grad_norm": 49.405460357666016, + "grad_norm": 60.675411224365234, "learning_rate": 9.084716900808855e-06, - "loss": 0.4106, + "loss": 0.4344, "step": 2380 }, { "epoch": 9.003141762452108, - "grad_norm": 0.15124031901359558, + "grad_norm": 0.11013050377368927, "learning_rate": 9.076202639421032e-06, - "loss": 1.2342, + "loss": 1.1302, "step": 2390 }, { "epoch": 9.003908045977012, - "grad_norm": 33.65945816040039, + "grad_norm": 32.578033447265625, "learning_rate": 9.067688378033207e-06, - "loss": 1.3358, + "loss": 1.2017, "step": 2400 }, { "epoch": 9.004674329501915, - "grad_norm": 7.62302827835083, + "grad_norm": 14.693634033203125, "learning_rate": 9.059174116645382e-06, - "loss": 1.171, + "loss": 1.1827, "step": 2410 }, { "epoch": 9.00544061302682, - "grad_norm": 4.565344333648682, + "grad_norm": 1.9018902778625488, "learning_rate": 9.050659855257558e-06, - "loss": 0.4575, + "loss": 0.7081, "step": 2420 }, { "epoch": 9.006206896551724, - "grad_norm": 6.922524929046631, + "grad_norm": 8.96721076965332, "learning_rate": 9.042145593869732e-06, - "loss": 1.2913, + "loss": 1.4477, "step": 2430 }, { "epoch": 9.006973180076628, - "grad_norm": 47.88801193237305, + "grad_norm": 94.3230209350586, "learning_rate": 9.033631332481908e-06, - "loss": 0.7557, + "loss": 0.4914, "step": 2440 }, { "epoch": 9.007739463601533, - "grad_norm": 0.0910555049777031, + "grad_norm": 1.2807934284210205, "learning_rate": 9.025117071094083e-06, - "loss": 0.843, + "loss": 0.9399, "step": 2450 }, { "epoch": 9.008505747126437, - "grad_norm": 0.050048861652612686, + "grad_norm": 0.09760165959596634, "learning_rate": 9.016602809706258e-06, - "loss": 0.935, + "loss": 1.2025, "step": 2460 }, { "epoch": 9.00927203065134, - "grad_norm": 0.024516461417078972, + "grad_norm": 0.020161231979727745, "learning_rate": 9.008088548318435e-06, - "loss": 0.6028, + "loss": 0.3715, "step": 2470 }, { "epoch": 9.010038314176246, - "grad_norm": 0.2837757170200348, + "grad_norm": 0.08792616426944733, "learning_rate": 8.999574286930608e-06, - "loss": 0.002, + "loss": 0.0014, "step": 2480 }, { "epoch": 9.01080459770115, - "grad_norm": 0.051584236323833466, + "grad_norm": 0.04609476774930954, "learning_rate": 8.991060025542785e-06, - "loss": 0.0024, + "loss": 0.0087, "step": 2490 }, { "epoch": 9.011570881226053, - "grad_norm": 0.3903149366378784, + "grad_norm": 0.09467889368534088, "learning_rate": 8.98254576415496e-06, - "loss": 1.2635, + "loss": 1.4492, "step": 2500 }, { "epoch": 9.012337164750958, - "grad_norm": 0.03827536851167679, + "grad_norm": 0.14842702448368073, "learning_rate": 8.974031502767135e-06, - "loss": 1.0014, + "loss": 1.2094, "step": 2510 }, { "epoch": 9.013103448275862, - "grad_norm": 0.08342862874269485, + "grad_norm": 0.514483630657196, "learning_rate": 8.965517241379312e-06, - "loss": 2.6924, + "loss": 2.8405, "step": 2520 }, { "epoch": 9.013869731800765, - "grad_norm": 0.38197821378707886, + "grad_norm": 0.5182496309280396, "learning_rate": 8.957002979991487e-06, - "loss": 0.3544, + "loss": 0.2532, "step": 2530 }, { "epoch": 9.01463601532567, - "grad_norm": 0.3897605836391449, + "grad_norm": 0.7483034729957581, "learning_rate": 8.948488718603662e-06, - "loss": 2.9981, + "loss": 3.1011, "step": 2540 }, { "epoch": 9.015402298850574, - "grad_norm": 1.5990636348724365, + "grad_norm": 1.1251901388168335, "learning_rate": 8.939974457215838e-06, - "loss": 1.287, + "loss": 1.5146, "step": 2550 }, { "epoch": 9.01616858237548, - "grad_norm": 21.688724517822266, + "grad_norm": 37.161739349365234, "learning_rate": 8.931460195828012e-06, - "loss": 0.6554, + "loss": 0.85, "step": 2560 }, { "epoch": 9.016934865900383, - "grad_norm": 22.26905632019043, + "grad_norm": 51.83870315551758, "learning_rate": 8.922945934440188e-06, - "loss": 1.5073, + "loss": 1.6904, "step": 2570 }, { "epoch": 9.017701149425287, - "grad_norm": 33.69087600708008, + "grad_norm": 76.26094055175781, "learning_rate": 8.914431673052363e-06, - "loss": 0.4575, + "loss": 0.466, "step": 2580 }, { "epoch": 9.018467432950192, - "grad_norm": 0.15235482156276703, + "grad_norm": 0.2700035870075226, "learning_rate": 8.905917411664538e-06, - "loss": 0.9816, + "loss": 1.0376, "step": 2590 }, { "epoch": 9.019233716475096, - "grad_norm": 0.6422010064125061, + "grad_norm": 0.7433048486709595, "learning_rate": 8.897403150276715e-06, - "loss": 2.0322, + "loss": 2.1063, "step": 2600 }, { "epoch": 9.02, - "grad_norm": 0.5487821698188782, + "grad_norm": 0.9286152124404907, "learning_rate": 8.888888888888888e-06, - "loss": 0.6426, + "loss": 0.5241, "step": 2610 }, { "epoch": 9.02, "eval_accuracy": 0.5555555555555556, - "eval_loss": 1.7462998628616333, - "eval_runtime": 18.3769, - "eval_samples_per_second": 2.449, - "eval_steps_per_second": 2.449, + "eval_loss": 1.8463549613952637, + "eval_runtime": 16.2747, + "eval_samples_per_second": 2.765, + "eval_steps_per_second": 2.765, "step": 2610 }, { "epoch": 10.000766283524904, - "grad_norm": 0.39281412959098816, + "grad_norm": 0.10940740257501602, "learning_rate": 8.880374627501065e-06, - "loss": 0.6869, + "loss": 0.7769, "step": 2620 }, { "epoch": 10.001532567049809, - "grad_norm": 0.13234415650367737, + "grad_norm": 0.3120407462120056, "learning_rate": 8.87186036611324e-06, - "loss": 1.6157, + "loss": 1.758, "step": 2630 }, { "epoch": 10.002298850574713, - "grad_norm": 0.04989105463027954, + "grad_norm": 0.059983547776937485, "learning_rate": 8.863346104725415e-06, - "loss": 0.445, + "loss": 0.5762, "step": 2640 }, { "epoch": 10.003065134099616, - "grad_norm": 20.0693359375, + "grad_norm": 30.121280670166016, "learning_rate": 8.854831843337592e-06, - "loss": 1.9525, + "loss": 2.7038, "step": 2650 }, { "epoch": 10.003831417624522, - "grad_norm": 0.36622264981269836, + "grad_norm": 0.3512975871562958, "learning_rate": 8.846317581949767e-06, - "loss": 1.4762, + "loss": 1.6626, "step": 2660 }, { "epoch": 10.004597701149425, - "grad_norm": 0.03537037596106529, + "grad_norm": 0.07755214720964432, "learning_rate": 8.837803320561942e-06, - "loss": 0.0321, + "loss": 0.0285, "step": 2670 }, { "epoch": 10.005363984674329, - "grad_norm": 0.34984490275382996, + "grad_norm": 0.3327948749065399, "learning_rate": 8.829289059174118e-06, - "loss": 2.5355, + "loss": 2.4637, "step": 2680 }, { "epoch": 10.006130268199234, - "grad_norm": 0.2767044007778168, + "grad_norm": 0.309066504240036, "learning_rate": 8.820774797786292e-06, - "loss": 1.0077, + "loss": 1.2045, "step": 2690 }, { "epoch": 10.006896551724138, - "grad_norm": 0.06856942921876907, + "grad_norm": 0.3032403290271759, "learning_rate": 8.812260536398468e-06, - "loss": 0.0049, + "loss": 0.0114, "step": 2700 }, { "epoch": 10.007662835249041, - "grad_norm": 0.24044296145439148, + "grad_norm": 0.2840149998664856, "learning_rate": 8.803746275010643e-06, - "loss": 2.0038, + "loss": 1.7546, "step": 2710 }, { "epoch": 10.008429118773947, - "grad_norm": 0.8038799166679382, + "grad_norm": 0.8331330418586731, "learning_rate": 8.795232013622818e-06, - "loss": 0.8551, + "loss": 0.7119, "step": 2720 }, { "epoch": 10.00919540229885, - "grad_norm": 0.03398509323596954, + "grad_norm": 0.0775945633649826, "learning_rate": 8.786717752234995e-06, - "loss": 0.0174, + "loss": 0.0084, "step": 2730 }, { "epoch": 10.009961685823756, - "grad_norm": 0.05750809609889984, + "grad_norm": 0.058158449828624725, "learning_rate": 8.77820349084717e-06, - "loss": 1.2572, + "loss": 1.1664, "step": 2740 }, { "epoch": 10.01072796934866, - "grad_norm": 21.19040870666504, + "grad_norm": 29.93315887451172, "learning_rate": 8.769689229459345e-06, - "loss": 1.5835, + "loss": 1.6866, "step": 2750 }, { "epoch": 10.011494252873563, - "grad_norm": 0.40747809410095215, + "grad_norm": 0.2126077562570572, "learning_rate": 8.76117496807152e-06, - "loss": 1.2114, + "loss": 1.498, "step": 2760 }, { "epoch": 10.012260536398468, - "grad_norm": 0.3179941177368164, + "grad_norm": 0.30738991498947144, "learning_rate": 8.752660706683695e-06, - "loss": 0.4078, + "loss": 0.3981, "step": 2770 }, { "epoch": 10.013026819923372, - "grad_norm": 0.2843784987926483, + "grad_norm": 0.2726083993911743, "learning_rate": 8.744146445295872e-06, - "loss": 1.2185, + "loss": 1.2491, "step": 2780 }, { "epoch": 10.013793103448275, - "grad_norm": 0.040187932550907135, + "grad_norm": 0.10244955867528915, "learning_rate": 8.735632183908047e-06, - "loss": 0.4684, + "loss": 0.4798, "step": 2790 }, { "epoch": 10.01455938697318, - "grad_norm": 32.20536422729492, + "grad_norm": 29.517539978027344, "learning_rate": 8.727117922520222e-06, - "loss": 2.1075, + "loss": 1.9199, "step": 2800 }, { "epoch": 10.015325670498084, - "grad_norm": 0.26296505331993103, + "grad_norm": 0.6872894167900085, "learning_rate": 8.718603661132398e-06, - "loss": 1.1633, + "loss": 0.693, "step": 2810 }, { "epoch": 10.016091954022988, - "grad_norm": 28.483657836914062, + "grad_norm": 34.48725509643555, "learning_rate": 8.710089399744572e-06, - "loss": 1.8388, + "loss": 1.8904, "step": 2820 }, { "epoch": 10.016858237547893, - "grad_norm": 33.29490661621094, + "grad_norm": 31.905208587646484, "learning_rate": 8.701575138356748e-06, - "loss": 1.2718, + "loss": 1.4197, "step": 2830 }, { "epoch": 10.017624521072797, - "grad_norm": 0.32636523246765137, + "grad_norm": 0.11004088073968887, "learning_rate": 8.693060876968923e-06, - "loss": 0.0197, + "loss": 0.0104, "step": 2840 }, { "epoch": 10.0183908045977, - "grad_norm": 0.3445032238960266, + "grad_norm": 0.15967635810375214, "learning_rate": 8.684546615581098e-06, - "loss": 0.5986, + "loss": 0.5465, "step": 2850 }, { "epoch": 10.019157088122606, - "grad_norm": 0.8212053179740906, + "grad_norm": 30.832603454589844, "learning_rate": 8.676032354193275e-06, - "loss": 1.1039, + "loss": 0.7211, "step": 2860 }, { "epoch": 10.01992337164751, - "grad_norm": 0.04025799781084061, + "grad_norm": 0.03421596810221672, "learning_rate": 8.66751809280545e-06, - "loss": 1.3657, + "loss": 1.1457, "step": 2870 }, { "epoch": 10.02, "eval_accuracy": 0.5555555555555556, - "eval_loss": 1.5317984819412231, - "eval_runtime": 17.6732, - "eval_samples_per_second": 2.546, - "eval_steps_per_second": 2.546, + "eval_loss": 2.0478696823120117, + "eval_runtime": 16.1883, + "eval_samples_per_second": 2.78, + "eval_steps_per_second": 2.78, "step": 2871 }, { "epoch": 11.000689655172414, - "grad_norm": 0.06636291742324829, + "grad_norm": 0.08310609310865402, "learning_rate": 8.659003831417625e-06, - "loss": 0.8265, + "loss": 1.1346, "step": 2880 }, { "epoch": 11.001455938697317, - "grad_norm": 0.6545989513397217, + "grad_norm": 0.5417599081993103, "learning_rate": 8.650489570029802e-06, - "loss": 1.3514, + "loss": 1.9824, "step": 2890 }, { "epoch": 11.002222222222223, - "grad_norm": 1.6955431699752808, + "grad_norm": 2.308333158493042, "learning_rate": 8.641975308641975e-06, - "loss": 0.5552, + "loss": 0.7273, "step": 2900 }, { "epoch": 11.002988505747126, - "grad_norm": 56.05598831176758, + "grad_norm": 64.91603088378906, "learning_rate": 8.633461047254152e-06, - "loss": 1.042, + "loss": 1.1545, "step": 2910 }, { "epoch": 11.00375478927203, - "grad_norm": 0.23931346833705902, + "grad_norm": 0.4775558114051819, "learning_rate": 8.624946785866327e-06, - "loss": 0.614, + "loss": 0.4754, "step": 2920 }, { "epoch": 11.004521072796935, - "grad_norm": 80.81182098388672, + "grad_norm": 45.525115966796875, "learning_rate": 8.616432524478502e-06, - "loss": 1.6702, + "loss": 1.4855, "step": 2930 }, { "epoch": 11.005287356321839, - "grad_norm": 45.35509490966797, + "grad_norm": 46.88608169555664, "learning_rate": 8.607918263090678e-06, - "loss": 1.3748, + "loss": 1.3386, "step": 2940 }, { "epoch": 11.006053639846744, - "grad_norm": 20.421953201293945, + "grad_norm": 32.513484954833984, "learning_rate": 8.599404001702853e-06, - "loss": 1.7681, + "loss": 2.113, "step": 2950 }, { "epoch": 11.006819923371648, - "grad_norm": 0.7332080602645874, + "grad_norm": 0.36965978145599365, "learning_rate": 8.590889740315028e-06, - "loss": 0.8293, + "loss": 0.9524, "step": 2960 }, { "epoch": 11.007586206896551, - "grad_norm": 14.54810619354248, + "grad_norm": 2.321744203567505, "learning_rate": 8.582375478927203e-06, - "loss": 1.058, + "loss": 1.5969, "step": 2970 }, { "epoch": 11.008352490421457, - "grad_norm": 12.682663917541504, + "grad_norm": 59.43613815307617, "learning_rate": 8.573861217539378e-06, - "loss": 0.1965, + "loss": 0.3492, "step": 2980 }, { "epoch": 11.00911877394636, - "grad_norm": 70.7584228515625, + "grad_norm": 61.816165924072266, "learning_rate": 8.565346956151555e-06, - "loss": 0.6758, + "loss": 0.8922, "step": 2990 }, { "epoch": 11.009885057471264, - "grad_norm": 0.2769882082939148, + "grad_norm": 2.5952584743499756, "learning_rate": 8.55683269476373e-06, - "loss": 1.6247, + "loss": 1.253, "step": 3000 }, { "epoch": 11.01065134099617, - "grad_norm": 74.33293151855469, + "grad_norm": 62.42274856567383, "learning_rate": 8.548318433375905e-06, - "loss": 0.5452, + "loss": 0.4903, "step": 3010 }, { "epoch": 11.011417624521073, - "grad_norm": 0.056933630257844925, + "grad_norm": 0.0369681753218174, "learning_rate": 8.539804171988082e-06, - "loss": 0.002, + "loss": 0.0022, "step": 3020 }, { "epoch": 11.012183908045976, - "grad_norm": 79.40782165527344, + "grad_norm": 61.424781799316406, "learning_rate": 8.531289910600255e-06, - "loss": 0.8951, + "loss": 1.0108, "step": 3030 }, { "epoch": 11.012950191570882, - "grad_norm": 0.02401474118232727, + "grad_norm": 0.1121385395526886, "learning_rate": 8.522775649212432e-06, - "loss": 1.6231, + "loss": 1.647, "step": 3040 }, { "epoch": 11.013716475095785, - "grad_norm": 52.57748031616211, + "grad_norm": 52.22344207763672, "learning_rate": 8.514261387824607e-06, - "loss": 1.6131, + "loss": 1.6624, "step": 3050 }, { "epoch": 11.014482758620689, - "grad_norm": 22.145566940307617, + "grad_norm": 13.415779113769531, "learning_rate": 8.505747126436782e-06, - "loss": 0.2288, + "loss": 0.1211, "step": 3060 }, { "epoch": 11.015249042145594, - "grad_norm": 1.6091238260269165, + "grad_norm": 0.41706305742263794, "learning_rate": 8.497232865048958e-06, - "loss": 0.4832, + "loss": 0.6423, "step": 3070 }, { "epoch": 11.016015325670498, - "grad_norm": 0.0861823707818985, + "grad_norm": 0.06626871973276138, "learning_rate": 8.488718603661133e-06, - "loss": 0.8008, + "loss": 1.0995, "step": 3080 }, { "epoch": 11.016781609195402, - "grad_norm": 0.009323697537183762, + "grad_norm": 0.01878122240304947, "learning_rate": 8.480204342273308e-06, - "loss": 0.6413, + "loss": 0.6175, "step": 3090 }, { "epoch": 11.017547892720307, - "grad_norm": 0.06754224002361298, + "grad_norm": 0.042747728526592255, "learning_rate": 8.471690080885483e-06, - "loss": 1.3094, + "loss": 1.4963, "step": 3100 }, { "epoch": 11.01831417624521, - "grad_norm": 53.206298828125, + "grad_norm": 56.12227249145508, "learning_rate": 8.463175819497658e-06, - "loss": 1.3463, + "loss": 1.5122, "step": 3110 }, { "epoch": 11.019080459770114, - "grad_norm": 0.6144325733184814, + "grad_norm": 0.4493597149848938, "learning_rate": 8.454661558109835e-06, - "loss": 0.0424, + "loss": 0.0122, "step": 3120 }, { "epoch": 11.01984674329502, - "grad_norm": 11.595216751098633, + "grad_norm": 46.640464782714844, "learning_rate": 8.44614729672201e-06, - "loss": 0.7183, + "loss": 1.2264, "step": 3130 }, { "epoch": 11.02, - "eval_accuracy": 0.6444444444444445, - "eval_loss": 1.2937263250350952, - "eval_runtime": 17.3462, - "eval_samples_per_second": 2.594, - "eval_steps_per_second": 2.594, + "eval_accuracy": 0.5555555555555556, + "eval_loss": 1.9069896936416626, + "eval_runtime": 17.0154, + "eval_samples_per_second": 2.645, + "eval_steps_per_second": 2.645, "step": 3132 }, { "epoch": 12.000613026819924, - "grad_norm": 0.051024023443460464, + "grad_norm": 0.05294239521026611, "learning_rate": 8.437633035334185e-06, - "loss": 0.0599, + "loss": 0.1777, "step": 3140 }, { "epoch": 12.001379310344827, - "grad_norm": 83.38597106933594, + "grad_norm": 45.58882522583008, "learning_rate": 8.429118773946362e-06, - "loss": 0.3836, + "loss": 0.5973, "step": 3150 }, { "epoch": 12.002145593869733, - "grad_norm": 101.97233581542969, + "grad_norm": 181.31472778320312, "learning_rate": 8.420604512558537e-06, - "loss": 1.2131, + "loss": 1.2029, "step": 3160 }, { "epoch": 12.002911877394636, - "grad_norm": 55.02336883544922, + "grad_norm": 77.20633697509766, "learning_rate": 8.412090251170712e-06, - "loss": 0.0911, + "loss": 0.232, "step": 3170 }, { "epoch": 12.00367816091954, - "grad_norm": 3.341539144515991, + "grad_norm": 0.7357093095779419, "learning_rate": 8.403575989782887e-06, - "loss": 0.2343, + "loss": 0.2424, "step": 3180 }, { "epoch": 12.004444444444445, - "grad_norm": 2.1928958892822266, + "grad_norm": 4.680446624755859, "learning_rate": 8.395061728395062e-06, - "loss": 0.7526, + "loss": 0.5459, "step": 3190 }, { "epoch": 12.005210727969349, - "grad_norm": 0.0289593655616045, + "grad_norm": 0.027788257226347923, "learning_rate": 8.386547467007238e-06, - "loss": 0.0406, + "loss": 0.1938, "step": 3200 }, { "epoch": 12.005977011494252, - "grad_norm": 1.1296756267547607, + "grad_norm": 1.7406878471374512, "learning_rate": 8.378033205619413e-06, - "loss": 1.1734, + "loss": 1.9564, "step": 3210 }, { "epoch": 12.006743295019158, - "grad_norm": 0.7496622204780579, + "grad_norm": 1.2436535358428955, "learning_rate": 8.369518944231588e-06, - "loss": 0.8601, + "loss": 0.5914, "step": 3220 }, { "epoch": 12.007509578544061, - "grad_norm": 0.012187754735350609, + "grad_norm": 0.01882697455585003, "learning_rate": 8.361004682843763e-06, - "loss": 0.6837, + "loss": 0.305, "step": 3230 }, { "epoch": 12.008275862068965, - "grad_norm": 56.76580047607422, + "grad_norm": 73.35531616210938, "learning_rate": 8.35249042145594e-06, - "loss": 1.9065, + "loss": 1.8261, "step": 3240 }, { "epoch": 12.00904214559387, - "grad_norm": 51.99478530883789, + "grad_norm": 36.61800003051758, "learning_rate": 8.343976160068115e-06, - "loss": 1.2939, + "loss": 1.1471, "step": 3250 }, { "epoch": 12.009808429118774, - "grad_norm": 0.5113939046859741, + "grad_norm": 0.8787409663200378, "learning_rate": 8.33546189868029e-06, - "loss": 1.0016, + "loss": 0.8497, "step": 3260 }, { "epoch": 12.010574712643677, - "grad_norm": 0.12287792563438416, + "grad_norm": 0.06883213669061661, "learning_rate": 8.326947637292465e-06, - "loss": 0.4829, + "loss": 0.6148, "step": 3270 }, { "epoch": 12.011340996168583, - "grad_norm": 0.03340792655944824, + "grad_norm": 0.0322272852063179, "learning_rate": 8.318433375904642e-06, - "loss": 1.3763, + "loss": 1.4349, "step": 3280 }, { "epoch": 12.012107279693486, - "grad_norm": 0.26685163378715515, + "grad_norm": 0.5764551162719727, "learning_rate": 8.309919114516817e-06, - "loss": 0.2169, + "loss": 0.4347, "step": 3290 }, { "epoch": 12.01287356321839, - "grad_norm": 0.04748799651861191, + "grad_norm": 0.09783365577459335, "learning_rate": 8.301404853128992e-06, - "loss": 1.0985, + "loss": 1.1393, "step": 3300 }, { "epoch": 12.013639846743295, - "grad_norm": 1.0526065826416016, + "grad_norm": 0.4599780738353729, "learning_rate": 8.292890591741167e-06, - "loss": 0.8121, + "loss": 0.913, "step": 3310 }, { "epoch": 12.014406130268199, - "grad_norm": 98.6900634765625, + "grad_norm": 68.13677215576172, "learning_rate": 8.284376330353342e-06, - "loss": 1.8323, + "loss": 2.0881, "step": 3320 }, { "epoch": 12.015172413793103, - "grad_norm": 0.10765168815851212, + "grad_norm": 0.0598631352186203, "learning_rate": 8.275862068965518e-06, - "loss": 0.6661, + "loss": 1.2355, "step": 3330 }, { "epoch": 12.015938697318008, - "grad_norm": 0.03830413147807121, + "grad_norm": 0.032543160021305084, "learning_rate": 8.267347807577693e-06, - "loss": 0.6018, + "loss": 0.3538, "step": 3340 }, { "epoch": 12.016704980842912, - "grad_norm": 0.3407254219055176, + "grad_norm": 0.5091415643692017, "learning_rate": 8.258833546189868e-06, - "loss": 0.872, + "loss": 1.857, "step": 3350 }, { "epoch": 12.017471264367815, - "grad_norm": 0.27802738547325134, + "grad_norm": 0.6881793141365051, "learning_rate": 8.250319284802043e-06, - "loss": 0.8596, + "loss": 0.4295, "step": 3360 }, { "epoch": 12.01823754789272, - "grad_norm": 0.034988779574632645, + "grad_norm": 0.03967675194144249, "learning_rate": 8.24180502341422e-06, - "loss": 1.9215, + "loss": 1.4732, "step": 3370 }, { "epoch": 12.019003831417624, - "grad_norm": 0.050661202520132065, + "grad_norm": 0.026304146274924278, "learning_rate": 8.233290762026395e-06, - "loss": 0.7721, + "loss": 0.4333, "step": 3380 }, { "epoch": 12.01977011494253, - "grad_norm": 0.23323896527290344, + "grad_norm": 0.5558311343193054, "learning_rate": 8.22477650063857e-06, - "loss": 1.0421, + "loss": 1.4812, "step": 3390 }, { "epoch": 12.02, - "eval_accuracy": 0.6, - "eval_loss": 1.3575907945632935, - "eval_runtime": 17.4982, - "eval_samples_per_second": 2.572, - "eval_steps_per_second": 2.572, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 0.9185453653335571, + "eval_runtime": 18.142, + "eval_samples_per_second": 2.48, + "eval_steps_per_second": 2.48, "step": 3393 }, { "epoch": 13.000536398467434, - "grad_norm": 2.6243691444396973, + "grad_norm": 0.8824403285980225, "learning_rate": 8.216262239250745e-06, - "loss": 0.2953, + "loss": 0.2256, "step": 3400 }, { "epoch": 13.001302681992337, - "grad_norm": 0.009841942228376865, + "grad_norm": 0.007885945029556751, "learning_rate": 8.207747977862922e-06, - "loss": 0.5137, + "loss": 0.093, "step": 3410 }, { "epoch": 13.00206896551724, - "grad_norm": 0.04530877619981766, + "grad_norm": 0.02462655119597912, "learning_rate": 8.199233716475097e-06, - "loss": 0.6094, + "loss": 1.311, "step": 3420 }, { "epoch": 13.002835249042146, - "grad_norm": 104.12423706054688, + "grad_norm": 118.39599609375, "learning_rate": 8.190719455087272e-06, - "loss": 0.5796, + "loss": 0.5309, "step": 3430 }, { "epoch": 13.00360153256705, - "grad_norm": 0.05378086864948273, + "grad_norm": 0.04293208196759224, "learning_rate": 8.182205193699447e-06, - "loss": 1.2054, + "loss": 0.5239, "step": 3440 }, { "epoch": 13.004367816091953, - "grad_norm": 0.1626320481300354, + "grad_norm": 0.022847572341561317, "learning_rate": 8.173690932311623e-06, - "loss": 0.5962, + "loss": 0.687, "step": 3450 }, { "epoch": 13.005134099616859, - "grad_norm": 0.05897984281182289, + "grad_norm": 0.07785163074731827, "learning_rate": 8.165176670923798e-06, - "loss": 2.163, + "loss": 0.9864, "step": 3460 }, { "epoch": 13.005900383141762, - "grad_norm": 0.10230876505374908, + "grad_norm": 0.043123744428157806, "learning_rate": 8.156662409535973e-06, - "loss": 0.886, + "loss": 0.7409, "step": 3470 }, { "epoch": 13.006666666666666, - "grad_norm": 98.008056640625, + "grad_norm": 84.49942016601562, "learning_rate": 8.148148148148148e-06, - "loss": 0.5957, + "loss": 0.8819, "step": 3480 }, { "epoch": 13.007432950191571, - "grad_norm": 0.5273327231407166, + "grad_norm": 0.2664766013622284, "learning_rate": 8.139633886760325e-06, - "loss": 0.0078, + "loss": 0.0718, "step": 3490 }, { "epoch": 13.008199233716475, - "grad_norm": 0.03917591646313667, + "grad_norm": 0.010902808047831059, "learning_rate": 8.1311196253725e-06, - "loss": 1.0983, + "loss": 0.8543, "step": 3500 }, { "epoch": 13.008965517241379, - "grad_norm": 1.1955628395080566, + "grad_norm": 0.3298071026802063, "learning_rate": 8.122605363984675e-06, - "loss": 0.3713, + "loss": 0.6899, "step": 3510 }, { "epoch": 13.009731800766284, - "grad_norm": 51.205055236816406, + "grad_norm": 284.6703186035156, "learning_rate": 8.11409110259685e-06, - "loss": 1.7528, + "loss": 0.7249, "step": 3520 }, { "epoch": 13.010498084291187, - "grad_norm": 226.49388122558594, + "grad_norm": 45.97584915161133, "learning_rate": 8.105576841209027e-06, - "loss": 0.406, + "loss": 0.6982, "step": 3530 }, { "epoch": 13.011264367816091, - "grad_norm": 0.008933121338486671, + "grad_norm": 0.017386356368660927, "learning_rate": 8.097062579821202e-06, - "loss": 0.0047, + "loss": 0.0479, "step": 3540 }, { "epoch": 13.012030651340996, - "grad_norm": 187.11196899414062, + "grad_norm": 16.1013126373291, "learning_rate": 8.088548318433377e-06, - "loss": 1.6043, + "loss": 1.9168, "step": 3550 }, { "epoch": 13.0127969348659, - "grad_norm": 0.038275640457868576, + "grad_norm": 0.11776525527238846, "learning_rate": 8.080034057045552e-06, - "loss": 0.1214, + "loss": 0.1906, "step": 3560 }, { "epoch": 13.013563218390805, - "grad_norm": 0.01806672103703022, + "grad_norm": 0.031767915934324265, "learning_rate": 8.071519795657727e-06, - "loss": 2.1923, + "loss": 2.137, "step": 3570 }, { "epoch": 13.014329501915709, - "grad_norm": 0.7170591950416565, + "grad_norm": 0.31590378284454346, "learning_rate": 8.063005534269903e-06, - "loss": 2.1442, + "loss": 2.3844, "step": 3580 }, { "epoch": 13.015095785440613, - "grad_norm": 1.2263590097427368, + "grad_norm": 0.08533840626478195, "learning_rate": 8.054491272882078e-06, - "loss": 0.7875, + "loss": 0.4612, "step": 3590 }, { "epoch": 13.015862068965518, - "grad_norm": 0.01844172552227974, + "grad_norm": 0.02838957868516445, "learning_rate": 8.045977011494253e-06, - "loss": 1.1393, + "loss": 1.3583, "step": 3600 }, { "epoch": 13.016628352490422, - "grad_norm": 0.04476948827505112, + "grad_norm": 0.09325381368398666, "learning_rate": 8.037462750106428e-06, - "loss": 0.7255, + "loss": 0.5914, "step": 3610 }, { "epoch": 13.017394636015325, - "grad_norm": 322.136962890625, + "grad_norm": 85.23422241210938, "learning_rate": 8.028948488718605e-06, - "loss": 0.9151, + "loss": 0.9606, "step": 3620 }, { "epoch": 13.01816091954023, - "grad_norm": 0.06272944808006287, + "grad_norm": 0.13213291764259338, "learning_rate": 8.02043422733078e-06, - "loss": 0.377, + "loss": 0.2522, "step": 3630 }, { "epoch": 13.018927203065134, - "grad_norm": 10.835458755493164, + "grad_norm": 40.80291748046875, "learning_rate": 8.011919965942955e-06, - "loss": 2.3775, + "loss": 2.3329, "step": 3640 }, { "epoch": 13.019693486590038, - "grad_norm": 0.223807230591774, + "grad_norm": 1.2269668579101562, "learning_rate": 8.00340570455513e-06, - "loss": 0.505, + "loss": 0.2804, "step": 3650 }, { "epoch": 13.02, - "eval_accuracy": 0.6222222222222222, - "eval_loss": 1.2773141860961914, - "eval_runtime": 18.2232, - "eval_samples_per_second": 2.469, - "eval_steps_per_second": 2.469, + "eval_accuracy": 0.6, + "eval_loss": 1.6408798694610596, + "eval_runtime": 18.0548, + "eval_samples_per_second": 2.492, + "eval_steps_per_second": 2.492, "step": 3654 }, { "epoch": 14.000459770114942, - "grad_norm": 23.774599075317383, + "grad_norm": 46.51220703125, "learning_rate": 7.994891443167307e-06, - "loss": 1.4754, + "loss": 1.9863, "step": 3660 }, { "epoch": 14.001226053639847, - "grad_norm": 0.16081327199935913, + "grad_norm": 4.256190299987793, "learning_rate": 7.986377181779482e-06, - "loss": 0.496, + "loss": 0.9324, "step": 3670 }, { "epoch": 14.00199233716475, - "grad_norm": 0.6054710149765015, + "grad_norm": 0.07637009769678116, "learning_rate": 7.977862920391657e-06, - "loss": 0.398, + "loss": 0.5618, "step": 3680 }, { "epoch": 14.002758620689654, - "grad_norm": 0.01645374484360218, + "grad_norm": 0.019988488405942917, "learning_rate": 7.969348659003832e-06, - "loss": 0.2459, + "loss": 0.5217, "step": 3690 }, { "epoch": 14.00352490421456, - "grad_norm": 39.37165451049805, + "grad_norm": 120.40330505371094, "learning_rate": 7.960834397616007e-06, - "loss": 0.7992, + "loss": 0.8762, "step": 3700 }, { "epoch": 14.004291187739463, - "grad_norm": 0.014056538231670856, + "grad_norm": 0.039856839925050735, "learning_rate": 7.952320136228183e-06, - "loss": 0.543, + "loss": 0.4963, "step": 3710 }, { "epoch": 14.005057471264367, - "grad_norm": 163.24331665039062, + "grad_norm": 25.75318145751953, "learning_rate": 7.943805874840358e-06, - "loss": 1.1832, + "loss": 0.9267, "step": 3720 }, { "epoch": 14.005823754789272, - "grad_norm": 0.01770191825926304, + "grad_norm": 0.024051252752542496, "learning_rate": 7.935291613452533e-06, - "loss": 0.7983, + "loss": 0.7305, "step": 3730 }, { "epoch": 14.006590038314176, - "grad_norm": 88.43537902832031, + "grad_norm": 223.2063446044922, "learning_rate": 7.92677735206471e-06, - "loss": 0.9745, + "loss": 0.8236, "step": 3740 }, { "epoch": 14.007356321839081, - "grad_norm": 133.46218872070312, + "grad_norm": 161.97634887695312, "learning_rate": 7.918263090676885e-06, - "loss": 0.261, + "loss": 0.3708, "step": 3750 }, { "epoch": 14.008122605363985, - "grad_norm": 0.08603092283010483, + "grad_norm": 2.1761410236358643, "learning_rate": 7.90974882928906e-06, - "loss": 0.4193, + "loss": 0.549, "step": 3760 }, { "epoch": 14.008888888888889, - "grad_norm": 0.10657916218042374, + "grad_norm": 125.46454620361328, "learning_rate": 7.901234567901235e-06, - "loss": 0.0023, + "loss": 0.1352, "step": 3770 }, { "epoch": 14.009655172413794, - "grad_norm": 0.00975658930838108, + "grad_norm": 0.009826806373894215, "learning_rate": 7.89272030651341e-06, - "loss": 0.8424, + "loss": 0.5067, "step": 3780 }, { "epoch": 14.010421455938697, - "grad_norm": 86.33008575439453, + "grad_norm": 154.9338836669922, "learning_rate": 7.884206045125587e-06, - "loss": 1.5805, + "loss": 0.9069, "step": 3790 }, { "epoch": 14.011187739463601, - "grad_norm": 0.027157893404364586, + "grad_norm": 0.013454711996018887, "learning_rate": 7.875691783737762e-06, - "loss": 2.1174, + "loss": 1.7521, "step": 3800 }, { "epoch": 14.011954022988506, - "grad_norm": 0.1424316018819809, + "grad_norm": 0.04040395841002464, "learning_rate": 7.867177522349937e-06, - "loss": 0.5742, + "loss": 0.4916, "step": 3810 }, { "epoch": 14.01272030651341, - "grad_norm": 0.5374491214752197, + "grad_norm": 0.17743313312530518, "learning_rate": 7.858663260962112e-06, - "loss": 1.877, + "loss": 1.4732, "step": 3820 }, { "epoch": 14.013486590038314, - "grad_norm": 0.33836546540260315, + "grad_norm": 0.04594993591308594, "learning_rate": 7.850148999574287e-06, - "loss": 1.6077, + "loss": 1.8179, "step": 3830 }, { "epoch": 14.014252873563219, - "grad_norm": 50.01400375366211, + "grad_norm": 5.735226631164551, "learning_rate": 7.841634738186463e-06, - "loss": 0.9444, + "loss": 0.6406, "step": 3840 }, { "epoch": 14.015019157088123, - "grad_norm": 0.6236952543258667, + "grad_norm": 0.9820342063903809, "learning_rate": 7.833120476798638e-06, - "loss": 0.8567, + "loss": 0.754, "step": 3850 }, { "epoch": 14.015785440613026, - "grad_norm": 0.047350723296403885, + "grad_norm": 0.01929650641977787, "learning_rate": 7.824606215410813e-06, - "loss": 0.0102, + "loss": 0.005, "step": 3860 }, { "epoch": 14.016551724137932, - "grad_norm": 0.02176245115697384, + "grad_norm": 0.014248164370656013, "learning_rate": 7.81609195402299e-06, - "loss": 0.0023, + "loss": 0.0008, "step": 3870 }, { "epoch": 14.017318007662835, - "grad_norm": 0.03785670921206474, + "grad_norm": 0.06030108407139778, "learning_rate": 7.807577692635165e-06, - "loss": 1.2998, + "loss": 1.5111, "step": 3880 }, { "epoch": 14.018084291187739, - "grad_norm": 0.1655324399471283, + "grad_norm": 5.951136112213135, "learning_rate": 7.79906343124734e-06, - "loss": 2.3854, + "loss": 2.5422, "step": 3890 }, { "epoch": 14.018850574712644, - "grad_norm": 33.04777908325195, + "grad_norm": 53.99986267089844, "learning_rate": 7.790549169859515e-06, - "loss": 1.3176, + "loss": 1.5943, "step": 3900 }, { "epoch": 14.019616858237548, - "grad_norm": 0.16066475212574005, + "grad_norm": 0.1597384214401245, "learning_rate": 7.78203490847169e-06, - "loss": 0.0108, + "loss": 0.0047, "step": 3910 }, { "epoch": 14.02, - "eval_accuracy": 0.6222222222222222, - "eval_loss": 1.6286754608154297, - "eval_runtime": 18.084, - "eval_samples_per_second": 2.488, - "eval_steps_per_second": 2.488, + "eval_accuracy": 0.6, + "eval_loss": 1.9807076454162598, + "eval_runtime": 17.0523, + "eval_samples_per_second": 2.639, + "eval_steps_per_second": 2.639, "step": 3915 }, { "epoch": 15.000383141762452, - "grad_norm": 0.07879719883203506, + "grad_norm": 0.03543298691511154, "learning_rate": 7.773520647083867e-06, - "loss": 1.4207, + "loss": 1.3077, "step": 3920 }, { "epoch": 15.001149425287357, - "grad_norm": 0.07686347514390945, + "grad_norm": 0.03290474787354469, "learning_rate": 7.765006385696042e-06, - "loss": 0.0183, + "loss": 0.0499, "step": 3930 }, { "epoch": 15.00191570881226, - "grad_norm": 0.03065226413309574, + "grad_norm": 0.055637381970882416, "learning_rate": 7.756492124308217e-06, - "loss": 0.8859, + "loss": 0.0992, "step": 3940 }, { "epoch": 15.002681992337164, - "grad_norm": 32.234676361083984, + "grad_norm": 66.07109832763672, "learning_rate": 7.747977862920393e-06, - "loss": 0.6148, + "loss": 0.5963, "step": 3950 }, { "epoch": 15.00344827586207, - "grad_norm": 0.04462090879678726, + "grad_norm": 0.027700673788785934, "learning_rate": 7.739463601532567e-06, - "loss": 1.0223, + "loss": 1.1152, "step": 3960 }, { "epoch": 15.004214559386973, - "grad_norm": 0.3321496844291687, + "grad_norm": 0.18067725002765656, "learning_rate": 7.730949340144743e-06, - "loss": 2.1966, + "loss": 2.5902, "step": 3970 }, { "epoch": 15.004980842911877, - "grad_norm": 1.3623191118240356, + "grad_norm": 1.1892890930175781, "learning_rate": 7.722435078756918e-06, - "loss": 1.0328, + "loss": 0.5789, "step": 3980 }, { "epoch": 15.005747126436782, - "grad_norm": 0.8339823484420776, + "grad_norm": 0.12023183703422546, "learning_rate": 7.713920817369093e-06, - "loss": 1.1835, + "loss": 1.0641, "step": 3990 }, { "epoch": 15.006513409961686, - "grad_norm": 0.9304522275924683, + "grad_norm": 0.3352854549884796, "learning_rate": 7.70540655598127e-06, - "loss": 0.0508, + "loss": 0.1204, "step": 4000 }, { "epoch": 15.00727969348659, - "grad_norm": 0.6551979184150696, + "grad_norm": 0.39131391048431396, "learning_rate": 7.696892294593445e-06, - "loss": 0.1281, + "loss": 0.7133, "step": 4010 }, { "epoch": 15.008045977011495, - "grad_norm": 0.05935044214129448, + "grad_norm": 0.03684370592236519, "learning_rate": 7.68837803320562e-06, - "loss": 0.4982, + "loss": 0.3149, "step": 4020 }, { "epoch": 15.008812260536398, - "grad_norm": 0.015151155181229115, + "grad_norm": 0.039831217378377914, "learning_rate": 7.679863771817797e-06, - "loss": 1.9095, + "loss": 1.7755, "step": 4030 }, { "epoch": 15.009578544061302, - "grad_norm": 12.166438102722168, + "grad_norm": 0.21220111846923828, "learning_rate": 7.67134951042997e-06, - "loss": 1.1904, + "loss": 1.0871, "step": 4040 }, { "epoch": 15.010344827586207, - "grad_norm": 0.4404546618461609, + "grad_norm": 0.2724530100822449, "learning_rate": 7.662835249042147e-06, - "loss": 0.456, + "loss": 1.1602, "step": 4050 }, { "epoch": 15.011111111111111, - "grad_norm": 0.03643851727247238, + "grad_norm": 0.026120534166693687, "learning_rate": 7.654320987654322e-06, - "loss": 0.7266, + "loss": 0.731, "step": 4060 }, { "epoch": 15.011877394636015, - "grad_norm": 1.1919721364974976, + "grad_norm": 0.0993092805147171, "learning_rate": 7.645806726266497e-06, - "loss": 0.3171, + "loss": 0.0082, "step": 4070 }, { "epoch": 15.01264367816092, - "grad_norm": 0.031446754932403564, + "grad_norm": 0.05011637881398201, "learning_rate": 7.637292464878673e-06, - "loss": 1.272, + "loss": 1.3451, "step": 4080 }, { "epoch": 15.013409961685824, - "grad_norm": 0.03175445646047592, + "grad_norm": 0.0289269108325243, "learning_rate": 7.6287782034908475e-06, - "loss": 0.5464, + "loss": 0.5778, "step": 4090 }, { "epoch": 15.014176245210727, - "grad_norm": 0.028192123398184776, + "grad_norm": 0.02133866772055626, "learning_rate": 7.620263942103023e-06, - "loss": 0.4719, + "loss": 0.9076, "step": 4100 }, { "epoch": 15.014942528735633, - "grad_norm": 0.060712672770023346, + "grad_norm": 0.03280365839600563, "learning_rate": 7.611749680715198e-06, - "loss": 0.2417, + "loss": 0.3617, "step": 4110 }, { "epoch": 15.015708812260536, - "grad_norm": 0.030363334342837334, + "grad_norm": 0.01994476653635502, "learning_rate": 7.603235419327374e-06, - "loss": 0.1247, + "loss": 0.2292, "step": 4120 }, { "epoch": 15.01647509578544, - "grad_norm": 0.021081579849123955, + "grad_norm": 0.024172259494662285, "learning_rate": 7.59472115793955e-06, - "loss": 1.1511, + "loss": 1.2316, "step": 4130 }, { "epoch": 15.017241379310345, - "grad_norm": 0.014939953573048115, + "grad_norm": 0.015413536690175533, "learning_rate": 7.586206896551724e-06, - "loss": 0.6771, + "loss": 0.5094, "step": 4140 }, { "epoch": 15.018007662835249, - "grad_norm": 0.23332993686199188, + "grad_norm": 0.13066789507865906, "learning_rate": 7.5776926351639e-06, - "loss": 1.1971, + "loss": 0.6741, "step": 4150 }, { "epoch": 15.018773946360152, - "grad_norm": 155.1028289794922, + "grad_norm": 226.49913024902344, "learning_rate": 7.569178373776076e-06, - "loss": 0.992, + "loss": 1.0478, "step": 4160 }, { "epoch": 15.019540229885058, - "grad_norm": 1.1900966167449951, + "grad_norm": 0.0611235685646534, "learning_rate": 7.560664112388251e-06, - "loss": 1.0271, + "loss": 0.0406, "step": 4170 }, { "epoch": 15.02, - "eval_accuracy": 0.6, - "eval_loss": 1.7293899059295654, - "eval_runtime": 17.49, - "eval_samples_per_second": 2.573, - "eval_steps_per_second": 2.573, + "eval_accuracy": 0.6888888888888889, + "eval_loss": 1.6698917150497437, + "eval_runtime": 16.9965, + "eval_samples_per_second": 2.648, + "eval_steps_per_second": 2.648, "step": 4176 }, { "epoch": 16.00030651340996, - "grad_norm": 0.024643549695611, + "grad_norm": 0.8609100580215454, "learning_rate": 7.552149851000427e-06, - "loss": 0.5293, + "loss": 0.4177, "step": 4180 }, { "epoch": 16.001072796934867, - "grad_norm": 0.047146569937467575, + "grad_norm": 0.029161520302295685, "learning_rate": 7.543635589612601e-06, - "loss": 0.4614, + "loss": 0.8359, "step": 4190 }, { "epoch": 16.00183908045977, - "grad_norm": 0.044430337846279144, + "grad_norm": 0.01977994292974472, "learning_rate": 7.535121328224777e-06, - "loss": 0.058, + "loss": 0.484, "step": 4200 }, { "epoch": 16.002605363984674, - "grad_norm": 6.5089311599731445, + "grad_norm": 0.10432259738445282, "learning_rate": 7.5266070668369525e-06, - "loss": 0.1206, + "loss": 0.0009, "step": 4210 }, { "epoch": 16.003371647509578, - "grad_norm": 0.025545841082930565, + "grad_norm": 0.014178512617945671, "learning_rate": 7.5180928054491275e-06, - "loss": 0.5964, + "loss": 0.6217, "step": 4220 }, { "epoch": 16.00413793103448, - "grad_norm": 0.014360751025378704, + "grad_norm": 0.007755284197628498, "learning_rate": 7.509578544061303e-06, - "loss": 1.2176, + "loss": 0.4466, "step": 4230 }, { "epoch": 16.00490421455939, - "grad_norm": 0.14016897976398468, + "grad_norm": 0.24808944761753082, "learning_rate": 7.501064282673479e-06, - "loss": 0.0021, + "loss": 0.0037, "step": 4240 }, { "epoch": 16.005670498084292, - "grad_norm": 0.03418239578604698, + "grad_norm": 0.06383653730154037, "learning_rate": 7.492550021285654e-06, - "loss": 0.6907, + "loss": 0.0045, "step": 4250 }, { "epoch": 16.006436781609196, - "grad_norm": 0.14730603992938995, + "grad_norm": 0.4502498209476471, "learning_rate": 7.48403575989783e-06, - "loss": 0.0009, + "loss": 0.0008, "step": 4260 }, { "epoch": 16.0072030651341, - "grad_norm": 0.11038258671760559, + "grad_norm": 0.401091605424881, "learning_rate": 7.475521498510004e-06, - "loss": 2.4944, + "loss": 1.9097, "step": 4270 }, { "epoch": 16.007969348659003, - "grad_norm": 2.471550226211548, + "grad_norm": 33.240867614746094, "learning_rate": 7.46700723712218e-06, - "loss": 1.6771, + "loss": 2.2527, "step": 4280 }, { "epoch": 16.008735632183907, - "grad_norm": 0.05922994762659073, + "grad_norm": 0.0071189249865710735, "learning_rate": 7.458492975734356e-06, - "loss": 0.8291, + "loss": 0.7757, "step": 4290 }, { "epoch": 16.009501915708814, - "grad_norm": 0.16022120416164398, + "grad_norm": 5.007058143615723, "learning_rate": 7.449978714346531e-06, - "loss": 0.4346, + "loss": 0.4985, "step": 4300 }, { "epoch": 16.010268199233717, - "grad_norm": 0.250328004360199, + "grad_norm": 0.4576053321361542, "learning_rate": 7.441464452958707e-06, - "loss": 1.7416, + "loss": 2.7173, "step": 4310 }, { "epoch": 16.01103448275862, - "grad_norm": 0.0273395087569952, + "grad_norm": 1.0771870613098145, "learning_rate": 7.4329501915708825e-06, - "loss": 0.871, + "loss": 0.6737, "step": 4320 }, { "epoch": 16.011800766283525, - "grad_norm": 2.170623302459717, + "grad_norm": 0.03956464305520058, "learning_rate": 7.4244359301830575e-06, - "loss": 0.7203, + "loss": 0.4047, "step": 4330 }, { "epoch": 16.01256704980843, - "grad_norm": 0.20951783657073975, + "grad_norm": 0.036584869027137756, "learning_rate": 7.4159216687952325e-06, - "loss": 0.5987, + "loss": 0.3712, "step": 4340 }, { "epoch": 16.013333333333332, - "grad_norm": 1.3539113998413086, + "grad_norm": 344.5559387207031, "learning_rate": 7.4074074074074075e-06, - "loss": 0.9079, + "loss": 0.8361, "step": 4350 }, { "epoch": 16.01409961685824, - "grad_norm": 0.042872458696365356, + "grad_norm": 0.011696291156113148, "learning_rate": 7.398893146019583e-06, - "loss": 0.0098, + "loss": 0.0092, "step": 4360 }, { "epoch": 16.014865900383143, - "grad_norm": 0.012476401403546333, + "grad_norm": 0.007536092773079872, "learning_rate": 7.390378884631759e-06, - "loss": 0.5032, + "loss": 0.17, "step": 4370 }, { "epoch": 16.015632183908046, - "grad_norm": 0.021217862144112587, + "grad_norm": 0.01246755663305521, "learning_rate": 7.381864623243934e-06, - "loss": 1.2424, + "loss": 1.0016, "step": 4380 }, { "epoch": 16.01639846743295, - "grad_norm": 52.91742706298828, + "grad_norm": 422.7257995605469, "learning_rate": 7.37335036185611e-06, - "loss": 1.1905, + "loss": 0.4662, "step": 4390 }, { "epoch": 16.017164750957853, - "grad_norm": 1.552392601966858, + "grad_norm": 5.647126197814941, "learning_rate": 7.364836100468284e-06, - "loss": 1.0229, + "loss": 0.905, "step": 4400 }, { "epoch": 16.017931034482757, - "grad_norm": 66.9126205444336, + "grad_norm": 50.1041374206543, "learning_rate": 7.35632183908046e-06, - "loss": 1.4453, + "loss": 1.5094, "step": 4410 }, { "epoch": 16.018697318007664, - "grad_norm": 0.2739672362804413, + "grad_norm": 0.12542405724525452, "learning_rate": 7.347807577692636e-06, - "loss": 0.5169, + "loss": 0.6731, "step": 4420 }, { "epoch": 16.019463601532568, - "grad_norm": 0.13632571697235107, + "grad_norm": 0.06219151243567467, "learning_rate": 7.339293316304811e-06, - "loss": 0.6963, + "loss": 1.102, "step": 4430 }, { "epoch": 16.02, - "eval_accuracy": 0.6, - "eval_loss": 1.8978708982467651, - "eval_runtime": 17.4877, - "eval_samples_per_second": 2.573, - "eval_steps_per_second": 2.573, + "eval_accuracy": 0.5777777777777777, + "eval_loss": 1.7952759265899658, + "eval_runtime": 15.3846, + "eval_samples_per_second": 2.925, + "eval_steps_per_second": 2.925, "step": 4437 }, { "epoch": 17.000229885057472, - "grad_norm": 0.20518171787261963, + "grad_norm": 0.0784146785736084, "learning_rate": 7.330779054916987e-06, - "loss": 0.7537, + "loss": 0.4915, "step": 4440 }, { "epoch": 17.000996168582375, - "grad_norm": 97.11636352539062, + "grad_norm": 116.78779602050781, "learning_rate": 7.3222647935291625e-06, - "loss": 1.6449, + "loss": 1.0051, "step": 4450 }, { "epoch": 17.00176245210728, - "grad_norm": 0.18592728674411774, + "grad_norm": 0.1718118041753769, "learning_rate": 7.3137505321413375e-06, - "loss": 0.6275, + "loss": 0.5452, "step": 4460 }, { "epoch": 17.002528735632183, - "grad_norm": 0.07581646740436554, + "grad_norm": 0.029343394562602043, "learning_rate": 7.305236270753513e-06, - "loss": 1.3236, + "loss": 1.0849, "step": 4470 }, { "epoch": 17.00329501915709, - "grad_norm": 0.38306108117103577, + "grad_norm": 0.9631693363189697, "learning_rate": 7.2967220093656875e-06, - "loss": 0.0036, + "loss": 0.0028, "step": 4480 }, { "epoch": 17.004061302681993, - "grad_norm": 139.12086486816406, + "grad_norm": 30.73664665222168, "learning_rate": 7.288207747977863e-06, - "loss": 1.0287, + "loss": 0.1304, "step": 4490 }, { "epoch": 17.004827586206897, - "grad_norm": 0.13149358332157135, + "grad_norm": 0.07852865755558014, "learning_rate": 7.279693486590039e-06, - "loss": 0.2367, + "loss": 0.0362, "step": 4500 }, { "epoch": 17.0055938697318, - "grad_norm": 256.7980651855469, + "grad_norm": 0.048740509897470474, "learning_rate": 7.271179225202214e-06, - "loss": 0.2394, + "loss": 0.6225, "step": 4510 }, { "epoch": 17.006360153256704, - "grad_norm": 0.6376872062683105, + "grad_norm": 0.8835161328315735, "learning_rate": 7.26266496381439e-06, - "loss": 1.0069, + "loss": 0.5793, "step": 4520 }, { "epoch": 17.007126436781608, - "grad_norm": 186.01611328125, + "grad_norm": 127.56395721435547, "learning_rate": 7.254150702426566e-06, - "loss": 1.967, + "loss": 1.3355, "step": 4530 }, { "epoch": 17.007892720306515, - "grad_norm": 241.28439331054688, + "grad_norm": 102.07418060302734, "learning_rate": 7.24563644103874e-06, - "loss": 0.7177, + "loss": 1.2969, "step": 4540 }, { "epoch": 17.00865900383142, - "grad_norm": 0.01180502213537693, + "grad_norm": 0.009165309369564056, "learning_rate": 7.237122179650916e-06, - "loss": 0.0431, + "loss": 0.4298, "step": 4550 }, { "epoch": 17.009425287356322, - "grad_norm": 37.48667907714844, + "grad_norm": 60.98646926879883, "learning_rate": 7.228607918263091e-06, - "loss": 1.4961, + "loss": 1.0061, "step": 4560 }, { "epoch": 17.010191570881226, - "grad_norm": 0.26516199111938477, + "grad_norm": 0.06397134810686111, "learning_rate": 7.220093656875267e-06, - "loss": 0.8018, + "loss": 0.4475, "step": 4570 }, { "epoch": 17.01095785440613, - "grad_norm": 0.02556721866130829, + "grad_norm": 0.009073864668607712, "learning_rate": 7.2115793954874425e-06, - "loss": 0.3108, + "loss": 0.0017, "step": 4580 }, { "epoch": 17.011724137931033, - "grad_norm": 0.010226594284176826, + "grad_norm": 0.006783429998904467, "learning_rate": 7.2030651340996175e-06, - "loss": 0.0021, + "loss": 0.0011, "step": 4590 }, { "epoch": 17.01249042145594, - "grad_norm": 1.0882664918899536, + "grad_norm": 0.35728031396865845, "learning_rate": 7.194550872711793e-06, - "loss": 1.4162, + "loss": 1.65, "step": 4600 }, { "epoch": 17.013256704980844, - "grad_norm": 0.7096887230873108, + "grad_norm": 2.305983304977417, "learning_rate": 7.1860366113239675e-06, - "loss": 0.7838, + "loss": 0.8142, "step": 4610 }, { "epoch": 17.014022988505747, - "grad_norm": 0.6446572542190552, + "grad_norm": 377.1839294433594, "learning_rate": 7.177522349936143e-06, - "loss": 0.6923, + "loss": 0.3996, "step": 4620 }, { "epoch": 17.01478927203065, - "grad_norm": 0.01274694874882698, + "grad_norm": 0.01987631246447563, "learning_rate": 7.169008088548319e-06, - "loss": 0.5459, + "loss": 0.6598, "step": 4630 }, { "epoch": 17.015555555555554, - "grad_norm": 0.6159297823905945, + "grad_norm": 0.4803782105445862, "learning_rate": 7.160493827160494e-06, - "loss": 0.4648, + "loss": 0.4526, "step": 4640 }, { "epoch": 17.016321839080458, - "grad_norm": 0.02117849886417389, + "grad_norm": 0.01265406422317028, "learning_rate": 7.15197956577267e-06, - "loss": 1.8506, + "loss": 1.8266, "step": 4650 }, { "epoch": 17.017088122605365, - "grad_norm": 0.08784272521734238, + "grad_norm": 0.06306411325931549, "learning_rate": 7.143465304384846e-06, - "loss": 1.1206, + "loss": 0.4852, "step": 4660 }, { "epoch": 17.01785440613027, - "grad_norm": 0.05527346953749657, + "grad_norm": 0.06567279994487762, "learning_rate": 7.13495104299702e-06, - "loss": 0.9203, + "loss": 1.4043, "step": 4670 }, { "epoch": 17.018620689655172, - "grad_norm": 136.2298583984375, + "grad_norm": 38.57827377319336, "learning_rate": 7.126436781609196e-06, - "loss": 0.9176, + "loss": 1.0563, "step": 4680 }, { "epoch": 17.019386973180076, - "grad_norm": 134.2930908203125, + "grad_norm": 207.01771545410156, "learning_rate": 7.117922520221371e-06, - "loss": 0.6715, + "loss": 0.6474, "step": 4690 }, { "epoch": 17.02, - "eval_accuracy": 0.6444444444444445, - "eval_loss": 1.5500869750976562, - "eval_runtime": 17.4148, - "eval_samples_per_second": 2.584, - "eval_steps_per_second": 2.584, + "eval_accuracy": 0.5555555555555556, + "eval_loss": 1.852599024772644, + "eval_runtime": 15.4388, + "eval_samples_per_second": 2.915, + "eval_steps_per_second": 2.915, "step": 4698 }, { "epoch": 18.00015325670498, - "grad_norm": 0.08759082853794098, + "grad_norm": 0.03560800477862358, "learning_rate": 7.109408258833547e-06, - "loss": 0.5428, + "loss": 0.5889, "step": 4700 }, { "epoch": 18.000919540229884, - "grad_norm": 200.1810760498047, + "grad_norm": 31.137287139892578, "learning_rate": 7.1008939974457225e-06, - "loss": 1.3501, + "loss": 0.9552, "step": 4710 }, { "epoch": 18.00168582375479, - "grad_norm": 0.02465493232011795, + "grad_norm": 0.0376298613846302, "learning_rate": 7.0923797360578975e-06, - "loss": 0.0384, + "loss": 0.2546, "step": 4720 }, { "epoch": 18.002452107279694, - "grad_norm": 0.08276358246803284, + "grad_norm": 0.08126337081193924, "learning_rate": 7.083865474670073e-06, - "loss": 0.1773, + "loss": 0.098, "step": 4730 }, { "epoch": 18.003218390804598, - "grad_norm": 0.005425894632935524, + "grad_norm": 0.007752793841063976, "learning_rate": 7.075351213282249e-06, - "loss": 0.597, + "loss": 1.3142, "step": 4740 }, { "epoch": 18.0039846743295, - "grad_norm": 0.019214922562241554, + "grad_norm": 0.012709980830550194, "learning_rate": 7.066836951894423e-06, - "loss": 0.1638, + "loss": 0.456, "step": 4750 }, { "epoch": 18.004750957854405, - "grad_norm": 0.03398166224360466, + "grad_norm": 0.019455671310424805, "learning_rate": 7.058322690506599e-06, - "loss": 1.1991, + "loss": 2.3763, "step": 4760 }, { "epoch": 18.00551724137931, - "grad_norm": 0.0638684406876564, + "grad_norm": 0.0941697284579277, "learning_rate": 7.049808429118774e-06, - "loss": 0.9111, + "loss": 0.3242, "step": 4770 }, { "epoch": 18.006283524904216, - "grad_norm": 0.010825125500559807, + "grad_norm": 0.01922181434929371, "learning_rate": 7.04129416773095e-06, - "loss": 0.1274, + "loss": 0.0027, "step": 4780 }, { "epoch": 18.00704980842912, - "grad_norm": 35.97711181640625, + "grad_norm": 283.144775390625, "learning_rate": 7.032779906343126e-06, - "loss": 1.3644, + "loss": 1.4977, "step": 4790 }, { "epoch": 18.007816091954023, - "grad_norm": 0.07964839786291122, + "grad_norm": 0.155217245221138, "learning_rate": 7.0242656449553e-06, - "loss": 0.6691, + "loss": 1.185, "step": 4800 }, { "epoch": 18.008582375478927, - "grad_norm": 18.72853660583496, + "grad_norm": 29.519556045532227, "learning_rate": 7.015751383567476e-06, - "loss": 0.4231, + "loss": 0.0117, "step": 4810 }, { "epoch": 18.00934865900383, - "grad_norm": 0.05378716066479683, + "grad_norm": 0.039598166942596436, "learning_rate": 7.007237122179652e-06, - "loss": 1.2705, + "loss": 1.1183, "step": 4820 }, { "epoch": 18.010114942528734, - "grad_norm": 0.21670959889888763, + "grad_norm": 0.935707688331604, "learning_rate": 6.998722860791827e-06, - "loss": 0.8296, + "loss": 0.8526, "step": 4830 }, { "epoch": 18.01088122605364, - "grad_norm": 0.032097745686769485, + "grad_norm": 0.04442694038152695, "learning_rate": 6.9902085994040025e-06, - "loss": 0.175, + "loss": 0.363, "step": 4840 }, { "epoch": 18.011647509578545, - "grad_norm": 4.886181831359863, + "grad_norm": 199.830322265625, "learning_rate": 6.9816943380161775e-06, - "loss": 0.1449, + "loss": 0.0711, "step": 4850 }, { "epoch": 18.01241379310345, - "grad_norm": 6.5394511222839355, + "grad_norm": 0.5979216694831848, "learning_rate": 6.973180076628353e-06, - "loss": 0.0958, + "loss": 0.9456, "step": 4860 }, { "epoch": 18.013180076628352, - "grad_norm": 0.0074285720475018024, + "grad_norm": 0.010507689788937569, "learning_rate": 6.964665815240529e-06, - "loss": 0.71, + "loss": 0.6042, "step": 4870 }, { "epoch": 18.013946360153255, - "grad_norm": 0.04040542617440224, + "grad_norm": 0.057047780603170395, "learning_rate": 6.956151553852703e-06, - "loss": 0.9357, + "loss": 0.9494, "step": 4880 }, { "epoch": 18.014712643678163, - "grad_norm": 0.2584971785545349, + "grad_norm": 0.22485439479351044, "learning_rate": 6.947637292464879e-06, - "loss": 0.0201, + "loss": 0.6999, "step": 4890 }, { "epoch": 18.015478927203066, - "grad_norm": 0.024386756122112274, + "grad_norm": 0.023178046569228172, "learning_rate": 6.939123031077054e-06, - "loss": 0.7518, + "loss": 1.0686, "step": 4900 }, { "epoch": 18.01624521072797, - "grad_norm": 0.032193202525377274, + "grad_norm": 0.04680141061544418, "learning_rate": 6.93060876968923e-06, - "loss": 0.675, + "loss": 0.9795, "step": 4910 }, { "epoch": 18.017011494252873, - "grad_norm": 0.04422587528824806, + "grad_norm": 1.6066311597824097, "learning_rate": 6.922094508301406e-06, - "loss": 1.1775, + "loss": 0.6676, "step": 4920 }, { "epoch": 18.017777777777777, - "grad_norm": 0.02131647802889347, + "grad_norm": 0.05699748173356056, "learning_rate": 6.913580246913581e-06, - "loss": 0.1404, + "loss": 0.0025, "step": 4930 }, { "epoch": 18.01854406130268, - "grad_norm": 0.042147520929574966, + "grad_norm": 0.019538363441824913, "learning_rate": 6.905065985525757e-06, - "loss": 0.84, + "loss": 1.1296, "step": 4940 }, { "epoch": 18.019310344827588, - "grad_norm": 108.7848129272461, + "grad_norm": 0.1422862559556961, "learning_rate": 6.896551724137932e-06, - "loss": 0.8801, + "loss": 0.0028, "step": 4950 }, { "epoch": 18.02, - "eval_accuracy": 0.6444444444444445, - "eval_loss": 1.5863184928894043, - "eval_runtime": 17.4396, - "eval_samples_per_second": 2.58, - "eval_steps_per_second": 2.58, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 1.4928491115570068, + "eval_runtime": 15.8638, + "eval_samples_per_second": 2.837, + "eval_steps_per_second": 2.837, "step": 4959 }, { "epoch": 19.000076628352492, - "grad_norm": 31.049474716186523, + "grad_norm": 21.753095626831055, "learning_rate": 6.888037462750107e-06, - "loss": 2.5843, + "loss": 2.7918, "step": 4960 }, { "epoch": 19.000842911877395, - "grad_norm": 110.73788452148438, + "grad_norm": 0.014949675649404526, "learning_rate": 6.8795232013622825e-06, - "loss": 0.9921, + "loss": 1.433, "step": 4970 }, { "epoch": 19.0016091954023, - "grad_norm": 10.381717681884766, + "grad_norm": 6.155599117279053, "learning_rate": 6.8710089399744575e-06, - "loss": 0.6567, + "loss": 1.6036, "step": 4980 }, { "epoch": 19.002375478927203, - "grad_norm": 0.020820654928684235, + "grad_norm": 0.015890881419181824, "learning_rate": 6.862494678586633e-06, - "loss": 0.0987, + "loss": 0.1889, "step": 4990 }, { "epoch": 19.003141762452106, - "grad_norm": 0.012431160546839237, + "grad_norm": 0.019246898591518402, "learning_rate": 6.853980417198809e-06, - "loss": 0.289, + "loss": 0.1353, "step": 5000 }, { "epoch": 19.00390804597701, - "grad_norm": 0.14668601751327515, + "grad_norm": 0.4525849223136902, "learning_rate": 6.845466155810983e-06, - "loss": 0.8666, + "loss": 1.0706, "step": 5010 }, { "epoch": 19.004674329501917, - "grad_norm": 0.22783324122428894, + "grad_norm": 1.6934255361557007, "learning_rate": 6.836951894423159e-06, - "loss": 0.0247, + "loss": 0.0105, "step": 5020 }, { "epoch": 19.00544061302682, - "grad_norm": 0.011404894292354584, + "grad_norm": 0.009208932518959045, "learning_rate": 6.828437633035335e-06, - "loss": 0.7899, + "loss": 0.792, "step": 5030 }, { "epoch": 19.006206896551724, - "grad_norm": 5.1333160400390625, + "grad_norm": 0.9257282018661499, "learning_rate": 6.81992337164751e-06, - "loss": 1.1187, + "loss": 0.8427, "step": 5040 }, { "epoch": 19.006973180076628, - "grad_norm": 0.008147607557475567, + "grad_norm": 0.0056198216043412685, "learning_rate": 6.811409110259686e-06, - "loss": 1.1645, + "loss": 1.8075, "step": 5050 }, { "epoch": 19.00773946360153, - "grad_norm": 91.47675323486328, + "grad_norm": 71.06964111328125, "learning_rate": 6.802894848871861e-06, - "loss": 0.2399, + "loss": 0.0455, "step": 5060 }, { "epoch": 19.00850574712644, - "grad_norm": 61.98637390136719, + "grad_norm": 0.10512993484735489, "learning_rate": 6.794380587484037e-06, - "loss": 0.9977, + "loss": 0.0012, "step": 5070 }, { "epoch": 19.009272030651342, - "grad_norm": 0.013683969154953957, + "grad_norm": 0.016775798052549362, "learning_rate": 6.7858663260962125e-06, - "loss": 2.1461, + "loss": 0.0029, "step": 5080 }, { "epoch": 19.010038314176246, - "grad_norm": 0.19238358736038208, + "grad_norm": 0.049808647483587265, "learning_rate": 6.777352064708387e-06, - "loss": 0.5892, + "loss": 0.587, "step": 5090 }, { "epoch": 19.01080459770115, - "grad_norm": 342.3090515136719, + "grad_norm": 0.1620670109987259, "learning_rate": 6.7688378033205625e-06, - "loss": 0.5315, + "loss": 0.9468, "step": 5100 }, { "epoch": 19.011570881226053, - "grad_norm": 0.03742855042219162, + "grad_norm": 0.032429538667201996, "learning_rate": 6.760323541932738e-06, - "loss": 0.5299, + "loss": 0.613, "step": 5110 }, { "epoch": 19.012337164750956, - "grad_norm": 0.036991048604249954, + "grad_norm": 0.04681113362312317, "learning_rate": 6.751809280544913e-06, - "loss": 0.0807, + "loss": 1.2025, "step": 5120 }, { "epoch": 19.013103448275864, - "grad_norm": 175.8551483154297, + "grad_norm": 207.49671936035156, "learning_rate": 6.743295019157089e-06, - "loss": 0.9554, + "loss": 0.9359, "step": 5130 }, { "epoch": 19.013869731800767, - "grad_norm": 0.9708214402198792, + "grad_norm": 0.34224358201026917, "learning_rate": 6.734780757769263e-06, - "loss": 0.7698, + "loss": 0.9068, "step": 5140 }, { "epoch": 19.01463601532567, - "grad_norm": 0.022618252784013748, + "grad_norm": 0.05905255675315857, "learning_rate": 6.726266496381439e-06, - "loss": 0.0227, + "loss": 0.0026, "step": 5150 }, { "epoch": 19.015402298850574, - "grad_norm": 0.1351953148841858, + "grad_norm": 0.0715297982096672, "learning_rate": 6.717752234993615e-06, - "loss": 0.6536, + "loss": 0.784, "step": 5160 }, { "epoch": 19.016168582375478, - "grad_norm": 0.02280157431960106, + "grad_norm": 0.013215141370892525, "learning_rate": 6.70923797360579e-06, - "loss": 0.0011, + "loss": 0.0016, "step": 5170 }, { "epoch": 19.01693486590038, - "grad_norm": 0.022669143974781036, + "grad_norm": 0.021521002054214478, "learning_rate": 6.700723712217966e-06, - "loss": 0.1908, + "loss": 0.4395, "step": 5180 }, { "epoch": 19.01770114942529, - "grad_norm": 0.00702593382447958, + "grad_norm": 0.005349059123545885, "learning_rate": 6.692209450830141e-06, - "loss": 0.0008, + "loss": 0.9613, "step": 5190 }, { "epoch": 19.018467432950192, - "grad_norm": 0.015154541470110416, + "grad_norm": 0.026421790942549706, "learning_rate": 6.683695189442317e-06, - "loss": 1.4747, + "loss": 1.2396, "step": 5200 }, { "epoch": 19.019233716475096, - "grad_norm": 0.08880870789289474, + "grad_norm": 2.56968092918396, "learning_rate": 6.6751809280544925e-06, - "loss": 1.3583, + "loss": 0.0646, "step": 5210 }, { "epoch": 19.02, - "grad_norm": 50.471961975097656, + "grad_norm": 45.920440673828125, "learning_rate": 6.666666666666667e-06, - "loss": 0.5135, + "loss": 1.1598, "step": 5220 }, { "epoch": 19.02, "eval_accuracy": 0.6222222222222222, - "eval_loss": 2.0611658096313477, - "eval_runtime": 17.4592, - "eval_samples_per_second": 2.577, - "eval_steps_per_second": 2.577, + "eval_loss": 2.2158331871032715, + "eval_runtime": 15.453, + "eval_samples_per_second": 2.912, + "eval_steps_per_second": 2.912, "step": 5220 }, { "epoch": 20.000766283524904, - "grad_norm": 0.18206465244293213, + "grad_norm": 0.17777974903583527, "learning_rate": 6.6581524052788425e-06, - "loss": 0.0018, + "loss": 0.0008, "step": 5230 }, { "epoch": 20.001532567049807, - "grad_norm": 0.06915395706892014, + "grad_norm": 0.03575889766216278, "learning_rate": 6.649638143891018e-06, - "loss": 0.8677, + "loss": 0.5296, "step": 5240 }, { "epoch": 20.002298850574714, - "grad_norm": 0.030006706714630127, + "grad_norm": 0.012495114468038082, "learning_rate": 6.641123882503193e-06, - "loss": 0.7953, + "loss": 0.5274, "step": 5250 }, { "epoch": 20.003065134099618, - "grad_norm": 0.04157086834311485, + "grad_norm": 0.14885152876377106, "learning_rate": 6.632609621115369e-06, - "loss": 0.0057, + "loss": 0.0026, "step": 5260 }, { "epoch": 20.00383141762452, - "grad_norm": 1.9022800922393799, + "grad_norm": 0.18088242411613464, "learning_rate": 6.624095359727543e-06, - "loss": 1.9501, + "loss": 1.1936, "step": 5270 }, { "epoch": 20.004597701149425, - "grad_norm": 437.72930908203125, + "grad_norm": 0.021175753325223923, "learning_rate": 6.615581098339719e-06, - "loss": 1.0965, + "loss": 0.6022, "step": 5280 }, { "epoch": 20.00536398467433, - "grad_norm": 28.722957611083984, + "grad_norm": 533.0197143554688, "learning_rate": 6.607066836951895e-06, - "loss": 1.7672, + "loss": 0.8877, "step": 5290 }, { "epoch": 20.006130268199232, - "grad_norm": 0.23794914782047272, + "grad_norm": 0.19679294526576996, "learning_rate": 6.59855257556407e-06, - "loss": 0.835, + "loss": 0.1651, "step": 5300 }, { "epoch": 20.00689655172414, - "grad_norm": 0.22249621152877808, + "grad_norm": 0.06063767150044441, "learning_rate": 6.590038314176246e-06, - "loss": 0.2304, + "loss": 0.2973, "step": 5310 }, { "epoch": 20.007662835249043, - "grad_norm": 0.013976133428514004, + "grad_norm": 0.03280027583241463, "learning_rate": 6.581524052788422e-06, - "loss": 0.5446, + "loss": 0.5431, "step": 5320 }, { "epoch": 20.008429118773947, - "grad_norm": 0.005734669975936413, + "grad_norm": 0.006832515355199575, "learning_rate": 6.573009791400597e-06, - "loss": 0.0845, + "loss": 0.5336, "step": 5330 }, { "epoch": 20.00919540229885, - "grad_norm": 340.7713317871094, + "grad_norm": 0.03919576480984688, "learning_rate": 6.5644955300127725e-06, - "loss": 0.6382, + "loss": 0.6281, "step": 5340 }, { "epoch": 20.009961685823754, - "grad_norm": 4.846546173095703, + "grad_norm": 0.6132537126541138, "learning_rate": 6.555981268624947e-06, - "loss": 0.0106, + "loss": 0.5227, "step": 5350 }, { "epoch": 20.010727969348657, - "grad_norm": 0.006838405970484018, + "grad_norm": 0.007301602512598038, "learning_rate": 6.5474670072371225e-06, - "loss": 0.0028, + "loss": 0.0007, "step": 5360 }, { "epoch": 20.011494252873565, - "grad_norm": 0.010414429940283298, + "grad_norm": 0.012126888148486614, "learning_rate": 6.538952745849298e-06, - "loss": 0.0027, + "loss": 0.6349, "step": 5370 }, { "epoch": 20.01226053639847, - "grad_norm": 84.16322326660156, + "grad_norm": 50.73347854614258, "learning_rate": 6.530438484461473e-06, - "loss": 0.9129, + "loss": 0.7841, "step": 5380 }, { "epoch": 20.013026819923372, - "grad_norm": 0.005516368895769119, + "grad_norm": 0.007847919128835201, "learning_rate": 6.521924223073649e-06, - "loss": 0.9004, + "loss": 1.3938, "step": 5390 }, { "epoch": 20.013793103448275, - "grad_norm": 0.015512916259467602, + "grad_norm": 0.019021129235625267, "learning_rate": 6.513409961685824e-06, - "loss": 0.8055, + "loss": 0.2412, "step": 5400 }, { "epoch": 20.01455938697318, - "grad_norm": 0.008046404458582401, + "grad_norm": 0.00958295352756977, "learning_rate": 6.504895700297999e-06, - "loss": 0.6289, + "loss": 0.821, "step": 5410 }, { "epoch": 20.015325670498083, - "grad_norm": 0.08862919360399246, + "grad_norm": 272.3012390136719, "learning_rate": 6.496381438910175e-06, - "loss": 0.0044, + "loss": 1.1645, "step": 5420 }, { "epoch": 20.01609195402299, - "grad_norm": 32.8420295715332, + "grad_norm": 60.118404388427734, "learning_rate": 6.48786717752235e-06, - "loss": 1.5779, + "loss": 0.7353, "step": 5430 }, { "epoch": 20.016858237547893, - "grad_norm": 0.11765594035387039, + "grad_norm": 0.03267025575041771, "learning_rate": 6.479352916134526e-06, - "loss": 1.1985, + "loss": 0.2205, "step": 5440 }, { "epoch": 20.017624521072797, - "grad_norm": 206.24264526367188, + "grad_norm": 252.74171447753906, "learning_rate": 6.470838654746702e-06, - "loss": 1.4409, + "loss": 2.0702, "step": 5450 }, { "epoch": 20.0183908045977, - "grad_norm": 0.022082997485995293, + "grad_norm": 0.02057657577097416, "learning_rate": 6.462324393358877e-06, - "loss": 0.6816, + "loss": 0.6836, "step": 5460 }, { "epoch": 20.019157088122604, - "grad_norm": 441.4811096191406, + "grad_norm": 51.96356201171875, "learning_rate": 6.4538101319710525e-06, - "loss": 0.9604, + "loss": 1.9424, "step": 5470 }, { "epoch": 20.01992337164751, - "grad_norm": 0.007893228903412819, + "grad_norm": 0.00483592739328742, "learning_rate": 6.445295870583227e-06, - "loss": 0.0037, + "loss": 0.0028, "step": 5480 }, { "epoch": 20.02, "eval_accuracy": 0.6444444444444445, - "eval_loss": 1.9002490043640137, - "eval_runtime": 19.3577, - "eval_samples_per_second": 2.325, - "eval_steps_per_second": 2.325, + "eval_loss": 1.4303712844848633, + "eval_runtime": 15.4647, + "eval_samples_per_second": 2.91, + "eval_steps_per_second": 2.91, "step": 5481 }, { "epoch": 21.000689655172415, - "grad_norm": 0.08814794570207596, + "grad_norm": 0.01464229915291071, "learning_rate": 6.4367816091954025e-06, - "loss": 0.5559, + "loss": 0.1482, "step": 5490 }, { "epoch": 21.00145593869732, - "grad_norm": 0.08200307190418243, + "grad_norm": 0.26727554202079773, "learning_rate": 6.428267347807578e-06, - "loss": 0.0121, + "loss": 0.1216, "step": 5500 }, { "epoch": 21.002222222222223, - "grad_norm": 108.10767364501953, + "grad_norm": 0.09710312634706497, "learning_rate": 6.419753086419753e-06, - "loss": 1.0758, + "loss": 0.5545, "step": 5510 }, { "epoch": 21.002988505747126, - "grad_norm": 0.012141243554651737, + "grad_norm": 0.0101077351719141, "learning_rate": 6.411238825031929e-06, - "loss": 1.4691, + "loss": 0.2761, "step": 5520 }, { "epoch": 21.00375478927203, - "grad_norm": 25.48621940612793, + "grad_norm": 63.09801483154297, "learning_rate": 6.402724563644105e-06, - "loss": 1.1223, + "loss": 2.0883, "step": 5530 }, { "epoch": 21.004521072796933, - "grad_norm": 0.5009303092956543, + "grad_norm": 0.5077385306358337, "learning_rate": 6.39421030225628e-06, - "loss": 0.0077, + "loss": 0.3424, "step": 5540 }, { "epoch": 21.00528735632184, - "grad_norm": 0.28618407249450684, + "grad_norm": 1.1730066537857056, "learning_rate": 6.385696040868455e-06, - "loss": 0.963, + "loss": 1.404, "step": 5550 }, { "epoch": 21.006053639846744, - "grad_norm": 0.20507201552391052, + "grad_norm": 0.9260103702545166, "learning_rate": 6.37718177948063e-06, - "loss": 0.7039, + "loss": 0.6858, "step": 5560 }, { "epoch": 21.006819923371648, - "grad_norm": 121.64978790283203, + "grad_norm": 29.845083236694336, "learning_rate": 6.368667518092806e-06, - "loss": 0.3956, + "loss": 0.0246, "step": 5570 }, { "epoch": 21.00758620689655, - "grad_norm": 0.007131490856409073, + "grad_norm": 0.0050223045982420444, "learning_rate": 6.360153256704982e-06, - "loss": 0.0067, + "loss": 0.8801, "step": 5580 }, { "epoch": 21.008352490421455, - "grad_norm": 0.010343499481678009, + "grad_norm": 0.003190841292962432, "learning_rate": 6.351638995317157e-06, - "loss": 0.0025, + "loss": 0.2213, "step": 5590 }, { "epoch": 21.00911877394636, - "grad_norm": 0.10232390463352203, + "grad_norm": 0.04965739697217941, "learning_rate": 6.3431247339293325e-06, - "loss": 0.578, + "loss": 0.6114, "step": 5600 }, { "epoch": 21.009885057471266, - "grad_norm": 0.2203950732946396, + "grad_norm": 0.12387219816446304, "learning_rate": 6.334610472541508e-06, - "loss": 1.0181, + "loss": 0.2833, "step": 5610 }, { "epoch": 21.01065134099617, - "grad_norm": 0.018481368198990822, + "grad_norm": 0.08594133704900742, "learning_rate": 6.3260962111536825e-06, - "loss": 0.4198, + "loss": 1.4213, "step": 5620 }, { "epoch": 21.011417624521073, - "grad_norm": 0.0661628246307373, + "grad_norm": 0.0800381451845169, "learning_rate": 6.317581949765858e-06, - "loss": 1.775, + "loss": 1.4918, "step": 5630 }, { "epoch": 21.012183908045976, - "grad_norm": 0.006107209250330925, + "grad_norm": 0.024857068434357643, "learning_rate": 6.309067688378033e-06, - "loss": 0.9482, + "loss": 1.4149, "step": 5640 }, { "epoch": 21.01295019157088, - "grad_norm": 0.35050517320632935, + "grad_norm": 0.1516677886247635, "learning_rate": 6.300553426990209e-06, - "loss": 0.3865, + "loss": 0.7059, "step": 5650 }, { "epoch": 21.013716475095784, - "grad_norm": 0.03160562366247177, + "grad_norm": 0.12439776957035065, "learning_rate": 6.292039165602385e-06, - "loss": 0.4907, + "loss": 0.8376, "step": 5660 }, { "epoch": 21.01448275862069, - "grad_norm": 0.06428388506174088, + "grad_norm": 0.05973615124821663, "learning_rate": 6.28352490421456e-06, - "loss": 1.2824, + "loss": 1.4964, "step": 5670 }, { "epoch": 21.015249042145594, - "grad_norm": 81.23246765136719, + "grad_norm": 37.89508819580078, "learning_rate": 6.275010642826736e-06, - "loss": 0.5794, + "loss": 0.0098, "step": 5680 }, { "epoch": 21.016015325670498, - "grad_norm": 0.019799837842583656, + "grad_norm": 0.005949839949607849, "learning_rate": 6.26649638143891e-06, - "loss": 0.5543, + "loss": 0.4408, "step": 5690 }, { "epoch": 21.0167816091954, - "grad_norm": 4.6948699951171875, + "grad_norm": 0.5323353409767151, "learning_rate": 6.257982120051086e-06, - "loss": 0.9855, + "loss": 0.8612, "step": 5700 }, { "epoch": 21.017547892720305, - "grad_norm": 3.360642671585083, + "grad_norm": 4.138088703155518, "learning_rate": 6.249467858663262e-06, - "loss": 0.4768, + "loss": 0.6564, "step": 5710 }, { "epoch": 21.018314176245212, - "grad_norm": 76.03146362304688, + "grad_norm": 82.76377868652344, "learning_rate": 6.240953597275437e-06, - "loss": 1.8223, + "loss": 1.6672, "step": 5720 }, { "epoch": 21.019080459770116, - "grad_norm": 0.036601677536964417, + "grad_norm": 0.010225473903119564, "learning_rate": 6.2324393358876125e-06, - "loss": 0.1584, + "loss": 0.0353, "step": 5730 }, { "epoch": 21.01984674329502, - "grad_norm": 0.01161945704370737, + "grad_norm": 0.013867017813026905, "learning_rate": 6.223925074499788e-06, - "loss": 0.9928, + "loss": 0.6598, "step": 5740 }, { "epoch": 21.02, - "eval_accuracy": 0.6, - "eval_loss": 1.6705849170684814, - "eval_runtime": 18.597, - "eval_samples_per_second": 2.42, - "eval_steps_per_second": 2.42, + "eval_accuracy": 0.6222222222222222, + "eval_loss": 1.8181201219558716, + "eval_runtime": 15.4793, + "eval_samples_per_second": 2.907, + "eval_steps_per_second": 2.907, "step": 5742 }, { "epoch": 22.000613026819924, - "grad_norm": 0.024481451138854027, + "grad_norm": 0.010380334220826626, "learning_rate": 6.2154108131119625e-06, - "loss": 0.3027, + "loss": 0.0102, "step": 5750 }, { "epoch": 22.001379310344827, - "grad_norm": 0.005778738763183355, + "grad_norm": 0.004199375864118338, "learning_rate": 6.206896551724138e-06, - "loss": 0.2184, + "loss": 0.9498, "step": 5760 }, { "epoch": 22.00214559386973, - "grad_norm": 0.08417612314224243, + "grad_norm": 0.12899409234523773, "learning_rate": 6.198382290336313e-06, - "loss": 0.3902, + "loss": 0.428, "step": 5770 }, { "epoch": 22.002911877394634, - "grad_norm": 0.11177084594964981, + "grad_norm": 0.041885145008563995, "learning_rate": 6.189868028948489e-06, - "loss": 1.4985, + "loss": 1.6532, "step": 5780 }, { "epoch": 22.00367816091954, - "grad_norm": 0.10094741731882095, + "grad_norm": 0.036745183169841766, "learning_rate": 6.181353767560665e-06, - "loss": 1.0908, + "loss": 0.5414, "step": 5790 }, { "epoch": 22.004444444444445, - "grad_norm": 0.01703103631734848, + "grad_norm": 0.01918080262839794, "learning_rate": 6.17283950617284e-06, - "loss": 1.7918, + "loss": 0.0043, "step": 5800 }, { "epoch": 22.00521072796935, - "grad_norm": 0.19841298460960388, + "grad_norm": 87.94882202148438, "learning_rate": 6.164325244785016e-06, - "loss": 0.5025, + "loss": 1.0764, "step": 5810 }, { "epoch": 22.005977011494252, - "grad_norm": 0.04886206239461899, + "grad_norm": 0.34861791133880615, "learning_rate": 6.155810983397192e-06, - "loss": 0.278, + "loss": 0.3269, "step": 5820 }, { "epoch": 22.006743295019156, - "grad_norm": 0.05953004211187363, + "grad_norm": 0.06668228656053543, "learning_rate": 6.147296722009366e-06, - "loss": 1.332, + "loss": 0.6777, "step": 5830 }, { "epoch": 22.00750957854406, - "grad_norm": 50.13103103637695, + "grad_norm": 295.6942443847656, "learning_rate": 6.138782460621542e-06, - "loss": 0.0362, + "loss": 0.2545, "step": 5840 }, { "epoch": 22.008275862068967, - "grad_norm": 0.010094068013131618, + "grad_norm": 0.03726373612880707, "learning_rate": 6.130268199233717e-06, - "loss": 0.0016, + "loss": 0.6615, "step": 5850 }, { "epoch": 22.00904214559387, - "grad_norm": 4.429050445556641, + "grad_norm": 11.273825645446777, "learning_rate": 6.1217539378458925e-06, - "loss": 0.4675, + "loss": 0.0115, "step": 5860 }, { "epoch": 22.009808429118774, - "grad_norm": 93.92876434326172, + "grad_norm": 64.4465103149414, "learning_rate": 6.113239676458068e-06, - "loss": 0.6319, + "loss": 0.6039, "step": 5870 }, { "epoch": 22.010574712643677, - "grad_norm": 0.04092418774962425, + "grad_norm": 0.05858124420046806, "learning_rate": 6.1047254150702425e-06, - "loss": 0.0842, + "loss": 0.5847, "step": 5880 }, { "epoch": 22.01134099616858, - "grad_norm": 88.0138168334961, + "grad_norm": 15.957605361938477, "learning_rate": 6.096211153682418e-06, - "loss": 0.0189, + "loss": 0.0033, "step": 5890 }, { "epoch": 22.01210727969349, - "grad_norm": 17.259140014648438, + "grad_norm": 114.75286865234375, "learning_rate": 6.087696892294594e-06, - "loss": 0.0094, + "loss": 0.608, "step": 5900 }, { "epoch": 22.012873563218392, - "grad_norm": 0.007783534470945597, + "grad_norm": 0.008612045086920261, "learning_rate": 6.079182630906769e-06, - "loss": 1.0441, + "loss": 1.071, "step": 5910 }, { "epoch": 22.013639846743295, - "grad_norm": 1.7238948345184326, + "grad_norm": 0.18943527340888977, "learning_rate": 6.070668369518945e-06, - "loss": 0.5834, + "loss": 0.2384, "step": 5920 }, { "epoch": 22.0144061302682, - "grad_norm": 0.0490339919924736, + "grad_norm": 0.05647897347807884, "learning_rate": 6.06215410813112e-06, - "loss": 0.4113, + "loss": 0.2845, "step": 5930 }, { "epoch": 22.015172413793103, - "grad_norm": 3.374436140060425, + "grad_norm": 665.4360961914062, "learning_rate": 6.053639846743296e-06, - "loss": 0.6895, + "loss": 0.742, "step": 5940 }, { "epoch": 22.015938697318006, - "grad_norm": 0.8085106015205383, + "grad_norm": 0.05447319522500038, "learning_rate": 6.045125585355472e-06, - "loss": 1.3184, + "loss": 1.0404, "step": 5950 }, { "epoch": 22.016704980842913, - "grad_norm": 0.5501781105995178, + "grad_norm": 28.57737159729004, "learning_rate": 6.036611323967646e-06, - "loss": 0.5072, + "loss": 0.5534, "step": 5960 }, { "epoch": 22.017471264367817, - "grad_norm": 0.051192935556173325, + "grad_norm": 0.15087071061134338, "learning_rate": 6.028097062579822e-06, - "loss": 0.5498, + "loss": 0.0056, "step": 5970 }, { "epoch": 22.01823754789272, - "grad_norm": 235.07296752929688, + "grad_norm": 89.18669128417969, "learning_rate": 6.019582801191997e-06, - "loss": 0.8918, + "loss": 0.9772, "step": 5980 }, { "epoch": 22.019003831417624, - "grad_norm": 84.09073638916016, + "grad_norm": 77.0879898071289, "learning_rate": 6.0110685398041725e-06, - "loss": 0.516, + "loss": 0.6694, "step": 5990 }, { "epoch": 22.019770114942528, - "grad_norm": 0.003185416804626584, + "grad_norm": 0.00514709809795022, "learning_rate": 6.002554278416348e-06, - "loss": 0.2311, + "loss": 1.3712, "step": 6000 }, { "epoch": 22.02, - "eval_accuracy": 0.6222222222222222, - "eval_loss": 2.097580909729004, - "eval_runtime": 17.2843, - "eval_samples_per_second": 2.604, - "eval_steps_per_second": 2.604, + "eval_accuracy": 0.6, + "eval_loss": 2.1179556846618652, + "eval_runtime": 16.1891, + "eval_samples_per_second": 2.78, + "eval_steps_per_second": 2.78, "step": 6003 }, { "epoch": 23.000536398467432, - "grad_norm": 0.0059499796479940414, + "grad_norm": 0.0029412826988846064, "learning_rate": 5.9940400170285225e-06, - "loss": 1.2484, + "loss": 0.6107, "step": 6010 }, { "epoch": 23.001302681992335, - "grad_norm": 0.01954953558743, + "grad_norm": 0.047063011676073074, "learning_rate": 5.985525755640698e-06, - "loss": 1.9125, + "loss": 0.9895, "step": 6020 }, { "epoch": 23.002068965517243, - "grad_norm": 0.08238361030817032, + "grad_norm": 0.07753095775842667, "learning_rate": 5.977011494252874e-06, - "loss": 0.4876, + "loss": 0.0054, "step": 6030 }, { "epoch": 23.002835249042146, - "grad_norm": 0.007539921905845404, + "grad_norm": 0.003820971352979541, "learning_rate": 5.968497232865049e-06, - "loss": 0.0022, + "loss": 0.0013, "step": 6040 }, { "epoch": 23.00360153256705, - "grad_norm": 0.03979509323835373, + "grad_norm": 0.028840329498052597, "learning_rate": 5.959982971477225e-06, - "loss": 0.5155, + "loss": 0.6396, "step": 6050 }, { "epoch": 23.004367816091953, - "grad_norm": 0.0263423640280962, + "grad_norm": 0.03187266364693642, "learning_rate": 5.9514687100894e-06, - "loss": 0.0028, + "loss": 0.5003, "step": 6060 }, { "epoch": 23.005134099616857, - "grad_norm": 0.021406101062893867, + "grad_norm": 0.04131568595767021, "learning_rate": 5.942954448701576e-06, - "loss": 1.1732, + "loss": 0.8872, "step": 6070 }, { "epoch": 23.005900383141764, - "grad_norm": 0.14612627029418945, + "grad_norm": 0.07650325447320938, "learning_rate": 5.934440187313752e-06, - "loss": 0.0011, + "loss": 0.4981, "step": 6080 }, { "epoch": 23.006666666666668, - "grad_norm": 0.01855178363621235, + "grad_norm": 0.24987338483333588, "learning_rate": 5.925925925925926e-06, - "loss": 0.7075, + "loss": 0.0948, "step": 6090 }, { "epoch": 23.00743295019157, - "grad_norm": 466.0125732421875, + "grad_norm": 0.08773210644721985, "learning_rate": 5.917411664538102e-06, - "loss": 0.2443, + "loss": 0.6404, "step": 6100 }, { "epoch": 23.008199233716475, - "grad_norm": 0.0023519154638051987, + "grad_norm": 0.0035792042035609484, "learning_rate": 5.9088974031502775e-06, - "loss": 0.1582, + "loss": 0.5143, "step": 6110 }, { "epoch": 23.00896551724138, - "grad_norm": 0.07642991840839386, + "grad_norm": 0.10285013169050217, "learning_rate": 5.9003831417624525e-06, - "loss": 0.0025, + "loss": 0.0073, "step": 6120 }, { "epoch": 23.009731800766282, - "grad_norm": 28.117115020751953, + "grad_norm": 0.35800856351852417, "learning_rate": 5.891868880374628e-06, - "loss": 1.5041, + "loss": 0.3087, "step": 6130 }, { "epoch": 23.01049808429119, - "grad_norm": 0.3082031011581421, + "grad_norm": 38.52192687988281, "learning_rate": 5.883354618986803e-06, - "loss": 1.1624, + "loss": 0.6344, "step": 6140 }, { "epoch": 23.011264367816093, - "grad_norm": 71.23324584960938, + "grad_norm": 0.31374216079711914, "learning_rate": 5.874840357598979e-06, - "loss": 0.4192, + "loss": 1.8244, "step": 6150 }, { "epoch": 23.012030651340996, - "grad_norm": 0.006472417153418064, + "grad_norm": 0.00432211346924305, "learning_rate": 5.866326096211154e-06, - "loss": 2.7005, + "loss": 0.6412, "step": 6160 }, { "epoch": 23.0127969348659, - "grad_norm": 0.1314040571451187, + "grad_norm": 1.1956547498703003, "learning_rate": 5.857811834823329e-06, - "loss": 1.0362, + "loss": 0.9768, "step": 6170 }, { "epoch": 23.013563218390804, - "grad_norm": 0.0061240722425282, + "grad_norm": 0.0020398390479385853, "learning_rate": 5.849297573435505e-06, - "loss": 0.5233, + "loss": 0.3601, "step": 6180 }, { "epoch": 23.014329501915707, - "grad_norm": 0.771247923374176, + "grad_norm": 0.21921543776988983, "learning_rate": 5.84078331204768e-06, - "loss": 1.4372, + "loss": 2.3915, "step": 6190 }, { "epoch": 23.015095785440614, - "grad_norm": 0.025009727105498314, + "grad_norm": 0.02065208926796913, "learning_rate": 5.832269050659856e-06, - "loss": 1.127, + "loss": 0.7163, "step": 6200 }, { "epoch": 23.015862068965518, - "grad_norm": 0.030313368886709213, + "grad_norm": 0.016166262328624725, "learning_rate": 5.823754789272032e-06, - "loss": 0.7825, + "loss": 0.3425, "step": 6210 }, { "epoch": 23.01662835249042, - "grad_norm": 0.011802282184362411, + "grad_norm": 0.007143170572817326, "learning_rate": 5.815240527884206e-06, - "loss": 1.1568, + "loss": 1.0736, "step": 6220 }, { "epoch": 23.017394636015325, - "grad_norm": 0.029546862468123436, + "grad_norm": 0.04260401055216789, "learning_rate": 5.806726266496382e-06, - "loss": 0.019, + "loss": 0.1575, "step": 6230 }, { "epoch": 23.01816091954023, - "grad_norm": 0.019168170168995857, + "grad_norm": 5.848123073577881, "learning_rate": 5.7982120051085575e-06, - "loss": 0.1813, + "loss": 0.7326, "step": 6240 }, { "epoch": 23.018927203065132, - "grad_norm": 0.17828933894634247, + "grad_norm": 0.10552279651165009, "learning_rate": 5.7896977437207325e-06, - "loss": 0.0161, + "loss": 0.599, "step": 6250 }, { "epoch": 23.01969348659004, - "grad_norm": 0.0032634164672344923, + "grad_norm": 0.0039580753073096275, "learning_rate": 5.781183482332908e-06, - "loss": 0.4825, + "loss": 0.526, "step": 6260 }, { "epoch": 23.02, "eval_accuracy": 0.6, - "eval_loss": 2.1246232986450195, - "eval_runtime": 17.3934, - "eval_samples_per_second": 2.587, - "eval_steps_per_second": 2.587, + "eval_loss": 1.9783341884613037, + "eval_runtime": 16.2696, + "eval_samples_per_second": 2.766, + "eval_steps_per_second": 2.766, "step": 6264 }, { "epoch": 24.000459770114944, - "grad_norm": 218.01425170898438, + "grad_norm": 62.192108154296875, "learning_rate": 5.772669220945083e-06, - "loss": 2.3648, + "loss": 1.7186, "step": 6270 }, { "epoch": 24.001226053639847, - "grad_norm": 0.18992196023464203, + "grad_norm": 0.14505280554294586, "learning_rate": 5.764154959557259e-06, - "loss": 0.0011, + "loss": 0.0009, "step": 6280 }, { "epoch": 24.00199233716475, - "grad_norm": 0.02763967029750347, + "grad_norm": 0.10368850827217102, "learning_rate": 5.755640698169435e-06, - "loss": 0.4559, + "loss": 0.0011, "step": 6290 }, { "epoch": 24.002758620689654, - "grad_norm": 0.006729465909302235, + "grad_norm": 0.005278023425489664, "learning_rate": 5.747126436781609e-06, - "loss": 0.0026, + "loss": 0.0023, "step": 6300 }, { "epoch": 24.003524904214558, - "grad_norm": 0.03497445583343506, + "grad_norm": 0.006146503146737814, "learning_rate": 5.738612175393785e-06, - "loss": 0.0009, + "loss": 0.0005, "step": 6310 }, { "epoch": 24.004291187739465, - "grad_norm": 0.01695236936211586, + "grad_norm": 0.012583620846271515, "learning_rate": 5.730097914005961e-06, - "loss": 0.0129, + "loss": 0.2121, "step": 6320 }, { "epoch": 24.00505747126437, - "grad_norm": 392.5536193847656, + "grad_norm": 2.30965256690979, "learning_rate": 5.721583652618136e-06, - "loss": 1.5905, + "loss": 1.7758, "step": 6330 }, { "epoch": 24.005823754789272, - "grad_norm": 0.008342803455889225, + "grad_norm": 0.014124227687716484, "learning_rate": 5.713069391230312e-06, - "loss": 0.3713, + "loss": 0.8655, "step": 6340 }, { "epoch": 24.006590038314176, - "grad_norm": 134.41932678222656, + "grad_norm": 2.7731776237487793, "learning_rate": 5.704555129842486e-06, - "loss": 0.6012, + "loss": 0.2896, "step": 6350 }, { "epoch": 24.00735632183908, - "grad_norm": 0.00639798678457737, + "grad_norm": 0.008108450099825859, "learning_rate": 5.696040868454662e-06, - "loss": 0.1175, + "loss": 0.7411, "step": 6360 }, { "epoch": 24.008122605363983, - "grad_norm": 0.1409643143415451, + "grad_norm": 274.1982116699219, "learning_rate": 5.6875266070668375e-06, - "loss": 0.4002, + "loss": 0.3413, "step": 6370 }, { "epoch": 24.00888888888889, - "grad_norm": 0.007414141204208136, + "grad_norm": 0.021320249885320663, "learning_rate": 5.6790123456790125e-06, - "loss": 0.6767, + "loss": 1.7213, "step": 6380 }, { "epoch": 24.009655172413794, - "grad_norm": 0.5357187390327454, + "grad_norm": 88.96774291992188, "learning_rate": 5.670498084291188e-06, - "loss": 0.3171, + "loss": 0.6111, "step": 6390 }, { "epoch": 24.010421455938697, - "grad_norm": 0.010539868846535683, + "grad_norm": 0.022434860467910767, "learning_rate": 5.661983822903364e-06, - "loss": 0.4161, + "loss": 0.5368, "step": 6400 }, { "epoch": 24.0111877394636, - "grad_norm": 0.00193565443623811, + "grad_norm": 0.002878761850297451, "learning_rate": 5.653469561515539e-06, - "loss": 0.5425, + "loss": 0.0598, "step": 6410 }, { "epoch": 24.011954022988505, - "grad_norm": 0.06604880839586258, + "grad_norm": 0.053979888558387756, "learning_rate": 5.644955300127715e-06, - "loss": 1.3295, + "loss": 1.1493, "step": 6420 }, { "epoch": 24.01272030651341, - "grad_norm": 0.006177594419568777, + "grad_norm": 0.007288148161023855, "learning_rate": 5.636441038739889e-06, - "loss": 0.7922, + "loss": 0.3123, "step": 6430 }, { "epoch": 24.013486590038315, - "grad_norm": 0.008410290814936161, + "grad_norm": 0.00937149953097105, "learning_rate": 5.627926777352065e-06, - "loss": 0.8108, + "loss": 0.4229, "step": 6440 }, { "epoch": 24.01425287356322, - "grad_norm": 0.052248265594244, + "grad_norm": 0.03848826512694359, "learning_rate": 5.619412515964241e-06, - "loss": 0.0011, + "loss": 0.0022, "step": 6450 }, { "epoch": 24.015019157088123, - "grad_norm": 0.0551765114068985, + "grad_norm": 0.03932293504476547, "learning_rate": 5.610898254576416e-06, - "loss": 0.0026, + "loss": 0.2756, "step": 6460 }, { "epoch": 24.015785440613026, - "grad_norm": 0.0039450800977647305, + "grad_norm": 0.0027885441668331623, "learning_rate": 5.602383993188592e-06, - "loss": 1.6733, + "loss": 1.9502, "step": 6470 }, { "epoch": 24.01655172413793, - "grad_norm": 0.02867334894835949, + "grad_norm": 0.009457943961024284, "learning_rate": 5.593869731800766e-06, - "loss": 0.4468, + "loss": 1.4441, "step": 6480 }, { "epoch": 24.017318007662837, - "grad_norm": 0.0025795865803956985, + "grad_norm": 0.0019108811393380165, "learning_rate": 5.585355470412942e-06, - "loss": 0.862, + "loss": 0.1263, "step": 6490 }, { "epoch": 24.01808429118774, - "grad_norm": 0.4221494495868683, + "grad_norm": 28.00144386291504, "learning_rate": 5.5768412090251175e-06, - "loss": 1.0196, + "loss": 1.7056, "step": 6500 }, { "epoch": 24.018850574712644, - "grad_norm": 0.002689726185053587, + "grad_norm": 0.0028895202558487654, "learning_rate": 5.5683269476372925e-06, - "loss": 0.0033, + "loss": 0.5559, "step": 6510 }, { "epoch": 24.019616858237548, - "grad_norm": 0.1802007555961609, + "grad_norm": 0.05323445051908493, "learning_rate": 5.559812686249468e-06, - "loss": 0.4687, + "loss": 1.4996, "step": 6520 }, { "epoch": 24.02, - "eval_accuracy": 0.6, - "eval_loss": 1.7901363372802734, - "eval_runtime": 17.5465, - "eval_samples_per_second": 2.565, - "eval_steps_per_second": 2.565, + "eval_accuracy": 0.6222222222222222, + "eval_loss": 1.8766776323318481, + "eval_runtime": 17.0625, + "eval_samples_per_second": 2.637, + "eval_steps_per_second": 2.637, "step": 6525 }, { "epoch": 25.000383141762452, - "grad_norm": 0.05337819829583168, + "grad_norm": 1.1415150165557861, "learning_rate": 5.551298424861644e-06, - "loss": 0.6795, + "loss": 0.8528, "step": 6530 }, { "epoch": 25.001149425287355, - "grad_norm": 0.0022355131804943085, + "grad_norm": 0.002239577006548643, "learning_rate": 5.542784163473819e-06, - "loss": 0.3671, + "loss": 0.3653, "step": 6540 }, { "epoch": 25.00191570881226, - "grad_norm": 0.3338928818702698, + "grad_norm": 352.9330139160156, "learning_rate": 5.534269902085995e-06, - "loss": 0.4908, + "loss": 0.5165, "step": 6550 }, { "epoch": 25.002681992337166, - "grad_norm": 0.035004112869501114, + "grad_norm": 0.019446222111582756, "learning_rate": 5.525755640698169e-06, - "loss": 0.3051, + "loss": 0.5935, "step": 6560 }, { "epoch": 25.00344827586207, - "grad_norm": 82.73577117919922, + "grad_norm": 1.4556376934051514, "learning_rate": 5.517241379310345e-06, - "loss": 1.3344, + "loss": 0.564, "step": 6570 }, { "epoch": 25.004214559386973, - "grad_norm": 0.3661065697669983, + "grad_norm": 205.486083984375, "learning_rate": 5.508727117922521e-06, - "loss": 0.7376, + "loss": 1.3593, "step": 6580 }, { "epoch": 25.004980842911877, - "grad_norm": 0.002585015958175063, + "grad_norm": 0.003079820889979601, "learning_rate": 5.500212856534696e-06, - "loss": 0.539, + "loss": 0.856, "step": 6590 }, { "epoch": 25.00574712643678, - "grad_norm": 69.54122161865234, + "grad_norm": 2.473658561706543, "learning_rate": 5.491698595146872e-06, - "loss": 0.5692, + "loss": 0.0082, "step": 6600 }, { "epoch": 25.006513409961684, - "grad_norm": 0.016643185168504715, + "grad_norm": 0.41060397028923035, "learning_rate": 5.4831843337590475e-06, - "loss": 0.5762, + "loss": 0.4473, "step": 6610 }, { "epoch": 25.00727969348659, - "grad_norm": 0.010798582807183266, + "grad_norm": 0.1301625669002533, "learning_rate": 5.474670072371222e-06, - "loss": 0.5421, + "loss": 0.166, "step": 6620 }, { "epoch": 25.008045977011495, - "grad_norm": 0.011680358089506626, + "grad_norm": 0.0048989043571054935, "learning_rate": 5.4661558109833975e-06, - "loss": 0.0006, + "loss": 0.0743, "step": 6630 }, { "epoch": 25.0088122605364, - "grad_norm": 0.039196815341711044, + "grad_norm": 0.33093783259391785, "learning_rate": 5.4576415495955725e-06, - "loss": 0.0521, + "loss": 0.9288, "step": 6640 }, { "epoch": 25.009578544061302, - "grad_norm": 0.003652225947007537, + "grad_norm": 0.0017813568701967597, "learning_rate": 5.449127288207748e-06, - "loss": 0.5632, + "loss": 0.8427, "step": 6650 }, { "epoch": 25.010344827586206, - "grad_norm": 0.10199687629938126, + "grad_norm": 0.13909049332141876, "learning_rate": 5.440613026819924e-06, - "loss": 0.4116, + "loss": 0.4852, "step": 6660 }, { "epoch": 25.011111111111113, - "grad_norm": 0.004501442890614271, + "grad_norm": 0.0022367651108652353, "learning_rate": 5.432098765432099e-06, - "loss": 1.6943, + "loss": 1.0386, "step": 6670 }, { "epoch": 25.011877394636016, - "grad_norm": 112.17243194580078, + "grad_norm": 377.25030517578125, "learning_rate": 5.423584504044275e-06, - "loss": 0.5224, + "loss": 0.342, "step": 6680 }, { "epoch": 25.01264367816092, - "grad_norm": 0.12843827903270721, + "grad_norm": 1.1145286560058594, "learning_rate": 5.415070242656451e-06, - "loss": 0.3836, + "loss": 0.3452, "step": 6690 }, { "epoch": 25.013409961685824, - "grad_norm": 22.080371856689453, + "grad_norm": 0.3452111780643463, "learning_rate": 5.406555981268625e-06, - "loss": 0.4742, + "loss": 0.2108, "step": 6700 }, { "epoch": 25.014176245210727, - "grad_norm": 0.007842904888093472, + "grad_norm": 0.010614615865051746, "learning_rate": 5.398041719880801e-06, - "loss": 0.6473, + "loss": 0.143, "step": 6710 }, { "epoch": 25.01494252873563, - "grad_norm": 0.23944434523582458, + "grad_norm": 385.9859619140625, "learning_rate": 5.389527458492976e-06, - "loss": 0.5198, + "loss": 1.1749, "step": 6720 }, { "epoch": 25.015708812260538, - "grad_norm": 0.17299960553646088, + "grad_norm": 0.02967614121735096, "learning_rate": 5.381013197105152e-06, - "loss": 0.0071, + "loss": 0.0006, "step": 6730 }, { "epoch": 25.01647509578544, - "grad_norm": 248.34991455078125, + "grad_norm": 0.01778342016041279, "learning_rate": 5.3724989357173275e-06, - "loss": 1.2463, + "loss": 0.4638, "step": 6740 }, { "epoch": 25.017241379310345, - "grad_norm": 0.1431325227022171, + "grad_norm": 0.004216350615024567, "learning_rate": 5.3639846743295025e-06, - "loss": 0.4398, + "loss": 0.0809, "step": 6750 }, { "epoch": 25.01800766283525, - "grad_norm": 0.004623887594789267, + "grad_norm": 0.0016005209181457758, "learning_rate": 5.3554704129416775e-06, - "loss": 0.4023, + "loss": 0.307, "step": 6760 }, { "epoch": 25.018773946360152, - "grad_norm": 0.009998872876167297, + "grad_norm": 0.10373406112194061, "learning_rate": 5.3469561515538525e-06, - "loss": 1.3267, + "loss": 1.4049, "step": 6770 }, { "epoch": 25.019540229885056, - "grad_norm": 0.7139018177986145, + "grad_norm": 0.010953652672469616, "learning_rate": 5.338441890166028e-06, - "loss": 0.0006, + "loss": 0.0002, "step": 6780 }, { "epoch": 25.02, - "eval_accuracy": 0.6, - "eval_loss": 2.4826600551605225, - "eval_runtime": 17.4343, - "eval_samples_per_second": 2.581, - "eval_steps_per_second": 2.581, + "eval_accuracy": 0.6222222222222222, + "eval_loss": 1.8230860233306885, + "eval_runtime": 17.1176, + "eval_samples_per_second": 2.629, + "eval_steps_per_second": 2.629, "step": 6786 }, { "epoch": 26.00030651340996, - "grad_norm": 0.008208601735532284, + "grad_norm": 0.02123504877090454, "learning_rate": 5.329927628778204e-06, - "loss": 0.0005, + "loss": 0.0033, "step": 6790 }, { "epoch": 26.001072796934867, - "grad_norm": 0.02884920872747898, + "grad_norm": 0.0029484149999916553, "learning_rate": 5.321413367390379e-06, - "loss": 0.9612, + "loss": 0.0007, "step": 6800 }, { "epoch": 26.00183908045977, - "grad_norm": 265.7590637207031, + "grad_norm": 234.69747924804688, "learning_rate": 5.312899106002555e-06, - "loss": 0.5891, + "loss": 0.3853, "step": 6810 }, { "epoch": 26.002605363984674, - "grad_norm": 0.007475483231246471, + "grad_norm": 0.011126531288027763, "learning_rate": 5.304384844614731e-06, - "loss": 1.9608, + "loss": 1.0729, "step": 6820 }, { "epoch": 26.003371647509578, - "grad_norm": 0.06961024552583694, + "grad_norm": 0.025200342759490013, "learning_rate": 5.295870583226905e-06, - "loss": 0.5698, + "loss": 0.6833, "step": 6830 }, { "epoch": 26.00413793103448, - "grad_norm": 0.007300609722733498, + "grad_norm": 0.0033019825350493193, "learning_rate": 5.287356321839081e-06, - "loss": 1.4155, + "loss": 0.4741, "step": 6840 }, { "epoch": 26.00490421455939, - "grad_norm": 0.018364563584327698, + "grad_norm": 0.032386571168899536, "learning_rate": 5.278842060451256e-06, - "loss": 0.0028, + "loss": 0.4217, "step": 6850 }, { "epoch": 26.005670498084292, - "grad_norm": 1.5287927389144897, + "grad_norm": 0.0679851844906807, "learning_rate": 5.270327799063432e-06, - "loss": 0.0235, + "loss": 0.6879, "step": 6860 }, { "epoch": 26.006436781609196, - "grad_norm": 0.0141092324629426, + "grad_norm": 0.011571587063372135, "learning_rate": 5.2618135376756075e-06, - "loss": 0.4761, + "loss": 0.0006, "step": 6870 }, { "epoch": 26.0072030651341, - "grad_norm": 0.008648570626974106, + "grad_norm": 0.003177380422130227, "learning_rate": 5.2532992762877825e-06, - "loss": 1.1725, + "loss": 0.8043, "step": 6880 }, { "epoch": 26.007969348659003, - "grad_norm": 0.13225506246089935, + "grad_norm": 0.005701896734535694, "learning_rate": 5.244785014899958e-06, - "loss": 0.5987, + "loss": 0.6627, "step": 6890 }, { "epoch": 26.008735632183907, - "grad_norm": 0.003979440778493881, + "grad_norm": 0.001979551510885358, "learning_rate": 5.236270753512134e-06, - "loss": 1.8085, + "loss": 1.2016, "step": 6900 }, { "epoch": 26.009501915708814, - "grad_norm": 0.2841176390647888, + "grad_norm": 955.865966796875, "learning_rate": 5.227756492124308e-06, - "loss": 0.0096, + "loss": 0.686, "step": 6910 }, { "epoch": 26.010268199233717, - "grad_norm": 0.8024605512619019, + "grad_norm": 0.5485445261001587, "learning_rate": 5.219242230736484e-06, - "loss": 1.9584, + "loss": 1.1697, "step": 6920 }, { "epoch": 26.01103448275862, - "grad_norm": 10.240833282470703, + "grad_norm": 0.0732756108045578, "learning_rate": 5.210727969348659e-06, - "loss": 0.0045, + "loss": 0.0005, "step": 6930 }, { "epoch": 26.011800766283525, - "grad_norm": 0.04226527735590935, + "grad_norm": 0.15521180629730225, "learning_rate": 5.202213707960835e-06, - "loss": 0.3538, + "loss": 0.0005, "step": 6940 }, { "epoch": 26.01256704980843, - "grad_norm": 0.0034154646564275026, + "grad_norm": 0.0039952476508915424, "learning_rate": 5.193699446573011e-06, - "loss": 0.4084, + "loss": 0.3174, "step": 6950 }, { "epoch": 26.013333333333332, - "grad_norm": 0.2548293173313141, + "grad_norm": 0.2420734316110611, "learning_rate": 5.185185185185185e-06, - "loss": 0.217, + "loss": 0.2755, "step": 6960 }, { "epoch": 26.01409961685824, - "grad_norm": 0.002543597249314189, + "grad_norm": 0.0013983905082568526, "learning_rate": 5.176670923797361e-06, - "loss": 0.0069, + "loss": 0.0006, "step": 6970 }, { "epoch": 26.014865900383143, - "grad_norm": 0.0063563138246536255, + "grad_norm": 0.004940020851790905, "learning_rate": 5.168156662409536e-06, - "loss": 0.118, + "loss": 0.5322, "step": 6980 }, { "epoch": 26.015632183908046, - "grad_norm": 0.006622265093028545, + "grad_norm": 0.002700202167034149, "learning_rate": 5.159642401021712e-06, - "loss": 0.3776, + "loss": 0.0004, "step": 6990 }, { "epoch": 26.01639846743295, - "grad_norm": 0.17534077167510986, + "grad_norm": 0.03120334818959236, "learning_rate": 5.1511281396338875e-06, - "loss": 0.9697, + "loss": 0.1878, "step": 7000 }, { "epoch": 26.017164750957853, - "grad_norm": 0.001660225447267294, + "grad_norm": 0.00177517079282552, "learning_rate": 5.1426138782460625e-06, - "loss": 0.6493, + "loss": 0.7163, "step": 7010 }, { "epoch": 26.017931034482757, - "grad_norm": 0.00405055470764637, + "grad_norm": 0.003483051899820566, "learning_rate": 5.134099616858238e-06, - "loss": 0.4476, + "loss": 0.6969, "step": 7020 }, { "epoch": 26.018697318007664, - "grad_norm": 0.0020823508966714144, + "grad_norm": 0.0023457412607967854, "learning_rate": 5.125585355470414e-06, - "loss": 0.3123, + "loss": 0.5585, "step": 7030 }, { "epoch": 26.019463601532568, - "grad_norm": 0.6022665500640869, + "grad_norm": 0.007092323154211044, "learning_rate": 5.117071094082588e-06, - "loss": 0.0052, + "loss": 0.0062, "step": 7040 }, { "epoch": 26.02, - "eval_accuracy": 0.6444444444444445, - "eval_loss": 1.9289144277572632, - "eval_runtime": 16.7813, - "eval_samples_per_second": 2.682, - "eval_steps_per_second": 2.682, + "eval_accuracy": 0.6222222222222222, + "eval_loss": 1.8086061477661133, + "eval_runtime": 16.4864, + "eval_samples_per_second": 2.73, + "eval_steps_per_second": 2.73, "step": 7047 }, { "epoch": 27.000229885057472, - "grad_norm": 0.00214813812635839, + "grad_norm": 0.0012838696129620075, "learning_rate": 5.108556832694764e-06, - "loss": 0.0457, + "loss": 0.6535, "step": 7050 }, { "epoch": 27.000996168582375, - "grad_norm": 0.0017765769734978676, + "grad_norm": 0.002292887307703495, "learning_rate": 5.100042571306939e-06, - "loss": 0.4251, + "loss": 0.3, "step": 7060 }, { "epoch": 27.00176245210728, - "grad_norm": 0.016686828806996346, + "grad_norm": 0.011790307238698006, "learning_rate": 5.091528309919115e-06, - "loss": 0.001, + "loss": 0.0006, "step": 7070 }, { "epoch": 27.002528735632183, - "grad_norm": 0.061154428869485855, + "grad_norm": 0.0721253827214241, "learning_rate": 5.083014048531291e-06, - "loss": 0.2773, + "loss": 0.0051, "step": 7080 }, { "epoch": 27.00329501915709, - "grad_norm": 0.0013891103444620967, + "grad_norm": 0.0010579659137874842, "learning_rate": 5.074499787143465e-06, - "loss": 0.0008, + "loss": 0.5285, "step": 7090 }, { "epoch": 27.004061302681993, - "grad_norm": 0.01212016399949789, + "grad_norm": 0.005082032643258572, "learning_rate": 5.065985525755641e-06, - "loss": 0.0009, + "loss": 0.0006, "step": 7100 }, { "epoch": 27.004827586206897, - "grad_norm": 0.00479173706844449, + "grad_norm": 0.17441536486148834, "learning_rate": 5.057471264367817e-06, - "loss": 0.666, + "loss": 0.0007, "step": 7110 }, { "epoch": 27.0055938697318, - "grad_norm": 0.004911317024379969, + "grad_norm": 0.01261579804122448, "learning_rate": 5.048957002979992e-06, - "loss": 0.1085, + "loss": 0.0102, "step": 7120 }, { "epoch": 27.006360153256704, - "grad_norm": 0.0023150211200118065, + "grad_norm": 0.0015620223712176085, "learning_rate": 5.0404427415921675e-06, - "loss": 0.009, + "loss": 0.0027, "step": 7130 }, { "epoch": 27.007126436781608, - "grad_norm": 0.0023754690773785114, + "grad_norm": 0.0020450339652597904, "learning_rate": 5.0319284802043425e-06, - "loss": 0.8262, + "loss": 0.4822, "step": 7140 }, { "epoch": 27.007892720306515, - "grad_norm": 0.02463066764175892, + "grad_norm": 0.034193504601716995, "learning_rate": 5.023414218816518e-06, - "loss": 1.2092, + "loss": 0.4366, "step": 7150 }, { "epoch": 27.00865900383142, - "grad_norm": 0.002340584062039852, + "grad_norm": 0.00120554119348526, "learning_rate": 5.014899957428694e-06, - "loss": 1.1459, + "loss": 0.615, "step": 7160 }, { "epoch": 27.009425287356322, - "grad_norm": 0.0069943745620548725, + "grad_norm": 0.0034140916541218758, "learning_rate": 5.006385696040868e-06, - "loss": 0.0005, + "loss": 0.0002, "step": 7170 }, { "epoch": 27.010191570881226, - "grad_norm": 37.36365509033203, + "grad_norm": 76.81916046142578, "learning_rate": 4.997871434653044e-06, - "loss": 0.6511, + "loss": 1.1749, "step": 7180 }, { "epoch": 27.01095785440613, - "grad_norm": 0.024715274572372437, + "grad_norm": 0.012213127687573433, "learning_rate": 4.98935717326522e-06, - "loss": 0.0067, + "loss": 0.3825, "step": 7190 }, { "epoch": 27.011724137931033, - "grad_norm": 0.0027485694736242294, + "grad_norm": 0.0028636474162340164, "learning_rate": 4.980842911877395e-06, - "loss": 0.5901, + "loss": 0.3608, "step": 7200 }, { "epoch": 27.01249042145594, - "grad_norm": 59.89057159423828, + "grad_norm": 313.3858947753906, "learning_rate": 4.972328650489571e-06, - "loss": 1.6122, + "loss": 1.6128, "step": 7210 }, { "epoch": 27.013256704980844, - "grad_norm": 0.07165471464395523, + "grad_norm": 0.26451390981674194, "learning_rate": 4.963814389101746e-06, - "loss": 0.5769, + "loss": 0.0004, "step": 7220 }, { "epoch": 27.014022988505747, - "grad_norm": 0.004633300006389618, + "grad_norm": 0.003248663619160652, "learning_rate": 4.955300127713921e-06, - "loss": 0.8309, + "loss": 0.0028, "step": 7230 }, { "epoch": 27.01478927203065, - "grad_norm": 467.2062683105469, + "grad_norm": 351.29296875, "learning_rate": 4.946785866326097e-06, - "loss": 1.0369, + "loss": 1.516, "step": 7240 }, { "epoch": 27.015555555555554, - "grad_norm": 0.0024944415781646967, + "grad_norm": 0.002200148534029722, "learning_rate": 4.938271604938272e-06, - "loss": 0.001, + "loss": 0.0006, "step": 7250 }, { "epoch": 27.016321839080458, - "grad_norm": 0.01116662472486496, + "grad_norm": 0.003352625295519829, "learning_rate": 4.9297573435504475e-06, - "loss": 0.9212, + "loss": 1.1179, "step": 7260 }, { "epoch": 27.017088122605365, - "grad_norm": 0.008394837379455566, + "grad_norm": 0.002079174155369401, "learning_rate": 4.9212430821626225e-06, - "loss": 0.0573, + "loss": 0.1111, "step": 7270 }, { "epoch": 27.01785440613027, - "grad_norm": 0.005024654325097799, + "grad_norm": 0.023824848234653473, "learning_rate": 4.912728820774798e-06, - "loss": 0.2413, + "loss": 0.0007, "step": 7280 }, { "epoch": 27.018620689655172, - "grad_norm": 0.0030793044716119766, + "grad_norm": 0.0021569812670350075, "learning_rate": 4.904214559386973e-06, - "loss": 0.0686, + "loss": 0.5168, "step": 7290 }, { "epoch": 27.019386973180076, - "grad_norm": 0.10558171570301056, + "grad_norm": 0.03218558803200722, "learning_rate": 4.895700297999149e-06, - "loss": 0.0019, + "loss": 0.7647, "step": 7300 }, { "epoch": 27.02, - "eval_accuracy": 0.6444444444444445, - "eval_loss": 2.4350714683532715, - "eval_runtime": 17.4232, - "eval_samples_per_second": 2.583, - "eval_steps_per_second": 2.583, + "eval_accuracy": 0.6222222222222222, + "eval_loss": 1.8100167512893677, + "eval_runtime": 16.2961, + "eval_samples_per_second": 2.761, + "eval_steps_per_second": 2.761, "step": 7308 }, { "epoch": 28.00015325670498, - "grad_norm": 0.001691475510597229, + "grad_norm": 0.0013634850038215518, "learning_rate": 4.887186036611324e-06, - "loss": 0.9948, + "loss": 0.6324, "step": 7310 }, { "epoch": 28.000919540229884, - "grad_norm": 0.0037469547241926193, + "grad_norm": 0.0016124506946653128, "learning_rate": 4.8786717752235e-06, - "loss": 0.7077, + "loss": 0.3921, "step": 7320 }, { "epoch": 28.00168582375479, - "grad_norm": 0.008109981194138527, + "grad_norm": 0.004137418698519468, "learning_rate": 4.870157513835675e-06, - "loss": 0.3536, + "loss": 0.4809, "step": 7330 }, { "epoch": 28.002452107279694, - "grad_norm": 0.001695164479315281, + "grad_norm": 0.0009404192096553743, "learning_rate": 4.861643252447851e-06, - "loss": 0.4912, + "loss": 0.6028, "step": 7340 }, { "epoch": 28.003218390804598, - "grad_norm": 74.68633270263672, + "grad_norm": 114.4256820678711, "learning_rate": 4.853128991060026e-06, - "loss": 0.4964, + "loss": 0.024, "step": 7350 }, { "epoch": 28.0039846743295, - "grad_norm": 0.02763885259628296, + "grad_norm": 183.51992797851562, "learning_rate": 4.844614729672202e-06, - "loss": 0.6268, + "loss": 1.6159, "step": 7360 }, { "epoch": 28.004750957854405, - "grad_norm": 0.0012636043829843402, + "grad_norm": 0.002991782035678625, "learning_rate": 4.836100468284377e-06, - "loss": 0.6321, + "loss": 0.6655, "step": 7370 }, { "epoch": 28.00551724137931, - "grad_norm": 0.7330262660980225, + "grad_norm": 0.2887578308582306, "learning_rate": 4.8275862068965525e-06, - "loss": 0.6382, + "loss": 0.6675, "step": 7380 }, { "epoch": 28.006283524904216, - "grad_norm": 0.0024992988910526037, + "grad_norm": 0.0012824555160477757, "learning_rate": 4.8190719455087275e-06, - "loss": 1.1927, + "loss": 1.0204, "step": 7390 }, { "epoch": 28.00704980842912, - "grad_norm": 0.003963688388466835, + "grad_norm": 0.00152094557415694, "learning_rate": 4.8105576841209025e-06, - "loss": 0.4827, + "loss": 0.0002, "step": 7400 }, { "epoch": 28.007816091954023, - "grad_norm": 0.0027215732261538506, + "grad_norm": 0.0010664139408618212, "learning_rate": 4.802043422733078e-06, - "loss": 0.9836, + "loss": 0.0956, "step": 7410 }, { "epoch": 28.008582375478927, - "grad_norm": 23.04977798461914, + "grad_norm": 978.6150512695312, "learning_rate": 4.793529161345254e-06, - "loss": 0.0552, + "loss": 1.4493, "step": 7420 }, { "epoch": 28.00934865900383, - "grad_norm": 271.8680725097656, + "grad_norm": 1.1554425954818726, "learning_rate": 4.785014899957429e-06, - "loss": 0.689, + "loss": 0.2764, "step": 7430 }, { "epoch": 28.010114942528734, - "grad_norm": 0.032220322638750076, + "grad_norm": 0.016275620087981224, "learning_rate": 4.776500638569604e-06, - "loss": 0.2867, + "loss": 0.3619, "step": 7440 }, { "epoch": 28.01088122605364, - "grad_norm": 0.038638029247522354, + "grad_norm": 0.03950939700007439, "learning_rate": 4.76798637718178e-06, - "loss": 2.0507, + "loss": 1.8931, "step": 7450 }, { "epoch": 28.011647509578545, - "grad_norm": 0.00666879303753376, + "grad_norm": 0.008054444566369057, "learning_rate": 4.759472115793956e-06, - "loss": 0.8896, + "loss": 0.232, "step": 7460 }, { "epoch": 28.01241379310345, - "grad_norm": 0.004105005878955126, + "grad_norm": 0.0020887332502752542, "learning_rate": 4.750957854406131e-06, - "loss": 0.2395, + "loss": 0.28, "step": 7470 }, { "epoch": 28.013180076628352, - "grad_norm": 0.001758299651555717, + "grad_norm": 0.002126912819221616, "learning_rate": 4.742443593018306e-06, - "loss": 0.0238, + "loss": 0.001, "step": 7480 }, { "epoch": 28.013946360153255, - "grad_norm": 0.08146839588880539, + "grad_norm": 0.028837980702519417, "learning_rate": 4.733929331630482e-06, - "loss": 0.3282, + "loss": 0.3433, "step": 7490 }, { "epoch": 28.014712643678163, - "grad_norm": 0.007566448301076889, + "grad_norm": 0.0597766675055027, "learning_rate": 4.7254150702426575e-06, - "loss": 0.4946, + "loss": 0.0015, "step": 7500 }, { "epoch": 28.015478927203066, - "grad_norm": 0.08981310576200485, + "grad_norm": 0.00515392143279314, "learning_rate": 4.7169008088548325e-06, - "loss": 0.6864, + "loss": 1.1767, "step": 7510 }, { "epoch": 28.01624521072797, - "grad_norm": 0.009543518535792828, + "grad_norm": 0.005934328306466341, "learning_rate": 4.7083865474670075e-06, - "loss": 0.001, + "loss": 0.003, "step": 7520 }, { "epoch": 28.017011494252873, - "grad_norm": 42.25970458984375, + "grad_norm": 50.15550994873047, "learning_rate": 4.6998722860791825e-06, - "loss": 1.1194, + "loss": 0.752, "step": 7530 }, { "epoch": 28.017777777777777, - "grad_norm": 0.8251566886901855, + "grad_norm": 0.04433823376893997, "learning_rate": 4.691358024691358e-06, - "loss": 0.5021, + "loss": 0.5981, "step": 7540 }, { "epoch": 28.01854406130268, - "grad_norm": 0.014469806104898453, + "grad_norm": 0.004963343497365713, "learning_rate": 4.682843763303534e-06, - "loss": 0.0087, + "loss": 0.4705, "step": 7550 }, { "epoch": 28.019310344827588, - "grad_norm": 475.01300048828125, + "grad_norm": 0.01040688157081604, "learning_rate": 4.674329501915709e-06, - "loss": 0.5917, + "loss": 0.0002, "step": 7560 }, { "epoch": 28.02, - "eval_accuracy": 0.6222222222222222, - "eval_loss": 2.1613752841949463, - "eval_runtime": 17.3885, - "eval_samples_per_second": 2.588, - "eval_steps_per_second": 2.588, + "eval_accuracy": 0.6444444444444445, + "eval_loss": 2.3122503757476807, + "eval_runtime": 17.029, + "eval_samples_per_second": 2.643, + "eval_steps_per_second": 2.643, "step": 7569 }, { "epoch": 29.000076628352492, - "grad_norm": 0.002209088299423456, + "grad_norm": 0.002148628933355212, "learning_rate": 4.665815240527884e-06, - "loss": 0.6208, + "loss": 0.762, "step": 7570 }, { "epoch": 29.000842911877395, - "grad_norm": 0.08406301587820053, + "grad_norm": 0.08913122862577438, "learning_rate": 4.65730097914006e-06, - "loss": 0.8105, + "loss": 0.1866, "step": 7580 }, { "epoch": 29.0016091954023, - "grad_norm": 0.0028736847452819347, + "grad_norm": 0.0012043421156704426, "learning_rate": 4.648786717752236e-06, - "loss": 0.2003, + "loss": 0.0008, "step": 7590 }, { "epoch": 29.002375478927203, - "grad_norm": 0.0018835192313417792, + "grad_norm": 0.000842480338178575, "learning_rate": 4.640272456364411e-06, - "loss": 0.0007, + "loss": 0.0006, "step": 7600 }, { "epoch": 29.003141762452106, - "grad_norm": 122.16399383544922, + "grad_norm": 117.10675048828125, "learning_rate": 4.631758194976586e-06, - "loss": 1.0619, + "loss": 0.0222, "step": 7610 }, { "epoch": 29.00390804597701, - "grad_norm": 0.25623956322669983, + "grad_norm": 0.44371485710144043, "learning_rate": 4.623243933588762e-06, - "loss": 1.083, + "loss": 0.3829, "step": 7620 }, { "epoch": 29.004674329501917, - "grad_norm": 0.25189438462257385, + "grad_norm": 0.0215621255338192, "learning_rate": 4.6147296722009375e-06, - "loss": 0.0018, + "loss": 0.5976, "step": 7630 }, { "epoch": 29.00544061302682, - "grad_norm": 0.002854238962754607, + "grad_norm": 0.0018318990478292108, "learning_rate": 4.6062154108131125e-06, - "loss": 1.3297, + "loss": 1.1538, "step": 7640 }, { "epoch": 29.006206896551724, - "grad_norm": 0.1321992576122284, + "grad_norm": 0.0030592659022659063, "learning_rate": 4.5977011494252875e-06, - "loss": 0.1907, + "loss": 0.0004, "step": 7650 }, { "epoch": 29.006973180076628, - "grad_norm": 0.002835531486198306, + "grad_norm": 0.003692545695230365, "learning_rate": 4.589186888037463e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 7660 }, { "epoch": 29.00773946360153, - "grad_norm": 0.5523931980133057, + "grad_norm": 0.0039315600879490376, "learning_rate": 4.580672626649638e-06, - "loss": 0.0021, + "loss": 0.5883, "step": 7670 }, { "epoch": 29.00850574712644, - "grad_norm": 0.04471282660961151, + "grad_norm": 0.008651458658277988, "learning_rate": 4.572158365261814e-06, - "loss": 0.0014, + "loss": 0.0047, "step": 7680 }, { "epoch": 29.009272030651342, - "grad_norm": 0.012600434012711048, + "grad_norm": 0.003915166482329369, "learning_rate": 4.563644103873989e-06, - "loss": 0.0012, + "loss": 0.4173, "step": 7690 }, { "epoch": 29.010038314176246, - "grad_norm": 0.045478709042072296, + "grad_norm": 0.01087894942611456, "learning_rate": 4.555129842486164e-06, - "loss": 0.9044, + "loss": 0.8188, "step": 7700 }, { "epoch": 29.01080459770115, - "grad_norm": 0.36786070466041565, + "grad_norm": 0.1352248638868332, "learning_rate": 4.54661558109834e-06, - "loss": 0.0308, + "loss": 0.3454, "step": 7710 }, { "epoch": 29.011570881226053, - "grad_norm": 0.010155444964766502, + "grad_norm": 0.002162589691579342, "learning_rate": 4.538101319710516e-06, - "loss": 0.4757, + "loss": 0.2552, "step": 7720 }, { "epoch": 29.012337164750956, - "grad_norm": 108.50332641601562, + "grad_norm": 26.094942092895508, "learning_rate": 4.529587058322691e-06, - "loss": 0.0081, + "loss": 0.0044, "step": 7730 }, { "epoch": 29.013103448275864, - "grad_norm": 0.005563146434724331, + "grad_norm": 0.0034353185910731554, "learning_rate": 4.521072796934866e-06, - "loss": 0.3228, + "loss": 0.6705, "step": 7740 }, { "epoch": 29.013869731800767, - "grad_norm": 0.17820726335048676, + "grad_norm": 0.11335699260234833, "learning_rate": 4.512558535547042e-06, - "loss": 0.0008, + "loss": 0.7386, "step": 7750 }, { "epoch": 29.01463601532567, - "grad_norm": 0.03194405883550644, + "grad_norm": 0.05063920468091965, "learning_rate": 4.5040442741592175e-06, - "loss": 0.0825, + "loss": 0.9943, "step": 7760 }, { "epoch": 29.015402298850574, - "grad_norm": 0.20218154788017273, + "grad_norm": 0.402839720249176, "learning_rate": 4.4955300127713925e-06, - "loss": 0.6149, + "loss": 0.6932, "step": 7770 }, { "epoch": 29.016168582375478, - "grad_norm": 0.0084315100684762, + "grad_norm": 0.006094322539865971, "learning_rate": 4.4870157513835675e-06, - "loss": 0.5895, + "loss": 0.2366, "step": 7780 }, { "epoch": 29.01693486590038, - "grad_norm": 0.0013169109588488936, + "grad_norm": 0.0007567451684735715, "learning_rate": 4.478501489995743e-06, - "loss": 0.6104, + "loss": 0.7505, "step": 7790 }, { "epoch": 29.01770114942529, - "grad_norm": 0.0030624265782535076, + "grad_norm": 0.0028736265376210213, "learning_rate": 4.469987228607919e-06, - "loss": 1.3294, + "loss": 0.2802, "step": 7800 }, { "epoch": 29.018467432950192, - "grad_norm": 0.006027798168361187, + "grad_norm": 0.0036474340595304966, "learning_rate": 4.461472967220094e-06, - "loss": 0.5671, + "loss": 0.4585, "step": 7810 }, { "epoch": 29.019233716475096, - "grad_norm": 160.5233154296875, + "grad_norm": 846.0031127929688, "learning_rate": 4.452958705832269e-06, - "loss": 0.0112, + "loss": 0.2674, "step": 7820 }, { "epoch": 29.02, - "grad_norm": 0.002737992675974965, + "grad_norm": 0.004186100792139769, "learning_rate": 4.444444444444444e-06, - "loss": 1.164, + "loss": 0.3838, "step": 7830 }, { "epoch": 29.02, - "eval_accuracy": 0.6222222222222222, - "eval_loss": 2.628338575363159, - "eval_runtime": 16.6956, - "eval_samples_per_second": 2.695, - "eval_steps_per_second": 2.695, + "eval_accuracy": 0.6888888888888889, + "eval_loss": 2.0821938514709473, + "eval_runtime": 17.1298, + "eval_samples_per_second": 2.627, + "eval_steps_per_second": 2.627, "step": 7830 }, { "epoch": 30.000766283524904, - "grad_norm": 0.0017403337405994534, + "grad_norm": 0.0013206526637077332, "learning_rate": 4.43593018305662e-06, - "loss": 1.548, + "loss": 0.5457, "step": 7840 }, { "epoch": 30.001532567049807, - "grad_norm": 0.0043546645902097225, + "grad_norm": 0.0016494260635226965, "learning_rate": 4.427415921668796e-06, - "loss": 0.6772, + "loss": 0.6652, "step": 7850 }, { "epoch": 30.002298850574714, - "grad_norm": 0.3664500415325165, + "grad_norm": 0.003054765984416008, "learning_rate": 4.418901660280971e-06, - "loss": 0.002, + "loss": 0.0779, "step": 7860 }, { "epoch": 30.003065134099618, - "grad_norm": 0.0019415492424741387, + "grad_norm": 0.0009527786169201136, "learning_rate": 4.410387398893146e-06, - "loss": 0.0059, + "loss": 0.0008, "step": 7870 }, { "epoch": 30.00383141762452, - "grad_norm": 1522.8701171875, + "grad_norm": 39.282928466796875, "learning_rate": 4.401873137505322e-06, - "loss": 0.573, + "loss": 0.0105, "step": 7880 }, { "epoch": 30.004597701149425, - "grad_norm": 0.4469618499279022, + "grad_norm": 0.41454750299453735, "learning_rate": 4.3933588761174975e-06, - "loss": 0.0011, + "loss": 0.0116, "step": 7890 }, { "epoch": 30.00536398467433, - "grad_norm": 0.0040297298692166805, + "grad_norm": 0.002337217330932617, "learning_rate": 4.3848446147296725e-06, - "loss": 0.0005, + "loss": 0.0044, "step": 7900 }, { "epoch": 30.006130268199232, - "grad_norm": 0.05007871612906456, + "grad_norm": 0.022408273071050644, "learning_rate": 4.3763303533418475e-06, - "loss": 0.9817, + "loss": 0.5814, "step": 7910 }, { "epoch": 30.00689655172414, - "grad_norm": 750.298095703125, + "grad_norm": 0.06461034715175629, "learning_rate": 4.367816091954023e-06, - "loss": 0.7312, + "loss": 0.9275, "step": 7920 }, { "epoch": 30.007662835249043, - "grad_norm": 0.04309046268463135, + "grad_norm": 0.15553386509418488, "learning_rate": 4.359301830566199e-06, - "loss": 0.0008, + "loss": 0.0116, "step": 7930 }, { "epoch": 30.008429118773947, - "grad_norm": 0.1190996766090393, + "grad_norm": 0.03268583491444588, "learning_rate": 4.350787569178374e-06, - "loss": 0.5812, + "loss": 0.8124, "step": 7940 }, { "epoch": 30.00919540229885, - "grad_norm": 0.01408596895635128, + "grad_norm": 0.01738172210752964, "learning_rate": 4.342273307790549e-06, - "loss": 0.1637, + "loss": 0.5301, "step": 7950 }, { "epoch": 30.009961685823754, - "grad_norm": 0.08507414907217026, + "grad_norm": 0.030108315870165825, "learning_rate": 4.333759046402725e-06, - "loss": 0.6358, + "loss": 0.0587, "step": 7960 }, { "epoch": 30.010727969348657, - "grad_norm": 0.017958883196115494, + "grad_norm": 0.08870036154985428, "learning_rate": 4.325244785014901e-06, - "loss": 0.0022, + "loss": 0.0068, "step": 7970 }, { "epoch": 30.011494252873565, - "grad_norm": 0.020431462675333023, + "grad_norm": 825.009521484375, "learning_rate": 4.316730523627076e-06, - "loss": 0.0011, + "loss": 0.4914, "step": 7980 }, { "epoch": 30.01226053639847, - "grad_norm": 0.0014786423416808248, + "grad_norm": 0.0009111390681937337, "learning_rate": 4.308216262239251e-06, - "loss": 0.0005, + "loss": 0.5104, "step": 7990 }, { "epoch": 30.013026819923372, - "grad_norm": 0.013602971099317074, + "grad_norm": 0.02633812092244625, "learning_rate": 4.299702000851427e-06, - "loss": 0.0002, + "loss": 0.0349, "step": 8000 }, { "epoch": 30.013793103448275, - "grad_norm": 0.0018909159116446972, + "grad_norm": 0.0009325495921075344, "learning_rate": 4.291187739463602e-06, - "loss": 2.4216, + "loss": 1.5482, "step": 8010 }, { "epoch": 30.01455938697318, - "grad_norm": 0.16683825850486755, + "grad_norm": 0.004731109831482172, "learning_rate": 4.2826734780757775e-06, - "loss": 0.0008, + "loss": 0.0003, "step": 8020 }, { "epoch": 30.015325670498083, - "grad_norm": 0.003589870408177376, + "grad_norm": 0.0038051034789532423, "learning_rate": 4.2741592166879525e-06, - "loss": 0.4971, + "loss": 0.418, "step": 8030 }, { "epoch": 30.01609195402299, - "grad_norm": 0.13243216276168823, + "grad_norm": 2.8870105743408203, "learning_rate": 4.2656449553001275e-06, - "loss": 0.0012, + "loss": 0.001, "step": 8040 }, { "epoch": 30.016858237547893, - "grad_norm": 0.00374067691154778, + "grad_norm": 0.004462406970560551, "learning_rate": 4.257130693912303e-06, - "loss": 0.0047, + "loss": 0.008, "step": 8050 }, { "epoch": 30.017624521072797, - "grad_norm": 424.7337646484375, + "grad_norm": 124.01143646240234, "learning_rate": 4.248616432524479e-06, - "loss": 0.7271, + "loss": 0.6256, "step": 8060 }, { "epoch": 30.0183908045977, - "grad_norm": 0.0047300742007792, + "grad_norm": 0.0011578965932130814, "learning_rate": 4.240102171136654e-06, - "loss": 0.0019, + "loss": 0.9633, "step": 8070 }, { "epoch": 30.019157088122604, - "grad_norm": 0.010775229893624783, + "grad_norm": 0.0010130584705621004, "learning_rate": 4.231587909748829e-06, - "loss": 0.0009, + "loss": 0.0084, "step": 8080 }, { "epoch": 30.01992337164751, - "grad_norm": 0.006654116325080395, + "grad_norm": 0.0014935218496248126, "learning_rate": 4.223073648361005e-06, - "loss": 0.676, + "loss": 0.3837, "step": 8090 }, { "epoch": 30.02, - "eval_accuracy": 0.7555555555555555, - "eval_loss": 1.6462864875793457, - "eval_runtime": 17.4619, - "eval_samples_per_second": 2.577, - "eval_steps_per_second": 2.577, + "eval_accuracy": 0.7111111111111111, + "eval_loss": 2.0535550117492676, + "eval_runtime": 18.0382, + "eval_samples_per_second": 2.495, + "eval_steps_per_second": 2.495, "step": 8091 }, { "epoch": 31.000689655172415, - "grad_norm": 0.08413773030042648, + "grad_norm": 0.018839921802282333, "learning_rate": 4.214559386973181e-06, - "loss": 0.3963, + "loss": 0.2228, "step": 8100 }, { "epoch": 31.00145593869732, - "grad_norm": 0.014156692661345005, + "grad_norm": 0.003570414148271084, "learning_rate": 4.206045125585356e-06, - "loss": 0.6865, + "loss": 0.1435, "step": 8110 }, { "epoch": 31.002222222222223, - "grad_norm": 0.0010045836679637432, + "grad_norm": 0.0007479687919840217, "learning_rate": 4.197530864197531e-06, - "loss": 0.426, + "loss": 0.0727, "step": 8120 }, { "epoch": 31.002988505747126, - "grad_norm": 1383.915283203125, + "grad_norm": 0.062240712344646454, "learning_rate": 4.189016602809707e-06, - "loss": 0.9189, + "loss": 0.7809, "step": 8130 }, { "epoch": 31.00375478927203, - "grad_norm": 0.0045251562260091305, + "grad_norm": 0.005926563870161772, "learning_rate": 4.180502341421882e-06, - "loss": 0.4047, + "loss": 0.0005, "step": 8140 }, { "epoch": 31.004521072796933, - "grad_norm": 48.45637130737305, + "grad_norm": 0.1666710078716278, "learning_rate": 4.1719880800340575e-06, - "loss": 0.0063, + "loss": 0.8318, "step": 8150 }, { "epoch": 31.00528735632184, - "grad_norm": 0.0015083706239238381, + "grad_norm": 0.006349637638777494, "learning_rate": 4.1634738186462325e-06, - "loss": 0.0006, + "loss": 0.0009, "step": 8160 }, { "epoch": 31.006053639846744, - "grad_norm": 0.002367808949202299, + "grad_norm": 0.0018807012820616364, "learning_rate": 4.154959557258408e-06, - "loss": 0.001, + "loss": 0.7813, "step": 8170 }, { "epoch": 31.006819923371648, - "grad_norm": 0.03869802504777908, + "grad_norm": 0.021228015422821045, "learning_rate": 4.146445295870583e-06, - "loss": 0.0004, + "loss": 0.1962, "step": 8180 }, { "epoch": 31.00758620689655, - "grad_norm": 0.0017894607735797763, + "grad_norm": 0.0012813645880669355, "learning_rate": 4.137931034482759e-06, - "loss": 0.1406, + "loss": 0.0776, "step": 8190 }, { "epoch": 31.008352490421455, - "grad_norm": 0.0027988513465970755, + "grad_norm": 0.03293246403336525, "learning_rate": 4.129416773094934e-06, - "loss": 0.0022, + "loss": 0.5161, "step": 8200 }, { "epoch": 31.00911877394636, - "grad_norm": 0.0025696379598230124, + "grad_norm": 0.043992917984724045, "learning_rate": 4.12090251170711e-06, - "loss": 0.5572, + "loss": 1.4302, "step": 8210 }, { "epoch": 31.009885057471266, - "grad_norm": 0.10699835419654846, + "grad_norm": 0.05921050161123276, "learning_rate": 4.112388250319285e-06, - "loss": 0.0001, + "loss": 0.0003, "step": 8220 }, { "epoch": 31.01065134099617, - "grad_norm": 0.03235405683517456, + "grad_norm": 0.01283930242061615, "learning_rate": 4.103873988931461e-06, - "loss": 0.0013, + "loss": 0.3371, "step": 8230 }, { "epoch": 31.011417624521073, - "grad_norm": 0.0010920802596956491, + "grad_norm": 0.0008328980184160173, "learning_rate": 4.095359727543636e-06, - "loss": 0.1774, + "loss": 0.0018, "step": 8240 }, { "epoch": 31.012183908045976, - "grad_norm": 0.0032772994600236416, + "grad_norm": 0.0029625671450048685, "learning_rate": 4.086845466155812e-06, - "loss": 0.9424, + "loss": 0.0004, "step": 8250 }, { "epoch": 31.01295019157088, - "grad_norm": 0.13287808001041412, + "grad_norm": 0.011506871320307255, "learning_rate": 4.078331204767987e-06, - "loss": 1.021, + "loss": 0.8298, "step": 8260 }, { "epoch": 31.013716475095784, - "grad_norm": 0.035149626433849335, + "grad_norm": 0.013462170027196407, "learning_rate": 4.0698169433801625e-06, - "loss": 0.6229, + "loss": 0.121, "step": 8270 }, { "epoch": 31.01448275862069, - "grad_norm": 0.0018004273297265172, + "grad_norm": 0.0030199168249964714, "learning_rate": 4.0613026819923375e-06, - "loss": 0.0014, + "loss": 0.0004, "step": 8280 }, { "epoch": 31.015249042145594, - "grad_norm": 0.0022860304452478886, + "grad_norm": 0.0018084817565977573, "learning_rate": 4.052788420604513e-06, - "loss": 0.5154, + "loss": 0.6492, "step": 8290 }, { "epoch": 31.016015325670498, - "grad_norm": 0.009974866174161434, + "grad_norm": 0.004171090200543404, "learning_rate": 4.044274159216688e-06, - "loss": 0.0008, + "loss": 0.3672, "step": 8300 }, { "epoch": 31.0167816091954, - "grad_norm": 0.026273837313055992, + "grad_norm": 41.73890686035156, "learning_rate": 4.035759897828863e-06, - "loss": 0.0011, + "loss": 0.0036, "step": 8310 }, { "epoch": 31.017547892720305, - "grad_norm": 0.001352796913124621, + "grad_norm": 0.004839760717004538, "learning_rate": 4.027245636441039e-06, - "loss": 0.0015, + "loss": 0.2028, "step": 8320 }, { "epoch": 31.018314176245212, - "grad_norm": 406.7122497558594, + "grad_norm": 403.3126220703125, "learning_rate": 4.018731375053214e-06, - "loss": 0.663, + "loss": 0.6316, "step": 8330 }, { "epoch": 31.019080459770116, - "grad_norm": 0.0034345497842878103, + "grad_norm": 0.005046645179390907, "learning_rate": 4.01021711366539e-06, - "loss": 0.0008, + "loss": 0.0025, "step": 8340 }, { "epoch": 31.01984674329502, - "grad_norm": 0.04289378225803375, + "grad_norm": 0.04529179260134697, "learning_rate": 4.001702852277565e-06, - "loss": 0.0004, + "loss": 0.0003, "step": 8350 }, { "epoch": 31.02, - "eval_accuracy": 0.6666666666666666, - "eval_loss": 2.187613010406494, - "eval_runtime": 17.5172, - "eval_samples_per_second": 2.569, - "eval_steps_per_second": 2.569, + "eval_accuracy": 0.6444444444444445, + "eval_loss": 2.291869640350342, + "eval_runtime": 17.1072, + "eval_samples_per_second": 2.63, + "eval_steps_per_second": 2.63, "step": 8352 }, { "epoch": 32.00061302681992, - "grad_norm": 0.050760891288518906, + "grad_norm": 0.029900701716542244, "learning_rate": 3.993188590889741e-06, - "loss": 0.0007, + "loss": 0.0005, "step": 8360 }, { "epoch": 32.00137931034483, - "grad_norm": 0.044432133436203, + "grad_norm": 0.022851264104247093, "learning_rate": 3.984674329501916e-06, - "loss": 0.0003, + "loss": 0.0017, "step": 8370 }, { "epoch": 32.002145593869734, - "grad_norm": 0.02326607145369053, + "grad_norm": 0.0930880680680275, "learning_rate": 3.976160068114092e-06, - "loss": 0.0005, + "loss": 0.657, "step": 8380 }, { "epoch": 32.00291187739464, - "grad_norm": 331.4932556152344, + "grad_norm": 183.4925537109375, "learning_rate": 3.967645806726267e-06, - "loss": 0.4065, + "loss": 0.6549, "step": 8390 }, { "epoch": 32.00367816091954, - "grad_norm": 0.007163768634200096, + "grad_norm": 0.040463775396347046, "learning_rate": 3.9591315453384425e-06, - "loss": 0.7046, + "loss": 0.9336, "step": 8400 }, { "epoch": 32.004444444444445, - "grad_norm": 0.02402699738740921, + "grad_norm": 0.030311554670333862, "learning_rate": 3.9506172839506175e-06, - "loss": 0.6798, + "loss": 0.0735, "step": 8410 }, { "epoch": 32.00521072796935, - "grad_norm": 0.07197339087724686, + "grad_norm": 0.197578564286232, "learning_rate": 3.942103022562793e-06, - "loss": 0.912, + "loss": 0.613, "step": 8420 }, { "epoch": 32.00597701149425, - "grad_norm": 0.1396198272705078, + "grad_norm": 0.027124224230647087, "learning_rate": 3.933588761174968e-06, - "loss": 0.0008, + "loss": 0.0009, "step": 8430 }, { "epoch": 32.006743295019156, - "grad_norm": 0.05314707010984421, + "grad_norm": 0.0229842197149992, "learning_rate": 3.925074499787143e-06, - "loss": 0.5205, + "loss": 0.0007, "step": 8440 }, { "epoch": 32.00750957854406, - "grad_norm": 0.0016349204815924168, + "grad_norm": 0.0008010246674530208, "learning_rate": 3.916560238399319e-06, - "loss": 0.5112, + "loss": 0.0746, "step": 8450 }, { "epoch": 32.00827586206896, - "grad_norm": 0.02777864784002304, + "grad_norm": 0.1851923167705536, "learning_rate": 3.908045977011495e-06, - "loss": 0.0005, + "loss": 0.0003, "step": 8460 }, { "epoch": 32.00904214559387, - "grad_norm": 0.0014905523275956511, + "grad_norm": 0.0020714455749839544, "learning_rate": 3.89953171562367e-06, - "loss": 0.6179, + "loss": 0.0006, "step": 8470 }, { "epoch": 32.00980842911878, - "grad_norm": 0.011606418527662754, + "grad_norm": 0.025280645117163658, "learning_rate": 3.891017454235845e-06, - "loss": 0.0043, + "loss": 0.0002, "step": 8480 }, { "epoch": 32.01057471264368, - "grad_norm": 1.9649847745895386, + "grad_norm": 0.025958703830838203, "learning_rate": 3.882503192848021e-06, - "loss": 0.3938, + "loss": 0.0007, "step": 8490 }, { "epoch": 32.011340996168585, - "grad_norm": 0.039615459740161896, + "grad_norm": 0.06702405959367752, "learning_rate": 3.873988931460197e-06, - "loss": 1.3381, + "loss": 0.6785, "step": 8500 }, { "epoch": 32.01210727969349, - "grad_norm": 4.187680244445801, + "grad_norm": 4.9963788986206055, "learning_rate": 3.865474670072372e-06, - "loss": 0.002, + "loss": 0.0012, "step": 8510 }, { "epoch": 32.01287356321839, - "grad_norm": 0.0010674932273104787, + "grad_norm": 0.0007506079273298383, "learning_rate": 3.856960408684547e-06, - "loss": 0.0968, + "loss": 0.9654, "step": 8520 }, { "epoch": 32.013639846743295, - "grad_norm": 0.0013790702214464545, + "grad_norm": 0.0017522679409012198, "learning_rate": 3.8484461472967225e-06, - "loss": 0.5977, + "loss": 0.003, "step": 8530 }, { "epoch": 32.0144061302682, - "grad_norm": 0.009065764956176281, + "grad_norm": 0.012696490623056889, "learning_rate": 3.839931885908898e-06, - "loss": 0.0043, + "loss": 0.4043, "step": 8540 }, { "epoch": 32.0151724137931, - "grad_norm": 0.2218812257051468, + "grad_norm": 1028.671875, "learning_rate": 3.831417624521073e-06, - "loss": 0.5941, + "loss": 1.0257, "step": 8550 }, { "epoch": 32.015938697318006, - "grad_norm": 0.0020185543689876795, + "grad_norm": 0.0011120631825178862, "learning_rate": 3.822903363133248e-06, - "loss": 0.0008, + "loss": 0.0002, "step": 8560 }, { "epoch": 32.01670498084291, - "grad_norm": 0.06359002739191055, + "grad_norm": 0.09038526564836502, "learning_rate": 3.8143891017454237e-06, - "loss": 0.2812, + "loss": 0.0005, "step": 8570 }, { "epoch": 32.01747126436781, - "grad_norm": 0.0026320351753383875, + "grad_norm": 0.0020666453056037426, "learning_rate": 3.805874840357599e-06, - "loss": 0.6279, + "loss": 0.7113, "step": 8580 }, { "epoch": 32.01823754789272, - "grad_norm": 0.06365342438220978, + "grad_norm": 0.059685710817575455, "learning_rate": 3.797360578969775e-06, - "loss": 0.0017, + "loss": 0.0009, "step": 8590 }, { "epoch": 32.01900383141763, - "grad_norm": 0.0018165806541219354, + "grad_norm": 0.0008020559325814247, "learning_rate": 3.78884631758195e-06, - "loss": 0.2256, + "loss": 0.5951, "step": 8600 }, { "epoch": 32.01977011494253, - "grad_norm": 0.7751949429512024, + "grad_norm": 0.24108955264091492, "learning_rate": 3.7803320561941254e-06, - "loss": 0.0005, + "loss": 0.0002, "step": 8610 }, { "epoch": 32.02, - "eval_accuracy": 0.6222222222222222, - "eval_loss": 2.8828539848327637, - "eval_runtime": 17.4465, - "eval_samples_per_second": 2.579, - "eval_steps_per_second": 2.579, + "eval_accuracy": 0.6444444444444445, + "eval_loss": 2.578796863555908, + "eval_runtime": 16.1187, + "eval_samples_per_second": 2.792, + "eval_steps_per_second": 2.792, "step": 8613 }, { "epoch": 33.00053639846743, - "grad_norm": 0.11451853066682816, + "grad_norm": 178.100830078125, "learning_rate": 3.7718177948063004e-06, - "loss": 0.9613, + "loss": 1.5386, "step": 8620 }, { "epoch": 33.001302681992335, - "grad_norm": 23.902896881103516, + "grad_norm": 125.62797546386719, "learning_rate": 3.7633035334184762e-06, - "loss": 0.042, + "loss": 0.2597, "step": 8630 }, { "epoch": 33.00206896551724, - "grad_norm": 0.04148732125759125, + "grad_norm": 0.02392035350203514, "learning_rate": 3.7547892720306517e-06, - "loss": 0.0003, + "loss": 0.0004, "step": 8640 }, { "epoch": 33.00283524904214, - "grad_norm": 0.0014657662250101566, + "grad_norm": 0.0008896150393411517, "learning_rate": 3.746275010642827e-06, - "loss": 0.0002, + "loss": 0.0009, "step": 8650 }, { "epoch": 33.00360153256705, - "grad_norm": 0.005355422385036945, + "grad_norm": 0.003613451961427927, "learning_rate": 3.737760749255002e-06, - "loss": 0.4988, + "loss": 0.7367, "step": 8660 }, { "epoch": 33.00436781609196, - "grad_norm": 0.01895313523709774, + "grad_norm": 0.032477278262376785, "learning_rate": 3.729246487867178e-06, - "loss": 0.6797, + "loss": 0.0438, "step": 8670 }, { "epoch": 33.00513409961686, - "grad_norm": 0.12008937448263168, + "grad_norm": 203.76272583007812, "learning_rate": 3.7207322264793533e-06, - "loss": 0.7876, + "loss": 1.4811, "step": 8680 }, { "epoch": 33.005900383141764, - "grad_norm": 0.0012769072782248259, + "grad_norm": 0.0011263691121712327, "learning_rate": 3.7122179650915287e-06, - "loss": 0.0004, + "loss": 0.0005, "step": 8690 }, { "epoch": 33.00666666666667, - "grad_norm": 50.389556884765625, + "grad_norm": 1139.5672607421875, "learning_rate": 3.7037037037037037e-06, - "loss": 0.6235, + "loss": 0.4859, "step": 8700 }, { "epoch": 33.00743295019157, - "grad_norm": 0.00215235841460526, + "grad_norm": 0.0014041824033483863, "learning_rate": 3.6951894423158796e-06, - "loss": 0.6996, + "loss": 0.0159, "step": 8710 }, { "epoch": 33.008199233716475, - "grad_norm": 0.0014900616370141506, + "grad_norm": 0.0030003490392118692, "learning_rate": 3.686675180928055e-06, - "loss": 0.1534, + "loss": 0.8292, "step": 8720 }, { "epoch": 33.00896551724138, - "grad_norm": 0.0135155338793993, + "grad_norm": 0.0022226409055292606, "learning_rate": 3.67816091954023e-06, - "loss": 0.0004, + "loss": 0.0003, "step": 8730 }, { "epoch": 33.00973180076628, - "grad_norm": 0.0025974265299737453, + "grad_norm": 0.0018752004252746701, "learning_rate": 3.6696466581524054e-06, - "loss": 0.0163, + "loss": 0.4858, "step": 8740 }, { "epoch": 33.010498084291186, - "grad_norm": 0.027656665071845055, + "grad_norm": 0.016336733475327492, "learning_rate": 3.6611323967645812e-06, - "loss": 0.0093, + "loss": 0.0002, "step": 8750 }, { "epoch": 33.01126436781609, - "grad_norm": 0.0014540269039571285, + "grad_norm": 0.0008627419592812657, "learning_rate": 3.6526181353767567e-06, - "loss": 0.6131, + "loss": 0.217, "step": 8760 }, { "epoch": 33.01203065134099, - "grad_norm": 0.14054560661315918, + "grad_norm": 0.18590013682842255, "learning_rate": 3.6441038739889317e-06, - "loss": 0.0008, + "loss": 0.0007, "step": 8770 }, { "epoch": 33.012796934865904, - "grad_norm": 0.0007935257162898779, + "grad_norm": 0.0006980589823797345, "learning_rate": 3.635589612601107e-06, - "loss": 1.1133, + "loss": 0.4413, "step": 8780 }, { "epoch": 33.01356321839081, - "grad_norm": 0.0011439216323196888, + "grad_norm": 0.0013980860821902752, "learning_rate": 3.627075351213283e-06, - "loss": 0.0009, + "loss": 0.0032, "step": 8790 }, { "epoch": 33.01432950191571, - "grad_norm": 49.440616607666016, + "grad_norm": 491.8056945800781, "learning_rate": 3.618561089825458e-06, - "loss": 1.0856, + "loss": 1.2796, "step": 8800 }, { "epoch": 33.015095785440614, - "grad_norm": 0.002337819430977106, + "grad_norm": 0.000672339869197458, "learning_rate": 3.6100468284376333e-06, - "loss": 0.1034, + "loss": 0.4991, "step": 8810 }, { "epoch": 33.01586206896552, - "grad_norm": 0.07056647539138794, + "grad_norm": 0.0057693226262927055, "learning_rate": 3.6015325670498087e-06, - "loss": 0.8216, + "loss": 2.7266, "step": 8820 }, { "epoch": 33.01662835249042, - "grad_norm": 643.5972900390625, + "grad_norm": 192.10910034179688, "learning_rate": 3.5930183056619837e-06, - "loss": 0.4272, + "loss": 0.6061, "step": 8830 }, { "epoch": 33.017394636015325, - "grad_norm": 0.007394916843622923, + "grad_norm": 0.029679972678422928, "learning_rate": 3.5845040442741596e-06, - "loss": 0.0009, + "loss": 0.0004, "step": 8840 }, { "epoch": 33.01816091954023, - "grad_norm": 0.005081645678728819, + "grad_norm": 0.002240247093141079, "learning_rate": 3.575989782886335e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 8850 }, { "epoch": 33.01892720306513, - "grad_norm": 0.008987536653876305, + "grad_norm": 0.6050621271133423, "learning_rate": 3.56747552149851e-06, - "loss": 0.0023, + "loss": 0.0005, "step": 8860 }, { "epoch": 33.019693486590036, - "grad_norm": 0.023250076919794083, + "grad_norm": 0.010382554493844509, "learning_rate": 3.5589612601106854e-06, - "loss": 0.374, + "loss": 0.0007, "step": 8870 }, { "epoch": 33.02, "eval_accuracy": 0.6, - "eval_loss": 3.01200795173645, - "eval_runtime": 18.6765, - "eval_samples_per_second": 2.409, - "eval_steps_per_second": 2.409, + "eval_loss": 2.7998201847076416, + "eval_runtime": 15.5183, + "eval_samples_per_second": 2.9, + "eval_steps_per_second": 2.9, "step": 8874 }, { "epoch": 34.000459770114944, - "grad_norm": 0.026298148557543755, + "grad_norm": 0.011408054269850254, "learning_rate": 3.5504469987228612e-06, - "loss": 0.8923, + "loss": 0.0011, "step": 8880 }, { "epoch": 34.00122605363985, - "grad_norm": 0.0008303425856865942, + "grad_norm": 0.00235149753279984, "learning_rate": 3.5419327373350367e-06, - "loss": 0.5597, + "loss": 1.129, "step": 8890 }, { "epoch": 34.00199233716475, - "grad_norm": 0.38984957337379456, + "grad_norm": 92.75953674316406, "learning_rate": 3.5334184759472117e-06, - "loss": 0.6938, + "loss": 0.8057, "step": 8900 }, { "epoch": 34.002758620689654, - "grad_norm": 0.06885547190904617, + "grad_norm": 0.023898348212242126, "learning_rate": 3.524904214559387e-06, - "loss": 0.6293, + "loss": 0.0005, "step": 8910 }, { "epoch": 34.00352490421456, - "grad_norm": 0.028704319149255753, + "grad_norm": 0.02499270811676979, "learning_rate": 3.516389953171563e-06, - "loss": 0.6162, + "loss": 0.7686, "step": 8920 }, { "epoch": 34.00429118773946, - "grad_norm": 0.0016891061095520854, + "grad_norm": 0.0013700644485652447, "learning_rate": 3.507875691783738e-06, - "loss": 0.0026, + "loss": 0.0005, "step": 8930 }, { "epoch": 34.005057471264365, - "grad_norm": 0.014870488084852695, + "grad_norm": 0.002694552531465888, "learning_rate": 3.4993614303959133e-06, - "loss": 0.0047, + "loss": 0.0004, "step": 8940 }, { "epoch": 34.00582375478927, - "grad_norm": 225.62716674804688, + "grad_norm": 252.3150177001953, "learning_rate": 3.4908471690080887e-06, - "loss": 0.6202, + "loss": 0.627, "step": 8950 }, { "epoch": 34.00659003831418, - "grad_norm": 0.004021927714347839, + "grad_norm": 0.00319548137485981, "learning_rate": 3.4823329076202646e-06, - "loss": 0.01, + "loss": 0.3893, "step": 8960 }, { "epoch": 34.00735632183908, - "grad_norm": 0.132020965218544, + "grad_norm": 0.08170146495103836, "learning_rate": 3.4738186462324396e-06, - "loss": 0.0222, + "loss": 0.0015, "step": 8970 }, { "epoch": 34.00812260536399, - "grad_norm": 0.002630164846777916, + "grad_norm": 0.0025141071528196335, "learning_rate": 3.465304384844615e-06, - "loss": 0.6791, + "loss": 0.0038, "step": 8980 }, { "epoch": 34.00888888888889, - "grad_norm": 0.001463883207179606, + "grad_norm": 0.0012387968599796295, "learning_rate": 3.4567901234567904e-06, - "loss": 0.1738, + "loss": 0.0002, "step": 8990 }, { "epoch": 34.009655172413794, - "grad_norm": 0.5066991448402405, + "grad_norm": 0.015458296053111553, "learning_rate": 3.448275862068966e-06, - "loss": 0.0006, + "loss": 0.0002, "step": 9000 }, { "epoch": 34.0104214559387, - "grad_norm": 0.0008095522643998265, + "grad_norm": 0.0011621768353506923, "learning_rate": 3.4397616006811412e-06, - "loss": 0.6062, + "loss": 0.0002, "step": 9010 }, { "epoch": 34.0111877394636, - "grad_norm": 0.03945675864815712, + "grad_norm": 0.007999553345143795, "learning_rate": 3.4312473392933167e-06, - "loss": 1.1932, + "loss": 1.4172, "step": 9020 }, { "epoch": 34.011954022988505, - "grad_norm": 0.0018284351099282503, + "grad_norm": 0.0016818393487483263, "learning_rate": 3.4227330779054917e-06, - "loss": 0.0155, + "loss": 0.3123, "step": 9030 }, { "epoch": 34.01272030651341, - "grad_norm": 0.7912819981575012, + "grad_norm": 2.0030250549316406, "learning_rate": 3.4142188165176675e-06, - "loss": 0.0015, + "loss": 0.0021, "step": 9040 }, { "epoch": 34.01348659003831, - "grad_norm": 0.0008633221150375903, + "grad_norm": 0.0008237874717451632, "learning_rate": 3.405704555129843e-06, - "loss": 0.0018, + "loss": 0.2649, "step": 9050 }, { "epoch": 34.014252873563215, - "grad_norm": 0.8620066046714783, + "grad_norm": 804.2601318359375, "learning_rate": 3.3971902937420183e-06, - "loss": 0.0016, + "loss": 1.1015, "step": 9060 }, { "epoch": 34.01501915708812, - "grad_norm": 0.0023911388125270605, + "grad_norm": 0.0055848085321486, "learning_rate": 3.3886760323541933e-06, - "loss": 0.0054, + "loss": 0.0002, "step": 9070 }, { "epoch": 34.01578544061303, - "grad_norm": 0.0172291062772274, + "grad_norm": 0.19129914045333862, "learning_rate": 3.380161770966369e-06, - "loss": 0.0094, + "loss": 0.0003, "step": 9080 }, { "epoch": 34.01655172413793, - "grad_norm": 41.288482666015625, + "grad_norm": 6.2389421463012695, "learning_rate": 3.3716475095785446e-06, - "loss": 2.1483, + "loss": 1.3826, "step": 9090 }, { "epoch": 34.01731800766284, - "grad_norm": 111.49554443359375, + "grad_norm": 84.3442153930664, "learning_rate": 3.3631332481907196e-06, - "loss": 0.8263, + "loss": 0.8471, "step": 9100 }, { "epoch": 34.01808429118774, - "grad_norm": 0.00186067633330822, + "grad_norm": 0.009653395973145962, "learning_rate": 3.354618986802895e-06, - "loss": 0.004, + "loss": 0.5785, "step": 9110 }, { "epoch": 34.018850574712644, - "grad_norm": 0.008797138929367065, + "grad_norm": 0.008063157089054585, "learning_rate": 3.3461047254150704e-06, - "loss": 0.0043, + "loss": 0.3305, "step": 9120 }, { "epoch": 34.01961685823755, - "grad_norm": 50.060359954833984, + "grad_norm": 20.905275344848633, "learning_rate": 3.3375904640272463e-06, - "loss": 0.5055, + "loss": 0.0073, "step": 9130 }, { "epoch": 34.02, - "eval_accuracy": 0.5777777777777777, - "eval_loss": 2.9882922172546387, - "eval_runtime": 17.484, - "eval_samples_per_second": 2.574, - "eval_steps_per_second": 2.574, + "eval_accuracy": 0.5555555555555556, + "eval_loss": 2.900055170059204, + "eval_runtime": 15.5257, + "eval_samples_per_second": 2.898, + "eval_steps_per_second": 2.898, "step": 9135 }, { "epoch": 35.000383141762455, - "grad_norm": 0.0052855294197797775, + "grad_norm": 0.002380785532295704, "learning_rate": 3.3290762026394212e-06, - "loss": 0.0009, + "loss": 0.0017, "step": 9140 }, { "epoch": 35.00114942528736, - "grad_norm": 0.12240047007799149, + "grad_norm": 0.3021116852760315, "learning_rate": 3.3205619412515967e-06, - "loss": 0.6306, + "loss": 0.6334, "step": 9150 }, { "epoch": 35.00191570881226, - "grad_norm": 0.0038787233643233776, + "grad_norm": 0.0064330194145441055, "learning_rate": 3.3120476798637717e-06, - "loss": 0.0057, + "loss": 0.5454, "step": 9160 }, { "epoch": 35.002681992337166, - "grad_norm": 0.0007793721160851419, + "grad_norm": 0.000918998965062201, "learning_rate": 3.3035334184759475e-06, - "loss": 0.0023, + "loss": 0.0015, "step": 9170 }, { "epoch": 35.00344827586207, - "grad_norm": 0.000989857828244567, + "grad_norm": 0.0010305899195373058, "learning_rate": 3.295019157088123e-06, - "loss": 0.0004, + "loss": 0.0002, "step": 9180 }, { "epoch": 35.00421455938697, - "grad_norm": 0.010914674960076809, + "grad_norm": 0.03052903711795807, "learning_rate": 3.2865048957002983e-06, - "loss": 0.3631, + "loss": 0.1216, "step": 9190 }, { "epoch": 35.00498084291188, - "grad_norm": 0.0008215695852413774, + "grad_norm": 0.0021850867196917534, "learning_rate": 3.2779906343124733e-06, - "loss": 1.2651, + "loss": 0.348, "step": 9200 }, { "epoch": 35.00574712643678, - "grad_norm": 0.005741764325648546, + "grad_norm": 0.03568919003009796, "learning_rate": 3.269476372924649e-06, - "loss": 0.5143, + "loss": 0.6668, "step": 9210 }, { "epoch": 35.006513409961684, - "grad_norm": 118.9592514038086, + "grad_norm": 0.012625262141227722, "learning_rate": 3.2609621115368246e-06, - "loss": 0.9502, + "loss": 0.0004, "step": 9220 }, { "epoch": 35.00727969348659, - "grad_norm": 4.2004618644714355, + "grad_norm": 0.037941791117191315, "learning_rate": 3.2524478501489996e-06, - "loss": 0.0021, + "loss": 0.3085, "step": 9230 }, { "epoch": 35.00804597701149, - "grad_norm": 0.09331857413053513, + "grad_norm": 0.008392606861889362, "learning_rate": 3.243933588761175e-06, "loss": 0.0016, "step": 9240 }, { "epoch": 35.008812260536395, - "grad_norm": 0.023068580776453018, + "grad_norm": 0.11013925820589066, "learning_rate": 3.235419327373351e-06, - "loss": 0.6491, + "loss": 1.2189, "step": 9250 }, { "epoch": 35.009578544061306, - "grad_norm": 381.2834777832031, + "grad_norm": 0.061698272824287415, "learning_rate": 3.2269050659855262e-06, - "loss": 0.5825, + "loss": 0.0123, "step": 9260 }, { "epoch": 35.01034482758621, - "grad_norm": 0.568932056427002, + "grad_norm": 282.1393737792969, "learning_rate": 3.2183908045977012e-06, - "loss": 0.0012, + "loss": 0.5376, "step": 9270 }, { "epoch": 35.01111111111111, - "grad_norm": 0.07121097296476364, + "grad_norm": 0.016884762793779373, "learning_rate": 3.2098765432098767e-06, - "loss": 0.8559, + "loss": 0.4277, "step": 9280 }, { "epoch": 35.01187739463602, - "grad_norm": 0.042752351611852646, + "grad_norm": 0.045929163694381714, "learning_rate": 3.2013622818220525e-06, - "loss": 0.0008, + "loss": 0.0019, "step": 9290 }, { "epoch": 35.01264367816092, - "grad_norm": 0.0024915547110140324, + "grad_norm": 0.0032573333010077477, "learning_rate": 3.1928480204342275e-06, - "loss": 0.0009, + "loss": 0.0012, "step": 9300 }, { "epoch": 35.013409961685824, - "grad_norm": 0.013996579684317112, + "grad_norm": 0.02101743035018444, "learning_rate": 3.184333759046403e-06, - "loss": 0.8324, + "loss": 0.0004, "step": 9310 }, { "epoch": 35.01417624521073, - "grad_norm": 0.0009798520477488637, + "grad_norm": 0.0008551854407414794, "learning_rate": 3.1758194976585783e-06, - "loss": 0.084, + "loss": 0.0002, "step": 9320 }, { "epoch": 35.01494252873563, - "grad_norm": 0.001850913162343204, + "grad_norm": 0.00165683520026505, "learning_rate": 3.167305236270754e-06, - "loss": 0.7948, + "loss": 0.6198, "step": 9330 }, { "epoch": 35.015708812260534, - "grad_norm": 0.002051458926871419, + "grad_norm": 0.0010720925638452172, "learning_rate": 3.158790974882929e-06, - "loss": 0.0005, + "loss": 0.0002, "step": 9340 }, { "epoch": 35.01647509578544, - "grad_norm": 0.003204555017873645, + "grad_norm": 0.001333705266006291, "learning_rate": 3.1502767134951046e-06, - "loss": 0.0019, + "loss": 0.0005, "step": 9350 }, { "epoch": 35.01724137931034, - "grad_norm": 0.0024066295009106398, + "grad_norm": 0.0030160106252878904, "learning_rate": 3.14176245210728e-06, - "loss": 0.0007, + "loss": 0.0003, "step": 9360 }, { "epoch": 35.01800766283525, - "grad_norm": 0.1171986311674118, + "grad_norm": 220.75767517089844, "learning_rate": 3.133248190719455e-06, - "loss": 0.0008, + "loss": 0.6829, "step": 9370 }, { "epoch": 35.018773946360156, - "grad_norm": 0.02009454183280468, + "grad_norm": 0.026932761073112488, "learning_rate": 3.124733929331631e-06, - "loss": 0.7997, + "loss": 0.1942, "step": 9380 }, { "epoch": 35.01954022988506, - "grad_norm": 273.1219177246094, + "grad_norm": 1080.871337890625, "learning_rate": 3.1162196679438062e-06, - "loss": 1.1475, + "loss": 0.1746, "step": 9390 }, { "epoch": 35.02, - "eval_accuracy": 0.6444444444444445, - "eval_loss": 2.298203229904175, - "eval_runtime": 16.8472, - "eval_samples_per_second": 2.671, - "eval_steps_per_second": 2.671, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 2.546032190322876, + "eval_runtime": 15.4635, + "eval_samples_per_second": 2.91, + "eval_steps_per_second": 2.91, "step": 9396 }, { "epoch": 36.00030651340996, - "grad_norm": 0.021218191832304, + "grad_norm": 0.010618367232382298, "learning_rate": 3.1077054065559812e-06, - "loss": 0.813, + "loss": 0.0106, "step": 9400 }, { "epoch": 36.001072796934864, - "grad_norm": 0.02907707169651985, + "grad_norm": 0.03759034350514412, "learning_rate": 3.0991911451681567e-06, - "loss": 0.0003, + "loss": 0.4728, "step": 9410 }, { "epoch": 36.00183908045977, - "grad_norm": 0.0010853084968402982, + "grad_norm": 0.0007568824221380055, "learning_rate": 3.0906768837803325e-06, - "loss": 0.0004, + "loss": 0.0001, "step": 9420 }, { "epoch": 36.00260536398467, - "grad_norm": 0.01685185544192791, + "grad_norm": 0.0033626488875597715, "learning_rate": 3.082162622392508e-06, - "loss": 0.0018, + "loss": 0.0004, "step": 9430 }, { "epoch": 36.00337164750958, - "grad_norm": 0.002827772404998541, + "grad_norm": 0.0024997724685817957, "learning_rate": 3.073648361004683e-06, "loss": 0.0003, "step": 9440 }, { "epoch": 36.004137931034485, - "grad_norm": 0.00204670918174088, + "grad_norm": 0.002448135521262884, "learning_rate": 3.0651340996168583e-06, - "loss": 0.0002, + "loss": 0.2476, "step": 9450 }, { "epoch": 36.00490421455939, - "grad_norm": 0.001061600516550243, + "grad_norm": 0.0012070827651768923, "learning_rate": 3.056619838229034e-06, - "loss": 0.5661, + "loss": 0.0003, "step": 9460 }, { "epoch": 36.00567049808429, - "grad_norm": 0.001115455524995923, + "grad_norm": 0.0005170078366063535, "learning_rate": 3.048105576841209e-06, - "loss": 0.6575, + "loss": 0.7726, "step": 9470 }, { "epoch": 36.006436781609196, - "grad_norm": 0.015395056456327438, + "grad_norm": 0.0032920632511377335, "learning_rate": 3.0395913154533846e-06, - "loss": 0.0031, + "loss": 0.5944, "step": 9480 }, { "epoch": 36.0072030651341, - "grad_norm": 67.76371765136719, + "grad_norm": 1.639153242111206, "learning_rate": 3.03107705406556e-06, - "loss": 0.4773, + "loss": 0.7153, "step": 9490 }, { "epoch": 36.007969348659, - "grad_norm": 0.01183448638767004, + "grad_norm": 0.0422653891146183, "learning_rate": 3.022562792677736e-06, - "loss": 0.0004, + "loss": 0.0001, "step": 9500 }, { "epoch": 36.00873563218391, - "grad_norm": 0.0008166850893758237, + "grad_norm": 0.0006986379739828408, "learning_rate": 3.014048531289911e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 9510 }, { "epoch": 36.00950191570881, - "grad_norm": 0.0011333675356581807, + "grad_norm": 0.0008206897182390094, "learning_rate": 3.0055342699020862e-06, - "loss": 0.0019, + "loss": 0.0031, "step": 9520 }, { "epoch": 36.010268199233714, - "grad_norm": 0.0010564734693616629, + "grad_norm": 0.0010992834577336907, "learning_rate": 2.9970200085142612e-06, - "loss": 1.3967, + "loss": 0.9547, "step": 9530 }, { "epoch": 36.01103448275862, - "grad_norm": 0.001182054285891354, + "grad_norm": 0.0007167965522967279, "learning_rate": 2.988505747126437e-06, - "loss": 0.0005, + "loss": 0.0002, "step": 9540 }, { "epoch": 36.01180076628353, - "grad_norm": 0.0008806926780380309, + "grad_norm": 0.0008066099253483117, "learning_rate": 2.9799914857386125e-06, - "loss": 0.6012, + "loss": 0.0234, "step": 9550 }, { "epoch": 36.01256704980843, - "grad_norm": 0.1948493868112564, + "grad_norm": 1.7508395910263062, "learning_rate": 2.971477224350788e-06, - "loss": 0.5894, + "loss": 0.2927, "step": 9560 }, { "epoch": 36.013333333333335, - "grad_norm": 0.0010413166601210833, + "grad_norm": 0.000883891130797565, "learning_rate": 2.962962962962963e-06, - "loss": 0.2734, + "loss": 0.0002, "step": 9570 }, { "epoch": 36.01409961685824, - "grad_norm": 0.002227040473371744, + "grad_norm": 0.0010294230887666345, "learning_rate": 2.9544487015751387e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 9580 }, { "epoch": 36.01486590038314, - "grad_norm": 0.0006657686899416149, + "grad_norm": 0.0005822263192385435, "learning_rate": 2.945934440187314e-06, - "loss": 0.0005, + "loss": 0.0842, "step": 9590 }, { "epoch": 36.015632183908046, - "grad_norm": 0.02568192034959793, + "grad_norm": 0.003924645483493805, "learning_rate": 2.9374201787994896e-06, - "loss": 1.654, + "loss": 0.0203, "step": 9600 }, { "epoch": 36.01639846743295, - "grad_norm": 0.01002002228051424, + "grad_norm": 0.002536638407036662, "learning_rate": 2.9289059174116646e-06, - "loss": 0.613, + "loss": 1.1536, "step": 9610 }, { "epoch": 36.01716475095785, - "grad_norm": 0.003658324247226119, + "grad_norm": 0.0047698477283120155, "learning_rate": 2.92039165602384e-06, - "loss": 0.5829, + "loss": 0.1412, "step": 9620 }, { "epoch": 36.01793103448276, - "grad_norm": 0.009807663038372993, + "grad_norm": 826.2704467773438, "learning_rate": 2.911877394636016e-06, - "loss": 0.6286, + "loss": 0.9301, "step": 9630 }, { "epoch": 36.01869731800766, - "grad_norm": 0.03617067262530327, + "grad_norm": 0.038875650614500046, "learning_rate": 2.903363133248191e-06, - "loss": 0.096, + "loss": 0.0114, "step": 9640 }, { "epoch": 36.019463601532564, - "grad_norm": 0.0008152202935889363, + "grad_norm": 0.0006504143821075559, "learning_rate": 2.8948488718603662e-06, - "loss": 0.0004, + "loss": 0.0007, "step": 9650 }, { "epoch": 36.02, - "eval_accuracy": 0.6666666666666666, - "eval_loss": 2.4585556983947754, - "eval_runtime": 17.8463, - "eval_samples_per_second": 2.522, - "eval_steps_per_second": 2.522, + "eval_accuracy": 0.6888888888888889, + "eval_loss": 2.4709599018096924, + "eval_runtime": 15.4949, + "eval_samples_per_second": 2.904, + "eval_steps_per_second": 2.904, "step": 9657 }, { "epoch": 37.00022988505747, - "grad_norm": 200.47195434570312, + "grad_norm": 0.2688674032688141, "learning_rate": 2.8863346104725417e-06, - "loss": 1.3222, + "loss": 0.8514, "step": 9660 }, { "epoch": 37.000996168582375, - "grad_norm": 0.0013116474729031324, + "grad_norm": 0.0010026432573795319, "learning_rate": 2.8778203490847175e-06, - "loss": 0.0149, + "loss": 0.0001, "step": 9670 }, { "epoch": 37.00176245210728, - "grad_norm": 0.0014711101539433002, + "grad_norm": 0.0008987862966023386, "learning_rate": 2.8693060876968925e-06, - "loss": 0.0007, + "loss": 0.8497, "step": 9680 }, { "epoch": 37.00252873563218, - "grad_norm": 0.0025564427487552166, + "grad_norm": 0.000963296159170568, "learning_rate": 2.860791826309068e-06, - "loss": 0.001, + "loss": 0.0001, "step": 9690 }, { "epoch": 37.003295019157086, - "grad_norm": 0.008313476108014584, + "grad_norm": 0.005000424105674028, "learning_rate": 2.852277564921243e-06, - "loss": 0.1117, + "loss": 0.0115, "step": 9700 }, { "epoch": 37.00406130268199, - "grad_norm": 0.0024045847821980715, + "grad_norm": 0.0008265956421382725, "learning_rate": 2.8437633035334187e-06, - "loss": 0.6268, + "loss": 0.0004, "step": 9710 }, { "epoch": 37.00482758620689, - "grad_norm": 0.0008887048461474478, + "grad_norm": 0.0005889981403015554, "learning_rate": 2.835249042145594e-06, - "loss": 0.7695, + "loss": 0.8605, "step": 9720 }, { "epoch": 37.005593869731804, - "grad_norm": 0.0014032486360520124, + "grad_norm": 0.002040229272097349, "learning_rate": 2.8267347807577696e-06, - "loss": 0.001, + "loss": 0.0003, "step": 9730 }, { "epoch": 37.00636015325671, - "grad_norm": 0.009179745800793171, + "grad_norm": 0.03565753996372223, "learning_rate": 2.8182205193699446e-06, - "loss": 0.0339, + "loss": 0.6127, "step": 9740 }, { "epoch": 37.00712643678161, - "grad_norm": 0.03631802275776863, + "grad_norm": 0.08163590729236603, "learning_rate": 2.8097062579821204e-06, "loss": 0.0002, "step": 9750 }, { "epoch": 37.007892720306515, - "grad_norm": 0.009984351694583893, + "grad_norm": 0.03897847607731819, "learning_rate": 2.801191996594296e-06, - "loss": 0.0004, + "loss": 0.0002, "step": 9760 }, { "epoch": 37.00865900383142, - "grad_norm": 0.001846793806180358, + "grad_norm": 0.0021099550649523735, "learning_rate": 2.792677735206471e-06, - "loss": 0.0005, + "loss": 0.0003, "step": 9770 }, { "epoch": 37.00942528735632, - "grad_norm": 0.09551486372947693, + "grad_norm": 0.023031258955597878, "learning_rate": 2.7841634738186462e-06, - "loss": 0.0003, + "loss": 0.0002, "step": 9780 }, { "epoch": 37.010191570881226, - "grad_norm": 37.85218811035156, + "grad_norm": 0.03526505082845688, "learning_rate": 2.775649212430822e-06, - "loss": 0.16, + "loss": 0.5452, "step": 9790 }, { "epoch": 37.01095785440613, - "grad_norm": 0.041355930268764496, + "grad_norm": 0.01183415949344635, "learning_rate": 2.7671349510429975e-06, - "loss": 0.7911, + "loss": 0.0004, "step": 9800 }, { "epoch": 37.01172413793103, - "grad_norm": 0.0038186986930668354, + "grad_norm": 0.0010402931366115808, "learning_rate": 2.7586206896551725e-06, - "loss": 0.0002, + "loss": 0.7477, "step": 9810 }, { "epoch": 37.01249042145594, - "grad_norm": 0.0010613175109028816, + "grad_norm": 0.0010816905414685607, "learning_rate": 2.750106428267348e-06, - "loss": 0.768, + "loss": 0.745, "step": 9820 }, { "epoch": 37.01325670498084, - "grad_norm": 0.10113216936588287, + "grad_norm": 0.05686232075095177, "learning_rate": 2.7415921668795238e-06, - "loss": 0.0008, + "loss": 0.0281, "step": 9830 }, { "epoch": 37.014022988505744, - "grad_norm": 0.002089802408590913, + "grad_norm": 0.001344469259493053, "learning_rate": 2.7330779054916987e-06, - "loss": 0.871, + "loss": 0.6785, "step": 9840 }, { "epoch": 37.014789272030654, - "grad_norm": 0.02318667247891426, + "grad_norm": 0.10435260832309723, "learning_rate": 2.724563644103874e-06, - "loss": 0.0015, + "loss": 0.0003, "step": 9850 }, { "epoch": 37.01555555555556, - "grad_norm": 0.017728768289089203, + "grad_norm": 0.02326204441487789, "learning_rate": 2.7160493827160496e-06, "loss": 0.0002, "step": 9860 }, { "epoch": 37.01632183908046, - "grad_norm": 0.0325637087225914, + "grad_norm": 11.89132022857666, "learning_rate": 2.7075351213282254e-06, - "loss": 0.0006, + "loss": 0.6928, "step": 9870 }, { "epoch": 37.017088122605365, - "grad_norm": 0.0021824336145073175, + "grad_norm": 0.0017847528215497732, "learning_rate": 2.6990208599404004e-06, - "loss": 1.0912, + "loss": 0.2962, "step": 9880 }, { "epoch": 37.01785440613027, - "grad_norm": 0.056245166808366776, + "grad_norm": 0.06890422850847244, "learning_rate": 2.690506598552576e-06, - "loss": 0.0004, + "loss": 0.0003, "step": 9890 }, { "epoch": 37.01862068965517, - "grad_norm": 0.0116423349827528, + "grad_norm": 0.005729100201278925, "learning_rate": 2.6819923371647512e-06, - "loss": 0.7028, + "loss": 0.6449, "step": 9900 }, { "epoch": 37.019386973180076, - "grad_norm": 0.001408185693435371, + "grad_norm": 0.0009230448049493134, "learning_rate": 2.6734780757769262e-06, - "loss": 0.0022, + "loss": 0.0006, "step": 9910 }, { "epoch": 37.02, - "eval_accuracy": 0.7111111111111111, - "eval_loss": 1.9075928926467896, - "eval_runtime": 16.6503, - "eval_samples_per_second": 2.703, - "eval_steps_per_second": 2.703, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 2.1948297023773193, + "eval_runtime": 15.4399, + "eval_samples_per_second": 2.915, + "eval_steps_per_second": 2.915, "step": 9918 }, { "epoch": 38.000153256704984, - "grad_norm": 0.011020908132195473, + "grad_norm": 0.04153349995613098, "learning_rate": 2.664963814389102e-06, - "loss": 0.5495, + "loss": 0.0007, "step": 9920 }, { "epoch": 38.00091954022989, - "grad_norm": 0.03686055913567543, + "grad_norm": 0.014882294461131096, "learning_rate": 2.6564495530012775e-06, - "loss": 1.3663, + "loss": 0.0005, "step": 9930 }, { "epoch": 38.00168582375479, - "grad_norm": 0.1972457617521286, + "grad_norm": 0.016202867031097412, "learning_rate": 2.6479352916134525e-06, - "loss": 0.0003, + "loss": 0.0008, "step": 9940 }, { "epoch": 38.002452107279694, - "grad_norm": 0.01467108353972435, + "grad_norm": 0.01640430837869644, "learning_rate": 2.639421030225628e-06, - "loss": 0.9214, + "loss": 0.3814, "step": 9950 }, { "epoch": 38.0032183908046, - "grad_norm": 0.039561737328767776, + "grad_norm": 0.10321377962827682, "learning_rate": 2.6309067688378037e-06, - "loss": 0.6999, + "loss": 0.0004, "step": 9960 }, { "epoch": 38.0039846743295, - "grad_norm": 0.01594482734799385, + "grad_norm": 0.007109183818101883, "learning_rate": 2.622392507449979e-06, - "loss": 0.0007, + "loss": 0.0002, "step": 9970 }, { "epoch": 38.004750957854405, - "grad_norm": 0.06000465899705887, + "grad_norm": 0.004593124147504568, "learning_rate": 2.613878246062154e-06, - "loss": 0.0009, + "loss": 0.8623, "step": 9980 }, { "epoch": 38.00551724137931, - "grad_norm": 0.0008072266937233508, + "grad_norm": 0.000765863573178649, "learning_rate": 2.6053639846743296e-06, - "loss": 1.2176, + "loss": 0.0035, "step": 9990 }, { "epoch": 38.00628352490421, - "grad_norm": 0.003703795373439789, + "grad_norm": 0.0017961942357942462, "learning_rate": 2.5968497232865054e-06, - "loss": 0.1955, + "loss": 0.0291, "step": 10000 }, { "epoch": 38.007049808429116, - "grad_norm": 0.0016375769628211856, + "grad_norm": 0.001367215532809496, "learning_rate": 2.5883354618986804e-06, - "loss": 0.0004, + "loss": 0.0005, "step": 10010 }, { "epoch": 38.00781609195402, - "grad_norm": 2236.270751953125, + "grad_norm": 85.33036804199219, "learning_rate": 2.579821200510856e-06, - "loss": 0.2737, + "loss": 0.9284, "step": 10020 }, { "epoch": 38.00858237547893, - "grad_norm": 0.0018821221310645342, + "grad_norm": 0.0018648827681317925, "learning_rate": 2.5713069391230312e-06, - "loss": 0.0013, + "loss": 0.3321, "step": 10030 }, { "epoch": 38.009348659003834, - "grad_norm": 0.007677056826651096, + "grad_norm": 0.0035153606440871954, "learning_rate": 2.562792677735207e-06, - "loss": 0.0006, + "loss": 0.0002, "step": 10040 }, { "epoch": 38.01011494252874, - "grad_norm": 0.0023062315303832293, + "grad_norm": 0.007890434004366398, "learning_rate": 2.554278416347382e-06, - "loss": 0.0001, + "loss": 0.0002, "step": 10050 }, { "epoch": 38.01088122605364, - "grad_norm": 0.16336049139499664, + "grad_norm": 0.0384494923055172, "learning_rate": 2.5457641549595575e-06, - "loss": 0.0005, + "loss": 0.0003, "step": 10060 }, { "epoch": 38.011647509578545, - "grad_norm": 0.1091916635632515, + "grad_norm": 0.042203500866889954, "learning_rate": 2.5372498935717325e-06, - "loss": 0.0008, + "loss": 0.0002, "step": 10070 }, { "epoch": 38.01241379310345, - "grad_norm": 0.07184675335884094, + "grad_norm": 0.03912093862891197, "learning_rate": 2.5287356321839083e-06, - "loss": 0.0003, + "loss": 0.0001, "step": 10080 }, { "epoch": 38.01318007662835, - "grad_norm": 0.02561718225479126, + "grad_norm": 0.00810239091515541, "learning_rate": 2.5202213707960837e-06, - "loss": 0.0204, + "loss": 0.0002, "step": 10090 }, { "epoch": 38.013946360153255, - "grad_norm": 0.0021183625794947147, + "grad_norm": 0.0013092505978420377, "learning_rate": 2.511707109408259e-06, - "loss": 0.3358, + "loss": 0.0009, "step": 10100 }, { "epoch": 38.01471264367816, - "grad_norm": 0.002804600400850177, + "grad_norm": 0.009226065129041672, "learning_rate": 2.503192848020434e-06, - "loss": 0.0006, + "loss": 0.0002, "step": 10110 }, { "epoch": 38.01547892720306, - "grad_norm": 0.004418718628585339, + "grad_norm": 0.003829770255833864, "learning_rate": 2.49467858663261e-06, - "loss": 0.0004, + "loss": 0.0002, "step": 10120 }, { "epoch": 38.016245210727966, - "grad_norm": 0.013804151676595211, + "grad_norm": 0.08662468194961548, "learning_rate": 2.4861643252447854e-06, - "loss": 0.2445, + "loss": 0.0006, "step": 10130 }, { "epoch": 38.01701149425288, - "grad_norm": 29.872575759887695, + "grad_norm": 0.018477967008948326, "learning_rate": 2.4776500638569604e-06, - "loss": 0.4659, + "loss": 0.0001, "step": 10140 }, { "epoch": 38.01777777777778, - "grad_norm": 0.0012199718039482832, + "grad_norm": 0.001064899144694209, "learning_rate": 2.469135802469136e-06, - "loss": 0.0002, + "loss": 0.6477, "step": 10150 }, { "epoch": 38.018544061302684, - "grad_norm": 0.0008654805715195835, + "grad_norm": 0.0012717880308628082, "learning_rate": 2.4606215410813112e-06, - "loss": 0.6034, + "loss": 0.0002, "step": 10160 }, { "epoch": 38.01931034482759, - "grad_norm": 0.00714363157749176, + "grad_norm": 0.019613996148109436, "learning_rate": 2.4521072796934867e-06, - "loss": 0.0006, + "loss": 0.0001, "step": 10170 }, { "epoch": 38.02, - "eval_accuracy": 0.6888888888888889, - "eval_loss": 2.016727924346924, - "eval_runtime": 16.5617, - "eval_samples_per_second": 2.717, - "eval_steps_per_second": 2.717, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 2.455037832260132, + "eval_runtime": 15.46, + "eval_samples_per_second": 2.911, + "eval_steps_per_second": 2.911, "step": 10179 }, { "epoch": 39.00007662835249, - "grad_norm": 0.01116449199616909, + "grad_norm": 0.0035114935599267483, "learning_rate": 2.443593018305662e-06, - "loss": 0.0001, + "loss": 0.0, "step": 10180 }, { "epoch": 39.00084291187739, - "grad_norm": 0.0020209902431815863, + "grad_norm": 0.006250754930078983, "learning_rate": 2.4350787569178375e-06, - "loss": 0.001, + "loss": 0.0002, "step": 10190 }, { "epoch": 39.001609195402295, - "grad_norm": 0.016085512936115265, + "grad_norm": 0.00705630611628294, "learning_rate": 2.426564495530013e-06, - "loss": 0.3644, + "loss": 0.3949, "step": 10200 }, { "epoch": 39.002375478927206, - "grad_norm": 0.001203613937832415, + "grad_norm": 0.0009550215909257531, "learning_rate": 2.4180502341421883e-06, - "loss": 0.5964, + "loss": 0.0001, "step": 10210 }, { "epoch": 39.00314176245211, - "grad_norm": 0.0018107658252120018, + "grad_norm": 0.002219496527686715, "learning_rate": 2.4095359727543637e-06, - "loss": 0.0932, + "loss": 0.0001, "step": 10220 }, { "epoch": 39.00390804597701, - "grad_norm": 2055.353271484375, + "grad_norm": 17.295181274414062, "learning_rate": 2.401021711366539e-06, - "loss": 0.2535, + "loss": 0.002, "step": 10230 }, { "epoch": 39.00467432950192, - "grad_norm": 0.002151814289391041, + "grad_norm": 0.008441106416285038, "learning_rate": 2.3925074499787146e-06, "loss": 0.0001, "step": 10240 }, { "epoch": 39.00544061302682, - "grad_norm": 0.0017315262230113149, + "grad_norm": 0.0014597218250855803, "learning_rate": 2.38399318859089e-06, - "loss": 0.0012, + "loss": 0.0302, "step": 10250 }, { "epoch": 39.006206896551724, - "grad_norm": 183.34739685058594, + "grad_norm": 75.42328643798828, "learning_rate": 2.3754789272030654e-06, - "loss": 0.7811, + "loss": 0.9088, "step": 10260 }, { "epoch": 39.00697318007663, - "grad_norm": 0.20508258044719696, + "grad_norm": 0.2702087163925171, "learning_rate": 2.366964665815241e-06, - "loss": 0.7933, + "loss": 0.8998, "step": 10270 }, { "epoch": 39.00773946360153, - "grad_norm": 0.3998939096927643, + "grad_norm": 1109.0517578125, "learning_rate": 2.3584504044274162e-06, - "loss": 0.0009, + "loss": 0.1558, "step": 10280 }, { "epoch": 39.008505747126435, - "grad_norm": 418.20013427734375, + "grad_norm": 6.476412773132324, "learning_rate": 2.3499361430395912e-06, - "loss": 0.3733, + "loss": 0.2076, "step": 10290 }, { "epoch": 39.00927203065134, - "grad_norm": 0.16842807829380035, + "grad_norm": 0.009519490413367748, "learning_rate": 2.341421881651767e-06, - "loss": 0.0004, + "loss": 0.0001, "step": 10300 }, { "epoch": 39.01003831417624, - "grad_norm": 0.0012611760757863522, + "grad_norm": 0.0008983574225567281, "learning_rate": 2.332907620263942e-06, - "loss": 0.6382, + "loss": 0.0009, "step": 10310 }, { "epoch": 39.01080459770115, - "grad_norm": 0.001306567108258605, + "grad_norm": 0.0010324233444407582, "learning_rate": 2.324393358876118e-06, - "loss": 0.5825, + "loss": 0.0002, "step": 10320 }, { "epoch": 39.011570881226056, - "grad_norm": 46.51072692871094, + "grad_norm": 0.31206342577934265, "learning_rate": 2.315879097488293e-06, - "loss": 1.2026, + "loss": 0.0002, "step": 10330 }, { "epoch": 39.01233716475096, - "grad_norm": 0.002485692733898759, + "grad_norm": 0.0030093074310570955, "learning_rate": 2.3073648361004688e-06, - "loss": 0.001, + "loss": 0.0002, "step": 10340 }, { "epoch": 39.013103448275864, - "grad_norm": 137.24148559570312, + "grad_norm": 0.050876811146736145, "learning_rate": 2.2988505747126437e-06, - "loss": 0.4545, + "loss": 0.0001, "step": 10350 }, { "epoch": 39.01386973180077, - "grad_norm": 0.04249175637960434, + "grad_norm": 0.035219959914684296, "learning_rate": 2.290336313324819e-06, - "loss": 0.0006, + "loss": 0.0001, "step": 10360 }, { "epoch": 39.01463601532567, - "grad_norm": 0.0036637112498283386, + "grad_norm": 0.010045896284282207, "learning_rate": 2.2818220519369946e-06, - "loss": 0.1212, + "loss": 0.0002, "step": 10370 }, { "epoch": 39.015402298850574, - "grad_norm": 0.002580595901235938, + "grad_norm": 0.0030850691255182028, "learning_rate": 2.27330779054917e-06, - "loss": 0.0011, + "loss": 0.0001, "step": 10380 }, { "epoch": 39.01616858237548, - "grad_norm": 0.0033924446906894445, + "grad_norm": 0.0019529566634446383, "learning_rate": 2.2647935291613454e-06, - "loss": 0.0012, + "loss": 0.0002, "step": 10390 }, { "epoch": 39.01693486590038, - "grad_norm": 0.04098842293024063, + "grad_norm": 0.007099506910890341, "learning_rate": 2.256279267773521e-06, - "loss": 0.0004, + "loss": 0.0001, "step": 10400 }, { "epoch": 39.017701149425285, - "grad_norm": 0.0013437381712719798, + "grad_norm": 0.00163712736684829, "learning_rate": 2.2477650063856962e-06, - "loss": 0.0536, + "loss": 0.0001, "step": 10410 }, { "epoch": 39.01846743295019, - "grad_norm": 0.07073049992322922, + "grad_norm": 0.0032862075604498386, "learning_rate": 2.2392507449978717e-06, - "loss": 1.2115, + "loss": 0.0001, "step": 10420 }, { "epoch": 39.01923371647509, - "grad_norm": 0.001191325020045042, + "grad_norm": 0.0006715281633660197, "learning_rate": 2.230736483610047e-06, - "loss": 0.3743, + "loss": 0.0003, "step": 10430 }, { "epoch": 39.02, - "grad_norm": 0.0008125228341668844, + "grad_norm": 0.0007948668790049851, "learning_rate": 2.222222222222222e-06, - "loss": 0.0065, + "loss": 0.3026, "step": 10440 }, { "epoch": 39.02, - "eval_accuracy": 0.6666666666666666, - "eval_loss": 2.2368969917297363, - "eval_runtime": 17.413, - "eval_samples_per_second": 2.584, - "eval_steps_per_second": 2.584, + "eval_accuracy": 0.7333333333333333, + "eval_loss": 2.123791217803955, + "eval_runtime": 15.4642, + "eval_samples_per_second": 2.91, + "eval_steps_per_second": 2.91, "step": 10440 }, { "epoch": 40.000766283524904, - "grad_norm": 0.03135787695646286, + "grad_norm": 0.019171781837940216, "learning_rate": 2.213707960834398e-06, - "loss": 0.0006, + "loss": 0.086, "step": 10450 }, { "epoch": 40.00153256704981, - "grad_norm": 0.002667344408109784, + "grad_norm": 0.0036889431066811085, "learning_rate": 2.205193699446573e-06, - "loss": 0.0006, + "loss": 0.0001, "step": 10460 }, { "epoch": 40.00229885057471, - "grad_norm": 0.0018900452414527535, + "grad_norm": 0.0026837738696485758, "learning_rate": 2.1966794380587487e-06, - "loss": 0.5149, + "loss": 0.8073, "step": 10470 }, { "epoch": 40.003065134099614, - "grad_norm": 0.0032193083316087723, + "grad_norm": 0.0014597310218960047, "learning_rate": 2.1881651766709237e-06, - "loss": 0.0335, + "loss": 0.6343, "step": 10480 }, { "epoch": 40.00383141762452, - "grad_norm": 13.062385559082031, + "grad_norm": 0.0073768338188529015, "learning_rate": 2.1796509152830996e-06, - "loss": 0.0013, + "loss": 0.0011, "step": 10490 }, { "epoch": 40.00459770114943, - "grad_norm": 0.0008159828721545637, + "grad_norm": 0.001270446227863431, "learning_rate": 2.1711366538952746e-06, - "loss": 0.0005, + "loss": 0.1745, "step": 10500 }, { "epoch": 40.00536398467433, - "grad_norm": 0.0534515343606472, + "grad_norm": 0.030193252488970757, "learning_rate": 2.1626223925074504e-06, - "loss": 0.6755, + "loss": 0.7578, "step": 10510 }, { "epoch": 40.006130268199236, - "grad_norm": 0.015131569467484951, + "grad_norm": 0.006081745959818363, "learning_rate": 2.1541081311196254e-06, - "loss": 0.0101, + "loss": 0.0002, "step": 10520 }, { "epoch": 40.00689655172414, - "grad_norm": 0.002069084206596017, + "grad_norm": 0.0020429452415555716, "learning_rate": 2.145593869731801e-06, - "loss": 0.0022, + "loss": 0.3721, "step": 10530 }, { "epoch": 40.00766283524904, - "grad_norm": 0.019438011571764946, + "grad_norm": 0.008285444229841232, "learning_rate": 2.1370796083439762e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 10540 }, { "epoch": 40.00842911877395, - "grad_norm": 0.03702215477824211, + "grad_norm": 0.009783162735402584, "learning_rate": 2.1285653469561517e-06, - "loss": 0.0006, + "loss": 0.0003, "step": 10550 }, { "epoch": 40.00919540229885, - "grad_norm": 0.0005562976002693176, + "grad_norm": 0.0005393492174334824, "learning_rate": 2.120051085568327e-06, - "loss": 0.0243, + "loss": 0.0002, "step": 10560 }, { "epoch": 40.009961685823754, - "grad_norm": 0.015588171780109406, + "grad_norm": 12.202210426330566, "learning_rate": 2.1115368241805025e-06, - "loss": 0.7303, + "loss": 1.0265, "step": 10570 }, { "epoch": 40.01072796934866, - "grad_norm": 0.0017444725381210446, + "grad_norm": 0.0012344133574515581, "learning_rate": 2.103022562792678e-06, - "loss": 0.9014, + "loss": 0.646, "step": 10580 }, { "epoch": 40.01149425287356, - "grad_norm": 0.0032911712769418955, + "grad_norm": 0.008871200494468212, "learning_rate": 2.0945083014048533e-06, - "loss": 0.3793, + "loss": 0.0001, "step": 10590 }, { "epoch": 40.012260536398465, - "grad_norm": 0.0006082048639655113, + "grad_norm": 0.0005161252338439226, "learning_rate": 2.0859940400170287e-06, "loss": 0.0001, "step": 10600 }, { "epoch": 40.01302681992337, - "grad_norm": 0.05320386961102486, + "grad_norm": 0.5706034302711487, "learning_rate": 2.077479778629204e-06, - "loss": 0.3931, + "loss": 0.9366, "step": 10610 }, { "epoch": 40.01379310344828, - "grad_norm": 0.0004442330973688513, + "grad_norm": 0.000537372543476522, "learning_rate": 2.0689655172413796e-06, - "loss": 0.5867, + "loss": 0.0001, "step": 10620 }, { "epoch": 40.01455938697318, - "grad_norm": 0.02673637866973877, + "grad_norm": 276.76025390625, "learning_rate": 2.060451255853555e-06, - "loss": 0.0007, + "loss": 1.5447, "step": 10630 }, { "epoch": 40.015325670498086, - "grad_norm": 0.0006200252682901919, + "grad_norm": 0.0017163667362183332, "learning_rate": 2.0519369944657304e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 10640 }, { "epoch": 40.01609195402299, - "grad_norm": 0.0006501683965325356, + "grad_norm": 0.0017713200068101287, "learning_rate": 2.043422733077906e-06, - "loss": 0.0012, + "loss": 0.0008, "step": 10650 }, { "epoch": 40.01685823754789, - "grad_norm": 0.00576308136805892, + "grad_norm": 0.03505133464932442, "learning_rate": 2.0349084716900813e-06, - "loss": 0.0003, + "loss": 0.0002, "step": 10660 }, { "epoch": 40.0176245210728, - "grad_norm": 0.0021502794697880745, + "grad_norm": 0.011485176160931587, "learning_rate": 2.0263942103022567e-06, - "loss": 0.3379, + "loss": 0.1483, "step": 10670 }, { "epoch": 40.0183908045977, - "grad_norm": 0.003732713870704174, + "grad_norm": 0.02830810472369194, "learning_rate": 2.0178799489144317e-06, - "loss": 0.0068, + "loss": 0.0026, "step": 10680 }, { "epoch": 40.019157088122604, - "grad_norm": 0.0006824551965110004, + "grad_norm": 0.002873759949579835, "learning_rate": 2.009365687526607e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 10690 }, { "epoch": 40.01992337164751, - "grad_norm": 0.0008392931777052581, + "grad_norm": 0.0009510298259556293, "learning_rate": 2.0008514261387825e-06, "loss": 0.0002, "step": 10700 }, { "epoch": 40.02, - "eval_accuracy": 0.6444444444444445, - "eval_loss": 2.4551186561584473, - "eval_runtime": 16.8531, - "eval_samples_per_second": 2.67, - "eval_steps_per_second": 2.67, + "eval_accuracy": 0.6222222222222222, + "eval_loss": 2.3737952709198, + "eval_runtime": 16.8955, + "eval_samples_per_second": 2.663, + "eval_steps_per_second": 2.663, "step": 10701 }, { "epoch": 41.000689655172415, - "grad_norm": 0.024441132321953773, + "grad_norm": 0.009723528288304806, "learning_rate": 1.992337164750958e-06, "loss": 0.0002, "step": 10710 }, { "epoch": 41.00145593869732, - "grad_norm": 0.0009095760178752244, + "grad_norm": 0.004431157372891903, "learning_rate": 1.9838229033631333e-06, - "loss": 0.0016, + "loss": 0.7002, "step": 10720 }, { "epoch": 41.00222222222222, - "grad_norm": 0.07211485505104065, + "grad_norm": 0.08255797624588013, "learning_rate": 1.9753086419753087e-06, - "loss": 0.0003, + "loss": 0.0012, "step": 10730 }, { "epoch": 41.002988505747126, - "grad_norm": 0.0007702154689468443, + "grad_norm": 0.0013805850176140666, "learning_rate": 1.966794380587484e-06, - "loss": 0.0003, + "loss": 0.005, "step": 10740 }, { "epoch": 41.00375478927203, - "grad_norm": 0.015284047462046146, + "grad_norm": 0.03276257589459419, "learning_rate": 1.9582801191996596e-06, - "loss": 0.1773, + "loss": 0.0673, "step": 10750 }, { "epoch": 41.00452107279693, - "grad_norm": 0.0014508262975141406, + "grad_norm": 0.00983472727239132, "learning_rate": 1.949765857811835e-06, - "loss": 0.0003, + "loss": 0.001, "step": 10760 }, { "epoch": 41.00528735632184, - "grad_norm": 0.05831890180706978, + "grad_norm": 0.029675589874386787, "learning_rate": 1.9412515964240104e-06, - "loss": 0.0135, + "loss": 0.1886, "step": 10770 }, { "epoch": 41.00605363984674, - "grad_norm": 31.497249603271484, + "grad_norm": 0.008304531686007977, "learning_rate": 1.932737335036186e-06, - "loss": 0.0024, + "loss": 0.0001, "step": 10780 }, { "epoch": 41.006819923371644, - "grad_norm": 0.002052170457318425, + "grad_norm": 0.004280910827219486, "learning_rate": 1.9242230736483612e-06, "loss": 0.0005, "step": 10790 }, { "epoch": 41.007586206896555, - "grad_norm": 0.004367075860500336, + "grad_norm": 0.002753842156380415, "learning_rate": 1.9157088122605367e-06, "loss": 0.0002, "step": 10800 }, { "epoch": 41.00835249042146, - "grad_norm": 0.008073161356151104, + "grad_norm": 0.002911026356741786, "learning_rate": 1.9071945508727119e-06, - "loss": 0.0002, + "loss": 0.7337, "step": 10810 }, { "epoch": 41.00911877394636, - "grad_norm": 0.004105108790099621, + "grad_norm": 0.023087352514266968, "learning_rate": 1.8986802894848875e-06, - "loss": 0.0003, + "loss": 0.7313, "step": 10820 }, { "epoch": 41.009885057471266, - "grad_norm": 0.0014760576887056231, + "grad_norm": 0.0058591291308403015, "learning_rate": 1.8901660280970627e-06, - "loss": 0.0, + "loss": 0.0001, "step": 10830 }, { "epoch": 41.01065134099617, - "grad_norm": 0.0004776821588166058, + "grad_norm": 0.0008538307738490403, "learning_rate": 1.8816517667092381e-06, - "loss": 0.0001, + "loss": 0.0002, "step": 10840 }, { "epoch": 41.01141762452107, - "grad_norm": 0.0006523674819618464, + "grad_norm": 0.0007184819551184773, "learning_rate": 1.8731375053214135e-06, - "loss": 0.0002, + "loss": 0.0005, "step": 10850 }, { "epoch": 41.01218390804598, - "grad_norm": 0.027973128482699394, + "grad_norm": 0.02994411811232567, "learning_rate": 1.864623243933589e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 10860 }, { "epoch": 41.01295019157088, - "grad_norm": 0.014188108034431934, + "grad_norm": 0.0067585003562271595, "learning_rate": 1.8561089825457644e-06, - "loss": 0.0708, + "loss": 0.2346, "step": 10870 }, { "epoch": 41.013716475095784, - "grad_norm": 0.008099140599370003, + "grad_norm": 0.032415423542261124, "learning_rate": 1.8475947211579398e-06, "loss": 0.0001, "step": 10880 }, { "epoch": 41.01448275862069, - "grad_norm": 0.0016017964808270335, + "grad_norm": 0.0020037516951560974, "learning_rate": 1.839080459770115e-06, "loss": 0.0001, "step": 10890 }, { "epoch": 41.01524904214559, - "grad_norm": 0.0033067059703171253, + "grad_norm": 0.035809263586997986, "learning_rate": 1.8305661983822906e-06, - "loss": 0.9468, + "loss": 0.1834, "step": 10900 }, { "epoch": 41.0160153256705, - "grad_norm": 0.0012311979662626982, + "grad_norm": 0.001537501229904592, "learning_rate": 1.8220519369944658e-06, - "loss": 0.8283, + "loss": 0.0001, "step": 10910 }, { "epoch": 41.016781609195405, - "grad_norm": 0.027496883645653725, + "grad_norm": 0.007630740292370319, "learning_rate": 1.8135376756066415e-06, - "loss": 0.0007, + "loss": 0.0002, "step": 10920 }, { "epoch": 41.01754789272031, - "grad_norm": 0.011013705283403397, + "grad_norm": 0.13188746571540833, "learning_rate": 1.8050234142188167e-06, - "loss": 0.3909, + "loss": 0.0003, "step": 10930 }, { "epoch": 41.01831417624521, - "grad_norm": 0.01968281716108322, + "grad_norm": 0.004163428675383329, "learning_rate": 1.7965091528309919e-06, - "loss": 0.0003, + "loss": 0.3379, "step": 10940 }, { "epoch": 41.019080459770116, - "grad_norm": 0.06489856541156769, + "grad_norm": 0.13015137612819672, "learning_rate": 1.7879948914431675e-06, "loss": 0.0002, "step": 10950 }, { "epoch": 41.01984674329502, - "grad_norm": 0.0006830870406702161, + "grad_norm": 0.0007329225190915167, "learning_rate": 1.7794806300553427e-06, - "loss": 1.6484, + "loss": 0.7121, "step": 10960 }, { "epoch": 41.02, - "eval_accuracy": 0.6444444444444445, - "eval_loss": 2.6842098236083984, - "eval_runtime": 16.7694, - "eval_samples_per_second": 2.683, - "eval_steps_per_second": 2.683, + "eval_accuracy": 0.6, + "eval_loss": 2.800002098083496, + "eval_runtime": 17.1343, + "eval_samples_per_second": 2.626, + "eval_steps_per_second": 2.626, "step": 10962 }, { "epoch": 42.00061302681992, - "grad_norm": 0.003737035673111677, + "grad_norm": 0.0018277441849932075, "learning_rate": 1.7709663686675183e-06, - "loss": 0.7984, + "loss": 0.7695, "step": 10970 }, { "epoch": 42.00137931034483, - "grad_norm": 0.006563944276422262, + "grad_norm": 0.0021305852569639683, "learning_rate": 1.7624521072796935e-06, - "loss": 2.0839, + "loss": 0.0003, "step": 10980 }, { "epoch": 42.002145593869734, - "grad_norm": 0.0021956306882202625, + "grad_norm": 0.0008735191659070551, "learning_rate": 1.753937845891869e-06, - "loss": 0.0003, + "loss": 0.0004, "step": 10990 }, { "epoch": 42.00291187739464, - "grad_norm": 0.0005605145706795156, + "grad_norm": 0.0005789535935036838, "learning_rate": 1.7454235845040444e-06, - "loss": 0.2977, + "loss": 0.0002, "step": 11000 }, { "epoch": 42.00367816091954, - "grad_norm": 0.004420015960931778, + "grad_norm": 0.007676248904317617, "learning_rate": 1.7369093231162198e-06, - "loss": 0.0003, + "loss": 0.0002, "step": 11010 }, { "epoch": 42.004444444444445, - "grad_norm": 0.0011342117795720696, + "grad_norm": 0.0009522740729153156, "learning_rate": 1.7283950617283952e-06, - "loss": 0.9465, + "loss": 0.0005, "step": 11020 }, { "epoch": 42.00521072796935, - "grad_norm": 0.0026795840822160244, + "grad_norm": 0.004853380378335714, "learning_rate": 1.7198808003405706e-06, - "loss": 0.7014, + "loss": 0.0001, "step": 11030 }, { "epoch": 42.00597701149425, - "grad_norm": 0.03267119452357292, + "grad_norm": 0.008901674300432205, "learning_rate": 1.7113665389527458e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 11040 }, { "epoch": 42.006743295019156, - "grad_norm": 0.07558189332485199, + "grad_norm": 0.03224329277873039, "learning_rate": 1.7028522775649215e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 11050 }, { "epoch": 42.00750957854406, - "grad_norm": 0.0011101922718808055, + "grad_norm": 0.0010887272655963898, "learning_rate": 1.6943380161770967e-06, - "loss": 0.0002, + "loss": 0.0004, "step": 11060 }, { "epoch": 42.00827586206896, - "grad_norm": 0.001017472124658525, + "grad_norm": 0.0007746301707811654, "learning_rate": 1.6858237547892723e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 11070 }, { "epoch": 42.00904214559387, - "grad_norm": 0.0012388168834149837, + "grad_norm": 0.0011528526665642858, "learning_rate": 1.6773094934014475e-06, - "loss": 0.0002, + "loss": 0.6807, "step": 11080 }, { "epoch": 42.00980842911878, - "grad_norm": 0.0067512462846934795, + "grad_norm": 0.005464628804475069, "learning_rate": 1.6687952320136231e-06, - "loss": 0.0003, + "loss": 0.0002, "step": 11090 }, { "epoch": 42.01057471264368, - "grad_norm": 0.00831934530287981, + "grad_norm": 0.006515026092529297, "learning_rate": 1.6602809706257983e-06, - "loss": 0.6729, + "loss": 0.0002, "step": 11100 }, { "epoch": 42.011340996168585, - "grad_norm": 0.0007535541080869734, + "grad_norm": 0.0007797977887094021, "learning_rate": 1.6517667092379737e-06, - "loss": 0.7259, + "loss": 0.7708, "step": 11110 }, { "epoch": 42.01210727969349, - "grad_norm": 0.012178517878055573, + "grad_norm": 0.008833660744130611, "learning_rate": 1.6432524478501492e-06, - "loss": 0.0001, + "loss": 0.0005, "step": 11120 }, { "epoch": 42.01287356321839, - "grad_norm": 598.53857421875, + "grad_norm": 512.608642578125, "learning_rate": 1.6347381864623246e-06, - "loss": 0.4692, + "loss": 0.8008, "step": 11130 }, { "epoch": 42.013639846743295, - "grad_norm": 0.0009179671760648489, + "grad_norm": 0.0006178829935379326, "learning_rate": 1.6262239250744998e-06, - "loss": 0.0004, + "loss": 0.0165, "step": 11140 }, { "epoch": 42.0144061302682, - "grad_norm": 0.0012064093025401235, + "grad_norm": 0.0010599381057545543, "learning_rate": 1.6177096636866754e-06, - "loss": 0.0006, + "loss": 0.0002, "step": 11150 }, { "epoch": 42.0151724137931, - "grad_norm": 0.007685741875320673, + "grad_norm": 0.002509585116058588, "learning_rate": 1.6091954022988506e-06, - "loss": 0.5118, + "loss": 0.2462, "step": 11160 }, { "epoch": 42.015938697318006, - "grad_norm": 0.0005009214510209858, + "grad_norm": 0.0007025519735179842, "learning_rate": 1.6006811409110262e-06, - "loss": 0.5406, + "loss": 0.0002, "step": 11170 }, { "epoch": 42.01670498084291, - "grad_norm": 0.0013565759873017669, + "grad_norm": 0.002086550695821643, "learning_rate": 1.5921668795232015e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 11180 }, { "epoch": 42.01747126436781, - "grad_norm": 0.011023270897567272, + "grad_norm": 0.007844015955924988, "learning_rate": 1.583652618135377e-06, - "loss": 0.0002, + "loss": 0.0007, "step": 11190 }, { "epoch": 42.01823754789272, - "grad_norm": 0.0007169665768742561, + "grad_norm": 0.000691623892635107, "learning_rate": 1.5751383567475523e-06, - "loss": 0.1129, + "loss": 0.5722, "step": 11200 }, { "epoch": 42.01900383141763, - "grad_norm": 0.003950014710426331, + "grad_norm": 0.009208764880895615, "learning_rate": 1.5666240953597275e-06, - "loss": 0.2655, + "loss": 0.0035, "step": 11210 }, { "epoch": 42.01977011494253, - "grad_norm": 0.0006281138048507273, + "grad_norm": 0.0007144392584450543, "learning_rate": 1.5581098339719031e-06, "loss": 0.0001, "step": 11220 @@ -8243,1381 +8243,1381 @@ { "epoch": 42.02, "eval_accuracy": 0.6444444444444445, - "eval_loss": 2.5582633018493652, - "eval_runtime": 17.742, - "eval_samples_per_second": 2.536, - "eval_steps_per_second": 2.536, + "eval_loss": 2.4507229328155518, + "eval_runtime": 15.4436, + "eval_samples_per_second": 2.914, + "eval_steps_per_second": 2.914, "step": 11223 }, { "epoch": 43.00053639846743, - "grad_norm": 0.002467693528160453, + "grad_norm": 0.0035489224828779697, "learning_rate": 1.5495955725840783e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 11230 }, { "epoch": 43.001302681992335, - "grad_norm": 0.011197173967957497, + "grad_norm": 0.005358237307518721, "learning_rate": 1.541081311196254e-06, - "loss": 0.0002, + "loss": 0.0003, "step": 11240 }, { "epoch": 43.00206896551724, - "grad_norm": 0.0005595132824964821, + "grad_norm": 0.0005342309013940394, "learning_rate": 1.5325670498084292e-06, - "loss": 0.0013, + "loss": 0.0001, "step": 11250 }, { "epoch": 43.00283524904214, - "grad_norm": 0.002914645243436098, + "grad_norm": 0.0033035441301763058, "learning_rate": 1.5240527884206046e-06, - "loss": 0.0001, + "loss": 0.001, "step": 11260 }, { "epoch": 43.00360153256705, - "grad_norm": 0.0005361768417060375, + "grad_norm": 0.0006273703766055405, "learning_rate": 1.51553852703278e-06, - "loss": 0.0005, + "loss": 0.0001, "step": 11270 }, { "epoch": 43.00436781609196, - "grad_norm": 0.0007744549657218158, + "grad_norm": 0.0006616446771658957, "learning_rate": 1.5070242656449554e-06, "loss": 0.0001, "step": 11280 }, { "epoch": 43.00513409961686, - "grad_norm": 0.0026675465051084757, + "grad_norm": 0.003187242429703474, "learning_rate": 1.4985100042571306e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 11290 }, { "epoch": 43.005900383141764, - "grad_norm": 0.0009361604461446404, + "grad_norm": 0.0012190892593935132, "learning_rate": 1.4899957428693062e-06, - "loss": 0.0008, + "loss": 0.0001, "step": 11300 }, { "epoch": 43.00666666666667, - "grad_norm": 282.3617858886719, + "grad_norm": 0.10489514470100403, "learning_rate": 1.4814814814814815e-06, - "loss": 0.0162, + "loss": 0.0005, "step": 11310 }, { "epoch": 43.00743295019157, - "grad_norm": 0.1838153898715973, + "grad_norm": 1.683571696281433, "learning_rate": 1.472967220093657e-06, - "loss": 0.2798, + "loss": 0.0402, "step": 11320 }, { "epoch": 43.008199233716475, - "grad_norm": 0.0005142433219589293, + "grad_norm": 0.0008869575685821474, "learning_rate": 1.4644529587058323e-06, - "loss": 0.0001, + "loss": 0.0, "step": 11330 }, { "epoch": 43.00896551724138, - "grad_norm": 0.00041536259232088923, + "grad_norm": 0.00047170917969197035, "learning_rate": 1.455938697318008e-06, "loss": 0.0001, "step": 11340 }, { "epoch": 43.00973180076628, - "grad_norm": 0.019814182072877884, + "grad_norm": 0.006080332677811384, "learning_rate": 1.4474244359301831e-06, - "loss": 0.3242, + "loss": 0.0225, "step": 11350 }, { "epoch": 43.010498084291186, - "grad_norm": 0.0018902852898463607, + "grad_norm": 0.0007999380468390882, "learning_rate": 1.4389101745423588e-06, - "loss": 0.0915, + "loss": 0.0006, "step": 11360 }, { "epoch": 43.01126436781609, - "grad_norm": 0.0141270337626338, + "grad_norm": 0.009966306388378143, "learning_rate": 1.430395913154534e-06, - "loss": 0.0018, + "loss": 0.0001, "step": 11370 }, { "epoch": 43.01203065134099, - "grad_norm": 0.06093072518706322, + "grad_norm": 0.008979124948382378, "learning_rate": 1.4218816517667094e-06, - "loss": 0.0036, + "loss": 0.2525, "step": 11380 }, { "epoch": 43.012796934865904, - "grad_norm": 0.05929204821586609, + "grad_norm": 0.10514181107282639, "learning_rate": 1.4133673903788848e-06, - "loss": 0.639, + "loss": 0.0001, "step": 11390 }, { "epoch": 43.01356321839081, - "grad_norm": 0.0008058378007262945, + "grad_norm": 0.0005648400401696563, "learning_rate": 1.4048531289910602e-06, "loss": 0.0001, "step": 11400 }, { "epoch": 43.01432950191571, - "grad_norm": 0.015182328410446644, + "grad_norm": 0.004483636002987623, "learning_rate": 1.3963388676032354e-06, - "loss": 0.2756, + "loss": 0.0006, "step": 11410 }, { "epoch": 43.015095785440614, - "grad_norm": 0.0007476222235709429, + "grad_norm": 0.0007238201797008514, "learning_rate": 1.387824606215411e-06, - "loss": 0.6009, + "loss": 0.0002, "step": 11420 }, { "epoch": 43.01586206896552, - "grad_norm": 0.00041066156700253487, + "grad_norm": 0.0004756802518386394, "learning_rate": 1.3793103448275862e-06, - "loss": 0.0003, + "loss": 0.0799, "step": 11430 }, { "epoch": 43.01662835249042, - "grad_norm": 0.0007477315375581384, + "grad_norm": 0.0006640653591603041, "learning_rate": 1.3707960834397619e-06, "loss": 0.0001, "step": 11440 }, { "epoch": 43.017394636015325, - "grad_norm": 0.002539550419896841, + "grad_norm": 0.001089440076611936, "learning_rate": 1.362281822051937e-06, - "loss": 1.6895, + "loss": 0.0001, "step": 11450 }, { "epoch": 43.01816091954023, - "grad_norm": 0.0005627285572700202, + "grad_norm": 0.00042758844210766256, "learning_rate": 1.3537675606641127e-06, - "loss": 0.0001, + "loss": 0.2889, "step": 11460 }, { "epoch": 43.01892720306513, - "grad_norm": 1.011839509010315, + "grad_norm": 0.011997051537036896, "learning_rate": 1.345253299276288e-06, - "loss": 0.0021, + "loss": 0.0001, "step": 11470 }, { "epoch": 43.019693486590036, - "grad_norm": 0.004878724459558725, + "grad_norm": 0.0412277951836586, "learning_rate": 1.3367390378884631e-06, - "loss": 0.6628, + "loss": 0.0001, "step": 11480 }, { "epoch": 43.02, - "eval_accuracy": 0.6444444444444445, - "eval_loss": 2.7012693881988525, - "eval_runtime": 17.5974, - "eval_samples_per_second": 2.557, - "eval_steps_per_second": 2.557, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 2.8657193183898926, + "eval_runtime": 15.4375, + "eval_samples_per_second": 2.915, + "eval_steps_per_second": 2.915, "step": 11484 }, { "epoch": 44.000459770114944, - "grad_norm": 0.0019307074835523963, + "grad_norm": 0.0020878692157566547, "learning_rate": 1.3282247765006387e-06, - "loss": 0.0001, + "loss": 0.0, "step": 11490 }, { "epoch": 44.00122605363985, - "grad_norm": 0.055655524134635925, + "grad_norm": 0.007016981020569801, "learning_rate": 1.319710515112814e-06, "loss": 0.0002, "step": 11500 }, { "epoch": 44.00199233716475, - "grad_norm": 2003.3311767578125, + "grad_norm": 0.030711213126778603, "learning_rate": 1.3111962537249896e-06, - "loss": 0.171, + "loss": 0.0001, "step": 11510 }, { "epoch": 44.002758620689654, - "grad_norm": 0.006533639505505562, + "grad_norm": 0.0025869414675980806, "learning_rate": 1.3026819923371648e-06, - "loss": 0.0024, + "loss": 0.0172, "step": 11520 }, { "epoch": 44.00352490421456, - "grad_norm": 0.005293133202940226, + "grad_norm": 0.0015399837866425514, "learning_rate": 1.2941677309493402e-06, - "loss": 0.6071, + "loss": 0.0001, "step": 11530 }, { "epoch": 44.00429118773946, - "grad_norm": 0.0009481435990892351, + "grad_norm": 0.0005842315149493515, "learning_rate": 1.2856534695615156e-06, - "loss": 0.0003, + "loss": 0.0002, "step": 11540 }, { "epoch": 44.005057471264365, - "grad_norm": 0.0012919035507366061, + "grad_norm": 0.0006337865488603711, "learning_rate": 1.277139208173691e-06, - "loss": 0.0016, + "loss": 0.0005, "step": 11550 }, { "epoch": 44.00582375478927, - "grad_norm": 0.0007059547351673245, + "grad_norm": 0.0005269469111226499, "learning_rate": 1.2686249467858662e-06, "loss": 0.0001, "step": 11560 }, { "epoch": 44.00659003831418, - "grad_norm": 0.00047920530778355896, + "grad_norm": 0.0004617535450961441, "learning_rate": 1.2601106853980419e-06, - "loss": 0.0001, + "loss": 0.0004, "step": 11570 }, { "epoch": 44.00735632183908, - "grad_norm": 0.0019831156823784113, + "grad_norm": 0.0007663178839720786, "learning_rate": 1.251596424010217e-06, - "loss": 0.0001, + "loss": 0.0, "step": 11580 }, { "epoch": 44.00812260536399, - "grad_norm": 0.0037783291190862656, + "grad_norm": 0.002579664345830679, "learning_rate": 1.2430821626223927e-06, - "loss": 0.0005, + "loss": 0.0, "step": 11590 }, { "epoch": 44.00888888888889, - "grad_norm": 0.0005204785265959799, + "grad_norm": 0.0005029678577557206, "learning_rate": 1.234567901234568e-06, - "loss": 0.5252, + "loss": 0.0001, "step": 11600 }, { "epoch": 44.009655172413794, - "grad_norm": 0.0004216206434648484, + "grad_norm": 0.0003673324245028198, "learning_rate": 1.2260536398467433e-06, - "loss": 0.6395, + "loss": 0.0002, "step": 11610 }, { "epoch": 44.0104214559387, - "grad_norm": 0.00044106217683292925, + "grad_norm": 0.000421113392803818, "learning_rate": 1.2175393784589187e-06, - "loss": 0.0006, + "loss": 0.0001, "step": 11620 }, { "epoch": 44.0111877394636, - "grad_norm": 0.00045257454621605575, + "grad_norm": 0.0004428716201800853, "learning_rate": 1.2090251170710942e-06, - "loss": 0.0464, + "loss": 0.0001, "step": 11630 }, { "epoch": 44.011954022988505, - "grad_norm": 0.011230125091969967, + "grad_norm": 0.002549306023865938, "learning_rate": 1.2005108556832696e-06, "loss": 0.0002, "step": 11640 }, { "epoch": 44.01272030651341, - "grad_norm": 0.0027283676899969578, + "grad_norm": 0.001247661653906107, "learning_rate": 1.191996594295445e-06, - "loss": 0.3651, + "loss": 0.0002, "step": 11650 }, { "epoch": 44.01348659003831, - "grad_norm": 0.0009611598215997219, + "grad_norm": 0.0004867380193900317, "learning_rate": 1.1834823329076204e-06, - "loss": 0.7852, + "loss": 0.7989, "step": 11660 }, { "epoch": 44.014252873563215, - "grad_norm": 0.0011120650451630354, + "grad_norm": 0.0007086402038112283, "learning_rate": 1.1749680715197956e-06, - "loss": 0.0138, + "loss": 0.0001, "step": 11670 }, { "epoch": 44.01501915708812, - "grad_norm": 0.00035269823274575174, + "grad_norm": 0.0003546969383023679, "learning_rate": 1.166453810131971e-06, "loss": 0.0001, "step": 11680 }, { "epoch": 44.01578544061303, - "grad_norm": 0.00039521194412373006, + "grad_norm": 0.0005051234038546681, "learning_rate": 1.1579395487441465e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 11690 }, { "epoch": 44.01655172413793, - "grad_norm": 0.050507210195064545, + "grad_norm": 0.002481834962964058, "learning_rate": 1.1494252873563219e-06, - "loss": 1.493, + "loss": 1.3444, "step": 11700 }, { "epoch": 44.01731800766284, - "grad_norm": 0.016909992322325706, + "grad_norm": 0.005611900240182877, "learning_rate": 1.1409110259684973e-06, - "loss": 0.0006, + "loss": 0.0001, "step": 11710 }, { "epoch": 44.01808429118774, - "grad_norm": 0.0003902670869138092, + "grad_norm": 0.0004132241301704198, "learning_rate": 1.1323967645806727e-06, - "loss": 0.5287, + "loss": 0.0001, "step": 11720 }, { "epoch": 44.018850574712644, - "grad_norm": 0.027824433520436287, + "grad_norm": 0.026718739420175552, "learning_rate": 1.1238825031928481e-06, - "loss": 0.0001, + "loss": 0.0003, "step": 11730 }, { "epoch": 44.01961685823755, - "grad_norm": 0.0007525761611759663, + "grad_norm": 0.0003907082136720419, "learning_rate": 1.1153682418050235e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 11740 }, { "epoch": 44.02, - "eval_accuracy": 0.6444444444444445, - "eval_loss": 2.3759961128234863, - "eval_runtime": 18.2316, - "eval_samples_per_second": 2.468, - "eval_steps_per_second": 2.468, + "eval_accuracy": 0.5777777777777777, + "eval_loss": 3.091642141342163, + "eval_runtime": 15.476, + "eval_samples_per_second": 2.908, + "eval_steps_per_second": 2.908, "step": 11745 }, { "epoch": 45.000383141762455, - "grad_norm": 0.0250843595713377, + "grad_norm": 0.003295178757980466, "learning_rate": 1.106853980417199e-06, - "loss": 0.0001, + "loss": 0.0, "step": 11750 }, { "epoch": 45.00114942528736, - "grad_norm": 0.0013624239945784211, + "grad_norm": 0.0005989453056827188, "learning_rate": 1.0983397190293744e-06, - "loss": 0.0004, + "loss": 0.0001, "step": 11760 }, { "epoch": 45.00191570881226, - "grad_norm": 0.004461857955902815, + "grad_norm": 0.0044679827988147736, "learning_rate": 1.0898254576415498e-06, - "loss": 0.0003, + "loss": 0.0001, "step": 11770 }, { "epoch": 45.002681992337166, - "grad_norm": 0.022097058594226837, + "grad_norm": 0.004033550154417753, "learning_rate": 1.0813111962537252e-06, - "loss": 0.744, + "loss": 0.0001, "step": 11780 }, { "epoch": 45.00344827586207, - "grad_norm": 0.002466842532157898, + "grad_norm": 0.0029175083618611097, "learning_rate": 1.0727969348659004e-06, - "loss": 0.6909, + "loss": 0.0001, "step": 11790 }, { "epoch": 45.00421455938697, - "grad_norm": 0.01806120201945305, + "grad_norm": 0.0032916399650275707, "learning_rate": 1.0642826734780758e-06, - "loss": 0.0001, + "loss": 0.0, "step": 11800 }, { "epoch": 45.00498084291188, - "grad_norm": 0.00046323775313794613, + "grad_norm": 0.0003622255171649158, "learning_rate": 1.0557684120902512e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 11810 }, { "epoch": 45.00574712643678, - "grad_norm": 0.012510493397712708, + "grad_norm": 0.007126408629119396, "learning_rate": 1.0472541507024267e-06, - "loss": 0.0002, + "loss": 0.3855, "step": 11820 }, { "epoch": 45.006513409961684, - "grad_norm": 0.0005203433684073389, + "grad_norm": 0.0009091575630009174, "learning_rate": 1.038739889314602e-06, - "loss": 0.5127, + "loss": 0.0002, "step": 11830 }, { "epoch": 45.00727969348659, - "grad_norm": 0.013946869410574436, + "grad_norm": 0.002755401423200965, "learning_rate": 1.0302256279267775e-06, - "loss": 0.7997, + "loss": 0.6148, "step": 11840 }, { "epoch": 45.00804597701149, - "grad_norm": 0.00224207597784698, + "grad_norm": 0.002057574689388275, "learning_rate": 1.021711366538953e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 11850 }, { "epoch": 45.008812260536395, - "grad_norm": 0.0009158807224594057, + "grad_norm": 0.0010924681555479765, "learning_rate": 1.0131971051511283e-06, - "loss": 1.1883, + "loss": 0.0001, "step": 11860 }, { "epoch": 45.009578544061306, - "grad_norm": 592.2891235351562, + "grad_norm": 7.1477203369140625, "learning_rate": 1.0046828437633035e-06, - "loss": 0.0155, + "loss": 0.0007, "step": 11870 }, { "epoch": 45.01034482758621, - "grad_norm": 0.012307756580412388, + "grad_norm": 0.004534535575658083, "learning_rate": 9.96168582375479e-07, - "loss": 0.0003, + "loss": 0.7214, "step": 11880 }, { "epoch": 45.01111111111111, - "grad_norm": 0.011197790503501892, + "grad_norm": 0.0037969916593283415, "learning_rate": 9.876543209876544e-07, "loss": 0.0001, "step": 11890 }, { "epoch": 45.01187739463602, - "grad_norm": 0.00954184215515852, + "grad_norm": 0.0063390471041202545, "learning_rate": 9.791400595998298e-07, - "loss": 0.0005, + "loss": 0.0001, "step": 11900 }, { "epoch": 45.01264367816092, - "grad_norm": 0.0004253602819517255, + "grad_norm": 0.0003883058961946517, "learning_rate": 9.706257982120052e-07, - "loss": 0.0002, + "loss": 0.0001, "step": 11910 }, { "epoch": 45.013409961685824, - "grad_norm": 0.15199287235736847, + "grad_norm": 0.0020889630541205406, "learning_rate": 9.621115368241806e-07, - "loss": 0.0002, + "loss": 0.0001, "step": 11920 }, { "epoch": 45.01417624521073, - "grad_norm": 0.007570572663098574, + "grad_norm": 0.00480302982032299, "learning_rate": 9.535972754363559e-07, - "loss": 0.0003, + "loss": 0.0001, "step": 11930 }, { "epoch": 45.01494252873563, - "grad_norm": 0.002134187612682581, + "grad_norm": 0.0006098590674810112, "learning_rate": 9.450830140485314e-07, "loss": 0.0001, "step": 11940 }, { "epoch": 45.015708812260534, - "grad_norm": 0.007931654341518879, + "grad_norm": 0.0027735389303416014, "learning_rate": 9.365687526607068e-07, - "loss": 0.428, + "loss": 0.4917, "step": 11950 }, { "epoch": 45.01647509578544, - "grad_norm": 0.00045044583384878933, + "grad_norm": 0.000519889872521162, "learning_rate": 9.280544912728822e-07, "loss": 0.0001, "step": 11960 }, { "epoch": 45.01724137931034, - "grad_norm": 0.0004889002884738147, + "grad_norm": 0.0005059523973613977, "learning_rate": 9.195402298850575e-07, "loss": 0.0, "step": 11970 }, { "epoch": 45.01800766283525, - "grad_norm": 24.784332275390625, + "grad_norm": 0.009951411746442318, "learning_rate": 9.110259684972329e-07, - "loss": 0.8222, + "loss": 0.0006, "step": 11980 }, { "epoch": 45.018773946360156, - "grad_norm": 0.000832526886370033, + "grad_norm": 0.0023185701575130224, "learning_rate": 9.025117071094083e-07, "loss": 0.0001, "step": 11990 }, { "epoch": 45.01954022988506, - "grad_norm": 0.00791728775948286, + "grad_norm": 0.006416243966668844, "learning_rate": 8.939974457215837e-07, - "loss": 0.0002, + "loss": 0.4721, "step": 12000 }, { "epoch": 45.02, - "eval_accuracy": 0.6888888888888889, - "eval_loss": 2.239468574523926, - "eval_runtime": 18.2995, - "eval_samples_per_second": 2.459, - "eval_steps_per_second": 2.459, + "eval_accuracy": 0.6444444444444445, + "eval_loss": 2.8460350036621094, + "eval_runtime": 15.443, + "eval_samples_per_second": 2.914, + "eval_steps_per_second": 2.914, "step": 12006 }, { "epoch": 46.00030651340996, - "grad_norm": 0.0007431419799104333, + "grad_norm": 0.0010364438639953732, "learning_rate": 8.854831843337592e-07, "loss": 0.0001, "step": 12010 }, { "epoch": 46.001072796934864, - "grad_norm": 0.0005324999801814556, + "grad_norm": 0.000408830470405519, "learning_rate": 8.769689229459345e-07, "loss": 0.0001, "step": 12020 }, { "epoch": 46.00183908045977, - "grad_norm": 0.012998624704778194, + "grad_norm": 0.00960585568100214, "learning_rate": 8.684546615581099e-07, - "loss": 0.0002, + "loss": 0.001, "step": 12030 }, { "epoch": 46.00260536398467, - "grad_norm": 0.0017463216790929437, + "grad_norm": 0.005681024398654699, "learning_rate": 8.599404001702853e-07, - "loss": 0.0001, + "loss": 0.0002, "step": 12040 }, { "epoch": 46.00337164750958, - "grad_norm": 0.24495509266853333, + "grad_norm": 0.008698999881744385, "learning_rate": 8.514261387824607e-07, - "loss": 0.7849, + "loss": 0.4739, "step": 12050 }, { "epoch": 46.004137931034485, - "grad_norm": 0.00179698271676898, + "grad_norm": 0.0016830030363053083, "learning_rate": 8.429118773946361e-07, - "loss": 0.0002, + "loss": 0.0026, "step": 12060 }, { "epoch": 46.00490421455939, - "grad_norm": 0.0003565971564967185, + "grad_norm": 0.00033984475885517895, "learning_rate": 8.343976160068116e-07, - "loss": 0.0001, + "loss": 0.0002, "step": 12070 }, { "epoch": 46.00567049808429, - "grad_norm": 0.017211956903338432, + "grad_norm": 0.0023653090465813875, "learning_rate": 8.258833546189869e-07, "loss": 0.0001, "step": 12080 }, { "epoch": 46.006436781609196, - "grad_norm": 0.0014137937687337399, + "grad_norm": 0.0022981781512498856, "learning_rate": 8.173690932311623e-07, - "loss": 0.0136, + "loss": 0.0061, "step": 12090 }, { "epoch": 46.0072030651341, - "grad_norm": 0.001585201476700604, + "grad_norm": 0.0015142176998779178, "learning_rate": 8.088548318433377e-07, - "loss": 0.0001, + "loss": 0.0, "step": 12100 }, { "epoch": 46.007969348659, - "grad_norm": 0.057490527629852295, + "grad_norm": 0.001842895639128983, "learning_rate": 8.003405704555131e-07, - "loss": 0.6275, + "loss": 0.0001, "step": 12110 }, { "epoch": 46.00873563218391, - "grad_norm": 0.01561444066464901, + "grad_norm": 0.003916316665709019, "learning_rate": 7.918263090676885e-07, - "loss": 0.2482, + "loss": 0.0004, "step": 12120 }, { "epoch": 46.00950191570881, - "grad_norm": 0.0018783870618790388, + "grad_norm": 0.000873864337336272, "learning_rate": 7.833120476798637e-07, - "loss": 0.0002, + "loss": 0.0001, "step": 12130 }, { "epoch": 46.010268199233714, - "grad_norm": 0.0004460242635104805, + "grad_norm": 0.0007449012482538819, "learning_rate": 7.747977862920392e-07, - "loss": 0.0005, + "loss": 0.1823, "step": 12140 }, { "epoch": 46.01103448275862, - "grad_norm": 0.0003904117038473487, + "grad_norm": 0.0006293251644819975, "learning_rate": 7.662835249042146e-07, - "loss": 0.0005, + "loss": 0.0, "step": 12150 }, { "epoch": 46.01180076628353, - "grad_norm": 0.002104612533003092, + "grad_norm": 0.003284325823187828, "learning_rate": 7.5776926351639e-07, - "loss": 0.0002, + "loss": 0.0006, "step": 12160 }, { "epoch": 46.01256704980843, - "grad_norm": 0.0229890625923872, + "grad_norm": 0.004199387971311808, "learning_rate": 7.492550021285653e-07, - "loss": 0.0009, + "loss": 0.0, "step": 12170 }, { "epoch": 46.013333333333335, - "grad_norm": 0.0012344926362857223, + "grad_norm": 0.0017086719162762165, "learning_rate": 7.407407407407407e-07, - "loss": 0.0031, + "loss": 0.0002, "step": 12180 }, { "epoch": 46.01409961685824, - "grad_norm": 0.0024939023423939943, + "grad_norm": 0.001854647765867412, "learning_rate": 7.322264793529161e-07, - "loss": 0.0002, + "loss": 0.0, "step": 12190 }, { "epoch": 46.01486590038314, - "grad_norm": 0.0005137289990670979, + "grad_norm": 0.00047755189007148147, "learning_rate": 7.237122179650916e-07, "loss": 0.0001, "step": 12200 }, { "epoch": 46.015632183908046, - "grad_norm": 0.037449710071086884, + "grad_norm": 0.0031646632123738527, "learning_rate": 7.15197956577267e-07, - "loss": 0.0009, + "loss": 0.0003, "step": 12210 }, { "epoch": 46.01639846743295, - "grad_norm": 0.006246969569474459, + "grad_norm": 0.009772502817213535, "learning_rate": 7.066836951894424e-07, - "loss": 0.0002, + "loss": 0.0009, "step": 12220 }, { "epoch": 46.01716475095785, - "grad_norm": 0.00038191780913621187, + "grad_norm": 0.00039410495082847774, "learning_rate": 6.981694338016177e-07, - "loss": 0.0517, + "loss": 0.0001, "step": 12230 }, { "epoch": 46.01793103448276, - "grad_norm": 0.0009960097959265113, + "grad_norm": 0.002781163202598691, "learning_rate": 6.896551724137931e-07, - "loss": 0.0001, + "loss": 0.0, "step": 12240 }, { "epoch": 46.01869731800766, - "grad_norm": 0.0023353062570095062, + "grad_norm": 0.0022156797349452972, "learning_rate": 6.811409110259685e-07, "loss": 0.0001, "step": 12250 }, { "epoch": 46.019463601532564, - "grad_norm": 0.00037725086440332234, + "grad_norm": 0.0003918297588825226, "learning_rate": 6.72626649638144e-07, - "loss": 0.0001, + "loss": 0.1128, "step": 12260 }, { "epoch": 46.02, - "eval_accuracy": 0.6444444444444445, - "eval_loss": 2.6189231872558594, - "eval_runtime": 18.3975, - "eval_samples_per_second": 2.446, - "eval_steps_per_second": 2.446, + "eval_accuracy": 0.6, + "eval_loss": 3.490434169769287, + "eval_runtime": 16.2284, + "eval_samples_per_second": 2.773, + "eval_steps_per_second": 2.773, "step": 12267 }, { "epoch": 47.00022988505747, - "grad_norm": 0.736779510974884, + "grad_norm": 374.71063232421875, "learning_rate": 6.641123882503194e-07, - "loss": 0.0004, + "loss": 0.729, "step": 12270 }, { "epoch": 47.000996168582375, - "grad_norm": 0.0004097692435607314, + "grad_norm": 0.00039766172994859517, "learning_rate": 6.555981268624948e-07, - "loss": 0.3106, + "loss": 0.0027, "step": 12280 }, { "epoch": 47.00176245210728, - "grad_norm": 0.0007033857400529087, + "grad_norm": 0.0009915289701893926, "learning_rate": 6.470838654746701e-07, - "loss": 0.0257, + "loss": 0.0025, "step": 12290 }, { "epoch": 47.00252873563218, - "grad_norm": 0.0004144683189224452, + "grad_norm": 0.0003794320218730718, "learning_rate": 6.385696040868455e-07, - "loss": 0.0003, + "loss": 0.8632, "step": 12300 }, { "epoch": 47.003295019157086, - "grad_norm": 0.0004815524735022336, + "grad_norm": 0.0004160934186074883, "learning_rate": 6.300553426990209e-07, - "loss": 0.0001, + "loss": 0.0003, "step": 12310 }, { "epoch": 47.00406130268199, - "grad_norm": 0.0003867338818963617, + "grad_norm": 0.0003883976023644209, "learning_rate": 6.215410813111964e-07, - "loss": 0.0001, + "loss": 0.0, "step": 12320 }, { "epoch": 47.00482758620689, - "grad_norm": 0.001194644602946937, + "grad_norm": 0.0005818352801725268, "learning_rate": 6.130268199233717e-07, "loss": 0.0001, "step": 12330 }, { "epoch": 47.005593869731804, - "grad_norm": 0.028938276693224907, + "grad_norm": 697.9662475585938, "learning_rate": 6.045125585355471e-07, - "loss": 0.6223, + "loss": 0.4879, "step": 12340 }, { "epoch": 47.00636015325671, - "grad_norm": 0.0003941210452467203, + "grad_norm": 0.0003793842042796314, "learning_rate": 5.959982971477225e-07, - "loss": 0.3268, + "loss": 0.4052, "step": 12350 }, { "epoch": 47.00712643678161, - "grad_norm": 0.03742298111319542, + "grad_norm": 0.003470742143690586, "learning_rate": 5.874840357598978e-07, "loss": 0.0001, "step": 12360 }, { "epoch": 47.007892720306515, - "grad_norm": 0.0003373978252056986, + "grad_norm": 0.00047233240911737084, "learning_rate": 5.789697743720732e-07, - "loss": 0.5375, + "loss": 0.0001, "step": 12370 }, { "epoch": 47.00865900383142, - "grad_norm": 0.002957494929432869, + "grad_norm": 0.0016107996925711632, "learning_rate": 5.704555129842486e-07, - "loss": 0.0006, + "loss": 0.7282, "step": 12380 }, { "epoch": 47.00942528735632, - "grad_norm": 0.012526136822998524, + "grad_norm": 0.022575262933969498, "learning_rate": 5.619412515964241e-07, "loss": 0.0001, "step": 12390 }, { "epoch": 47.010191570881226, - "grad_norm": 0.00046426666085608304, + "grad_norm": 0.0006176336319185793, "learning_rate": 5.534269902085995e-07, "loss": 0.0001, "step": 12400 }, { "epoch": 47.01095785440613, - "grad_norm": 0.0008831802988424897, + "grad_norm": 0.0005281046032905579, "learning_rate": 5.449127288207749e-07, - "loss": 0.7104, + "loss": 0.0001, "step": 12410 }, { "epoch": 47.01172413793103, - "grad_norm": 0.029599444940686226, + "grad_norm": 0.0016648982418701053, "learning_rate": 5.363984674329502e-07, "loss": 0.0002, "step": 12420 }, { "epoch": 47.01249042145594, - "grad_norm": 0.005943891126662493, + "grad_norm": 0.004870912525802851, "learning_rate": 5.278842060451256e-07, - "loss": 0.0001, + "loss": 0.0002, "step": 12430 }, { "epoch": 47.01325670498084, - "grad_norm": 0.00040955268195830286, + "grad_norm": 0.0005756067112088203, "learning_rate": 5.19369944657301e-07, - "loss": 0.0001, + "loss": 0.0, "step": 12440 }, { "epoch": 47.014022988505744, - "grad_norm": 0.00037998644984327257, + "grad_norm": 0.00042045823647640646, "learning_rate": 5.108556832694765e-07, - "loss": 0.0001, + "loss": 0.0, "step": 12450 }, { "epoch": 47.014789272030654, - "grad_norm": 0.001552705536596477, + "grad_norm": 0.0009925250196829438, "learning_rate": 5.023414218816518e-07, - "loss": 0.0016, + "loss": 0.0, "step": 12460 }, { "epoch": 47.01555555555556, - "grad_norm": 0.02872062474489212, + "grad_norm": 0.011042957194149494, "learning_rate": 4.938271604938272e-07, - "loss": 0.0251, + "loss": 0.0001, "step": 12470 }, { "epoch": 47.01632183908046, - "grad_norm": 0.026964617893099785, + "grad_norm": 0.003685299539938569, "learning_rate": 4.853128991060026e-07, "loss": 0.0001, "step": 12480 }, { "epoch": 47.017088122605365, - "grad_norm": 0.006699676159769297, + "grad_norm": 0.001552742556668818, "learning_rate": 4.7679863771817797e-07, - "loss": 0.6866, + "loss": 0.0044, "step": 12490 }, { "epoch": 47.01785440613027, - "grad_norm": 0.010123691521584988, + "grad_norm": 0.0014841716038063169, "learning_rate": 4.682843763303534e-07, - "loss": 0.001, + "loss": 0.0001, "step": 12500 }, { "epoch": 47.01862068965517, - "grad_norm": 0.0003556281153578311, + "grad_norm": 0.00035557564115151763, "learning_rate": 4.5977011494252875e-07, - "loss": 0.5733, + "loss": 0.0721, "step": 12510 }, { "epoch": 47.019386973180076, - "grad_norm": 0.00044446007814258337, + "grad_norm": 0.00034435518318787217, "learning_rate": 4.5125585355470417e-07, - "loss": 0.0003, + "loss": 0.0001, "step": 12520 }, { "epoch": 47.02, - "eval_accuracy": 0.6444444444444445, - "eval_loss": 2.524125576019287, - "eval_runtime": 16.6742, - "eval_samples_per_second": 2.699, - "eval_steps_per_second": 2.699, + "eval_accuracy": 0.6, + "eval_loss": 3.4742181301116943, + "eval_runtime": 15.4395, + "eval_samples_per_second": 2.915, + "eval_steps_per_second": 2.915, "step": 12528 }, { "epoch": 48.000153256704984, - "grad_norm": 0.00034069380490109324, + "grad_norm": 0.00036939739948138595, "learning_rate": 4.427415921668796e-07, - "loss": 0.0001, + "loss": 0.0, "step": 12530 }, { "epoch": 48.00091954022989, - "grad_norm": 0.0005042839329689741, + "grad_norm": 0.0009727678843773901, "learning_rate": 4.3422733077905495e-07, - "loss": 0.382, + "loss": 0.0002, "step": 12540 }, { "epoch": 48.00168582375479, - "grad_norm": 0.018302783370018005, + "grad_norm": 0.0022727411706000566, "learning_rate": 4.2571306939123036e-07, - "loss": 0.0145, + "loss": 0.0001, "step": 12550 }, { "epoch": 48.002452107279694, - "grad_norm": 0.5595752596855164, + "grad_norm": 0.1918380707502365, "learning_rate": 4.171988080034058e-07, - "loss": 0.7313, + "loss": 0.849, "step": 12560 }, { "epoch": 48.0032183908046, - "grad_norm": 0.0004967560525983572, + "grad_norm": 0.00033304598764516413, "learning_rate": 4.0868454661558115e-07, - "loss": 0.7463, + "loss": 1.6565, "step": 12570 }, { "epoch": 48.0039846743295, - "grad_norm": 593.8219604492188, + "grad_norm": 0.26823627948760986, "learning_rate": 4.0017028522775656e-07, - "loss": 0.5942, + "loss": 0.0001, "step": 12580 }, { "epoch": 48.004750957854405, - "grad_norm": 3.3011648654937744, + "grad_norm": 0.39764878153800964, "learning_rate": 3.916560238399319e-07, - "loss": 0.0005, + "loss": 0.0002, "step": 12590 }, { "epoch": 48.00551724137931, - "grad_norm": 0.015013687312602997, + "grad_norm": 0.005832442082464695, "learning_rate": 3.831417624521073e-07, - "loss": 0.2838, + "loss": 0.0, "step": 12600 }, { "epoch": 48.00628352490421, - "grad_norm": 0.002949683926999569, + "grad_norm": 0.0018820440163835883, "learning_rate": 3.7462750106428265e-07, - "loss": 0.0002, + "loss": 0.0001, "step": 12610 }, { "epoch": 48.007049808429116, - "grad_norm": 0.007660044822841883, + "grad_norm": 0.001864145859144628, "learning_rate": 3.6611323967645807e-07, - "loss": 0.0002, + "loss": 0.0001, "step": 12620 }, { "epoch": 48.00781609195402, - "grad_norm": 0.000941854901611805, + "grad_norm": 0.0004937060875818133, "learning_rate": 3.575989782886335e-07, - "loss": 0.0006, + "loss": 0.0, "step": 12630 }, { "epoch": 48.00858237547893, - "grad_norm": 0.0009806039743125439, + "grad_norm": 0.00048105468158610165, "learning_rate": 3.4908471690080885e-07, "loss": 0.0002, "step": 12640 }, { "epoch": 48.009348659003834, - "grad_norm": 0.00240034400485456, + "grad_norm": 0.006083694286644459, "learning_rate": 3.4057045551298427e-07, "loss": 0.0001, "step": 12650 }, { "epoch": 48.01011494252874, - "grad_norm": 0.0038584331050515175, + "grad_norm": 0.0019000498577952385, "learning_rate": 3.320561941251597e-07, "loss": 0.0001, "step": 12660 }, { "epoch": 48.01088122605364, - "grad_norm": 0.00046369805932044983, + "grad_norm": 0.0004663409781642258, "learning_rate": 3.2354193273733505e-07, "loss": 0.0, "step": 12670 }, { "epoch": 48.011647509578545, - "grad_norm": 0.04157138988375664, + "grad_norm": 2.294158458709717, "learning_rate": 3.1502767134951047e-07, - "loss": 0.0361, + "loss": 0.0004, "step": 12680 }, { "epoch": 48.01241379310345, - "grad_norm": 0.008389453403651714, + "grad_norm": 0.0030898479744791985, "learning_rate": 3.0651340996168583e-07, "loss": 0.0001, "step": 12690 }, { "epoch": 48.01318007662835, - "grad_norm": 0.00041886779945343733, + "grad_norm": 0.0006790203042328358, "learning_rate": 2.9799914857386125e-07, - "loss": 0.0001, + "loss": 0.0, "step": 12700 }, { "epoch": 48.013946360153255, - "grad_norm": 0.0003623064258135855, + "grad_norm": 0.0002924101718235761, "learning_rate": 2.894848871860366e-07, - "loss": 0.0001, + "loss": 0.0, "step": 12710 }, { "epoch": 48.01471264367816, - "grad_norm": 0.00028568890411406755, + "grad_norm": 0.00030993795371614397, "learning_rate": 2.8097062579821203e-07, - "loss": 0.0017, + "loss": 0.0, "step": 12720 }, { "epoch": 48.01547892720306, - "grad_norm": 0.006699131801724434, + "grad_norm": 0.004514725413173437, "learning_rate": 2.7245636441038745e-07, - "loss": 0.0056, + "loss": 0.0001, "step": 12730 }, { "epoch": 48.016245210727966, - "grad_norm": 0.0019495913293212652, + "grad_norm": 0.004188064020127058, "learning_rate": 2.639421030225628e-07, "loss": 0.0001, "step": 12740 }, { "epoch": 48.01701149425288, - "grad_norm": 0.0006483040051534772, + "grad_norm": 0.0005163051537238061, "learning_rate": 2.5542784163473823e-07, - "loss": 0.0004, + "loss": 0.0001, "step": 12750 }, { "epoch": 48.01777777777778, - "grad_norm": 0.0006546643562614918, + "grad_norm": 0.004421199206262827, "learning_rate": 2.469135802469136e-07, - "loss": 0.6143, + "loss": 0.001, "step": 12760 }, { "epoch": 48.018544061302684, - "grad_norm": 0.0010494455927982926, + "grad_norm": 0.00052443373715505, "learning_rate": 2.3839931885908898e-07, - "loss": 0.0002, + "loss": 0.0001, "step": 12770 }, { "epoch": 48.01931034482759, - "grad_norm": 0.00034482337650842965, + "grad_norm": 0.00033496436662971973, "learning_rate": 2.2988505747126437e-07, - "loss": 0.0002, + "loss": 0.9105, "step": 12780 }, { "epoch": 48.02, - "eval_accuracy": 0.6444444444444445, - "eval_loss": 2.578596830368042, - "eval_runtime": 16.5382, - "eval_samples_per_second": 2.721, - "eval_steps_per_second": 2.721, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 2.6414289474487305, + "eval_runtime": 16.1464, + "eval_samples_per_second": 2.787, + "eval_steps_per_second": 2.787, "step": 12789 }, { "epoch": 49.00007662835249, - "grad_norm": 0.0006881165900267661, + "grad_norm": 0.0018495968542993069, "learning_rate": 2.213707960834398e-07, "loss": 0.0001, "step": 12790 }, { "epoch": 49.00084291187739, - "grad_norm": 0.008107454515993595, + "grad_norm": 0.0006543529452756047, "learning_rate": 2.1285653469561518e-07, "loss": 0.0001, "step": 12800 }, { "epoch": 49.001609195402295, - "grad_norm": 33.39728546142578, + "grad_norm": 0.005906207486987114, "learning_rate": 2.0434227330779057e-07, - "loss": 0.0021, + "loss": 0.0001, "step": 12810 }, { "epoch": 49.002375478927206, - "grad_norm": 0.0005207730573602021, + "grad_norm": 0.00043979252222925425, "learning_rate": 1.9582801191996594e-07, - "loss": 0.0004, + "loss": 0.0001, "step": 12820 }, { "epoch": 49.00314176245211, - "grad_norm": 0.0009132600971497595, + "grad_norm": 0.0010270284255966544, "learning_rate": 1.8731375053214133e-07, - "loss": 0.4955, + "loss": 0.0001, "step": 12830 }, { "epoch": 49.00390804597701, - "grad_norm": 0.013016578741371632, + "grad_norm": 0.0020657137501984835, "learning_rate": 1.7879948914431674e-07, - "loss": 0.0001, + "loss": 0.0, "step": 12840 }, { "epoch": 49.00467432950192, - "grad_norm": 0.0004933698801323771, + "grad_norm": 0.0005068311002105474, "learning_rate": 1.7028522775649214e-07, "loss": 0.0001, "step": 12850 }, { "epoch": 49.00544061302682, - "grad_norm": 0.0010416624136269093, + "grad_norm": 0.0015583178028464317, "learning_rate": 1.6177096636866753e-07, "loss": 0.0001, "step": 12860 }, { "epoch": 49.006206896551724, - "grad_norm": 0.000576712831389159, + "grad_norm": 0.000689724984113127, "learning_rate": 1.5325670498084292e-07, "loss": 0.0001, "step": 12870 }, { "epoch": 49.00697318007663, - "grad_norm": 0.00906006246805191, + "grad_norm": 0.008311384357511997, "learning_rate": 1.447424435930183e-07, - "loss": 0.543, + "loss": 0.0001, "step": 12880 }, { "epoch": 49.00773946360153, - "grad_norm": 0.0013503475347533822, + "grad_norm": 0.003313031978905201, "learning_rate": 1.3622818220519372e-07, "loss": 0.0001, "step": 12890 }, { "epoch": 49.008505747126435, - "grad_norm": 0.0006540995091199875, + "grad_norm": 0.0003582458011806011, "learning_rate": 1.2771392081736911e-07, - "loss": 0.6631, + "loss": 0.3322, "step": 12900 }, { "epoch": 49.00927203065134, - "grad_norm": 0.0005394623731262982, + "grad_norm": 0.0005774461315013468, "learning_rate": 1.1919965942954449e-07, "loss": 0.0001, "step": 12910 }, { "epoch": 49.01003831417624, - "grad_norm": 0.00035656490945257246, + "grad_norm": 0.0003245752304792404, "learning_rate": 1.106853980417199e-07, - "loss": 0.0001, + "loss": 0.0049, "step": 12920 }, { "epoch": 49.01080459770115, - "grad_norm": 0.015578029677271843, + "grad_norm": 0.007660223171114922, "learning_rate": 1.0217113665389529e-07, - "loss": 0.0001, + "loss": 0.0, "step": 12930 }, { "epoch": 49.011570881226056, - "grad_norm": 0.0029297862201929092, + "grad_norm": 0.0019596717320382595, "learning_rate": 9.365687526607066e-08, "loss": 0.0001, "step": 12940 }, { "epoch": 49.01233716475096, - "grad_norm": 0.0017712245462462306, + "grad_norm": 0.0024398919194936752, "learning_rate": 8.514261387824607e-08, "loss": 0.0001, "step": 12950 }, { "epoch": 49.013103448275864, - "grad_norm": 0.0006630939897149801, + "grad_norm": 0.003018255578354001, "learning_rate": 7.662835249042146e-08, - "loss": 0.6707, + "loss": 0.2785, "step": 12960 }, { "epoch": 49.01386973180077, - "grad_norm": 0.00039140330045484006, + "grad_norm": 0.0003519069869071245, "learning_rate": 6.811409110259686e-08, - "loss": 0.0036, + "loss": 0.0916, "step": 12970 }, { "epoch": 49.01463601532567, - "grad_norm": 0.005970772821456194, + "grad_norm": 0.004012451972812414, "learning_rate": 5.9599829714772246e-08, - "loss": 0.659, + "loss": 0.0001, "step": 12980 }, { "epoch": 49.015402298850574, - "grad_norm": 0.0034339451231062412, + "grad_norm": 0.004512372892349958, "learning_rate": 5.108556832694764e-08, - "loss": 0.0012, + "loss": 0.0, "step": 12990 }, { "epoch": 49.01616858237548, - "grad_norm": 0.008482662960886955, + "grad_norm": 0.005976757034659386, "learning_rate": 4.2571306939123034e-08, - "loss": 0.0003, + "loss": 0.0002, "step": 13000 }, { "epoch": 49.01693486590038, - "grad_norm": 0.02817898616194725, + "grad_norm": 0.003032307606190443, "learning_rate": 3.405704555129843e-08, - "loss": 0.0001, + "loss": 0.0, "step": 13010 }, { "epoch": 49.017701149425285, - "grad_norm": 0.0010384070919826627, + "grad_norm": 0.0062565309926867485, "learning_rate": 2.554278416347382e-08, "loss": 0.0001, "step": 13020 }, { "epoch": 49.01846743295019, - "grad_norm": 0.0022189882583916187, + "grad_norm": 0.0034854146651923656, "learning_rate": 1.7028522775649215e-08, - "loss": 1.3365, + "loss": 0.0001, "step": 13030 }, { "epoch": 49.01923371647509, - "grad_norm": 0.00095372274518013, + "grad_norm": 0.0014749406836926937, "learning_rate": 8.514261387824608e-09, - "loss": 0.0001, + "loss": 0.0, "step": 13040 }, { "epoch": 49.02, - "grad_norm": 0.004288394935429096, + "grad_norm": 0.0031646606512367725, "learning_rate": 0.0, "loss": 0.0001, "step": 13050 }, { "epoch": 49.02, - "eval_accuracy": 0.6666666666666666, - "eval_loss": 2.40472412109375, - "eval_runtime": 20.6571, - "eval_samples_per_second": 2.178, - "eval_steps_per_second": 2.178, + "eval_accuracy": 0.6888888888888889, + "eval_loss": 2.576758623123169, + "eval_runtime": 18.3467, + "eval_samples_per_second": 2.453, + "eval_steps_per_second": 2.453, "step": 13050 }, { "epoch": 49.02, "step": 13050, "total_flos": 5.730289341462282e+19, - "train_loss": 0.6079807819900999, - "train_runtime": 11680.228, - "train_samples_per_second": 1.117, - "train_steps_per_second": 1.117 + "train_loss": 0.582058622205751, + "train_runtime": 11246.0234, + "train_samples_per_second": 1.16, + "train_steps_per_second": 1.16 }, { "epoch": 49.02, - "eval_accuracy": 0.7555555555555555, - "eval_loss": 1.6462864875793457, - "eval_runtime": 17.1783, - "eval_samples_per_second": 2.62, - "eval_steps_per_second": 2.62, + "eval_accuracy": 0.7333333333333333, + "eval_loss": 2.123791217803955, + "eval_runtime": 15.9202, + "eval_samples_per_second": 2.827, + "eval_steps_per_second": 2.827, "step": 13050 }, { "epoch": 49.02, - "eval_accuracy": 0.7555555555555555, - "eval_loss": 1.6462864875793457, - "eval_runtime": 17.129, - "eval_samples_per_second": 2.627, - "eval_steps_per_second": 2.627, + "eval_accuracy": 0.7333333333333333, + "eval_loss": 2.123790979385376, + "eval_runtime": 15.206, + "eval_samples_per_second": 2.959, + "eval_steps_per_second": 2.959, "step": 13050 } ],