diff --git "a/checkpoint-22453/trainer_state.json" "b/checkpoint-22453/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-22453/trainer_state.json" @@ -0,0 +1,6341 @@ +{ + "best_metric": 0.10316640883684158, + "best_model_checkpoint": "autotrain-ai-image-detect-20250107-2358/checkpoint-22453", + "epoch": 1.0, + "eval_steps": 500, + "global_step": 22453, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011134369571994833, + "grad_norm": 0.0017522681737318635, + "learning_rate": 3.999987764250782e-05, + "loss": 0.0449, + "step": 25 + }, + { + "epoch": 0.0022268739143989666, + "grad_norm": 2.625354051589966, + "learning_rate": 3.99995105715284e-05, + "loss": 0.0883, + "step": 50 + }, + { + "epoch": 0.00334031087159845, + "grad_norm": 0.9239026308059692, + "learning_rate": 3.9998898791553145e-05, + "loss": 0.1545, + "step": 75 + }, + { + "epoch": 0.004453747828797933, + "grad_norm": 0.01456692535430193, + "learning_rate": 3.9998042310067626e-05, + "loss": 0.0866, + "step": 100 + }, + { + "epoch": 0.005567184785997417, + "grad_norm": 0.08153713494539261, + "learning_rate": 3.999694113755154e-05, + "loss": 0.0635, + "step": 125 + }, + { + "epoch": 0.0066806217431969, + "grad_norm": 0.8362884521484375, + "learning_rate": 3.9995595287478564e-05, + "loss": 0.109, + "step": 150 + }, + { + "epoch": 0.007794058700396384, + "grad_norm": 0.1706877499818802, + "learning_rate": 3.999400477631618e-05, + "loss": 0.1077, + "step": 175 + }, + { + "epoch": 0.008907495657595866, + "grad_norm": 0.009436409920454025, + "learning_rate": 3.999216962352548e-05, + "loss": 0.0612, + "step": 200 + }, + { + "epoch": 0.01002093261479535, + "grad_norm": 0.10162385553121567, + "learning_rate": 3.9990089851560926e-05, + "loss": 0.0925, + "step": 225 + }, + { + "epoch": 0.011134369571994833, + "grad_norm": 0.8786261081695557, + "learning_rate": 3.99877654858701e-05, + "loss": 0.0724, + "step": 250 + }, + { + "epoch": 0.012247806529194317, + "grad_norm": 24.078943252563477, + "learning_rate": 3.998519655489336e-05, + "loss": 0.1674, + "step": 275 + }, + { + "epoch": 0.0133612434863938, + "grad_norm": 0.01407669484615326, + "learning_rate": 3.998238309006349e-05, + "loss": 0.0549, + "step": 300 + }, + { + "epoch": 0.014474680443593284, + "grad_norm": 0.011299420148134232, + "learning_rate": 3.997932512580533e-05, + "loss": 0.0264, + "step": 325 + }, + { + "epoch": 0.015588117400792767, + "grad_norm": 0.676752507686615, + "learning_rate": 3.997602269953539e-05, + "loss": 0.0436, + "step": 350 + }, + { + "epoch": 0.01670155435799225, + "grad_norm": 0.010241251438856125, + "learning_rate": 3.997247585166133e-05, + "loss": 0.063, + "step": 375 + }, + { + "epoch": 0.017814991315191733, + "grad_norm": 0.03709862753748894, + "learning_rate": 3.996868462558146e-05, + "loss": 0.1051, + "step": 400 + }, + { + "epoch": 0.018928428272391218, + "grad_norm": 0.1498664766550064, + "learning_rate": 3.996464906768429e-05, + "loss": 0.0495, + "step": 425 + }, + { + "epoch": 0.0200418652295907, + "grad_norm": 0.34423527121543884, + "learning_rate": 3.9960369227347905e-05, + "loss": 0.0948, + "step": 450 + }, + { + "epoch": 0.021155302186790185, + "grad_norm": 141.0285186767578, + "learning_rate": 3.995584515693934e-05, + "loss": 0.073, + "step": 475 + }, + { + "epoch": 0.022268739143989667, + "grad_norm": 1.1084500551223755, + "learning_rate": 3.995107691181399e-05, + "loss": 0.0735, + "step": 500 + }, + { + "epoch": 0.023382176101189152, + "grad_norm": 0.8122372031211853, + "learning_rate": 3.994606455031492e-05, + "loss": 0.0643, + "step": 525 + }, + { + "epoch": 0.024495613058388634, + "grad_norm": 0.2024221420288086, + "learning_rate": 3.9940808133772114e-05, + "loss": 0.0483, + "step": 550 + }, + { + "epoch": 0.02560905001558812, + "grad_norm": 0.052694253623485565, + "learning_rate": 3.9935307726501774e-05, + "loss": 0.1225, + "step": 575 + }, + { + "epoch": 0.0267224869727876, + "grad_norm": 0.2362474501132965, + "learning_rate": 3.99295633958055e-05, + "loss": 0.0988, + "step": 600 + }, + { + "epoch": 0.027835923929987082, + "grad_norm": 0.009827379137277603, + "learning_rate": 3.992357521196948e-05, + "loss": 0.0966, + "step": 625 + }, + { + "epoch": 0.028949360887186568, + "grad_norm": 8.952296257019043, + "learning_rate": 3.991734324826364e-05, + "loss": 0.1072, + "step": 650 + }, + { + "epoch": 0.03006279784438605, + "grad_norm": 7.652793884277344, + "learning_rate": 3.9910867580940705e-05, + "loss": 0.117, + "step": 675 + }, + { + "epoch": 0.031176234801585535, + "grad_norm": 27.530536651611328, + "learning_rate": 3.990414828923533e-05, + "loss": 0.157, + "step": 700 + }, + { + "epoch": 0.03228967175878502, + "grad_norm": 0.032850898802280426, + "learning_rate": 3.989718545536309e-05, + "loss": 0.054, + "step": 725 + }, + { + "epoch": 0.0334031087159845, + "grad_norm": 3.0997722148895264, + "learning_rate": 3.988997916451946e-05, + "loss": 0.078, + "step": 750 + }, + { + "epoch": 0.03451654567318398, + "grad_norm": 70.86727905273438, + "learning_rate": 3.9882529504878815e-05, + "loss": 0.129, + "step": 775 + }, + { + "epoch": 0.035629982630383465, + "grad_norm": 0.058145686984062195, + "learning_rate": 3.987483656759332e-05, + "loss": 0.0496, + "step": 800 + }, + { + "epoch": 0.036743419587582954, + "grad_norm": 0.1561250239610672, + "learning_rate": 3.986690044679183e-05, + "loss": 0.0652, + "step": 825 + }, + { + "epoch": 0.037856856544782436, + "grad_norm": 5.164768695831299, + "learning_rate": 3.9858721239578725e-05, + "loss": 0.0951, + "step": 850 + }, + { + "epoch": 0.03897029350198192, + "grad_norm": 0.022487835958600044, + "learning_rate": 3.9850299046032735e-05, + "loss": 0.0954, + "step": 875 + }, + { + "epoch": 0.0400837304591814, + "grad_norm": 15.329583168029785, + "learning_rate": 3.984163396920571e-05, + "loss": 0.0913, + "step": 900 + }, + { + "epoch": 0.04119716741638089, + "grad_norm": 0.03666286543011665, + "learning_rate": 3.983272611512135e-05, + "loss": 0.0761, + "step": 925 + }, + { + "epoch": 0.04231060437358037, + "grad_norm": 0.03395090997219086, + "learning_rate": 3.982357559277393e-05, + "loss": 0.1114, + "step": 950 + }, + { + "epoch": 0.04342404133077985, + "grad_norm": 0.05037323012948036, + "learning_rate": 3.981418251412694e-05, + "loss": 0.1333, + "step": 975 + }, + { + "epoch": 0.04453747828797933, + "grad_norm": 0.0885915756225586, + "learning_rate": 3.980454699411175e-05, + "loss": 0.0932, + "step": 1000 + }, + { + "epoch": 0.045650915245178815, + "grad_norm": 0.27377429604530334, + "learning_rate": 3.979466915062615e-05, + "loss": 0.096, + "step": 1025 + }, + { + "epoch": 0.046764352202378304, + "grad_norm": 0.03340262174606323, + "learning_rate": 3.978454910453296e-05, + "loss": 0.0788, + "step": 1050 + }, + { + "epoch": 0.047877789159577785, + "grad_norm": 1.4525907039642334, + "learning_rate": 3.9774186979658525e-05, + "loss": 0.0303, + "step": 1075 + }, + { + "epoch": 0.04899122611677727, + "grad_norm": 0.0045611136592924595, + "learning_rate": 3.976358290279121e-05, + "loss": 0.0807, + "step": 1100 + }, + { + "epoch": 0.05010466307397675, + "grad_norm": 0.43597421050071716, + "learning_rate": 3.975273700367985e-05, + "loss": 0.0809, + "step": 1125 + }, + { + "epoch": 0.05121810003117624, + "grad_norm": 0.049976006150245667, + "learning_rate": 3.9741649415032124e-05, + "loss": 0.1176, + "step": 1150 + }, + { + "epoch": 0.05233153698837572, + "grad_norm": 0.09874877333641052, + "learning_rate": 3.9730320272513006e-05, + "loss": 0.0726, + "step": 1175 + }, + { + "epoch": 0.0534449739455752, + "grad_norm": 0.05045215040445328, + "learning_rate": 3.9718749714743034e-05, + "loss": 0.1028, + "step": 1200 + }, + { + "epoch": 0.05455841090277468, + "grad_norm": 0.13014136254787445, + "learning_rate": 3.970693788329665e-05, + "loss": 0.1244, + "step": 1225 + }, + { + "epoch": 0.055671847859974165, + "grad_norm": 0.05505981668829918, + "learning_rate": 3.969488492270046e-05, + "loss": 0.1089, + "step": 1250 + }, + { + "epoch": 0.056785284817173654, + "grad_norm": 0.01733974926173687, + "learning_rate": 3.968259098043147e-05, + "loss": 0.1082, + "step": 1275 + }, + { + "epoch": 0.057898721774373135, + "grad_norm": 7.767026901245117, + "learning_rate": 3.967005620691528e-05, + "loss": 0.1032, + "step": 1300 + }, + { + "epoch": 0.05901215873157262, + "grad_norm": 10.336471557617188, + "learning_rate": 3.9657280755524227e-05, + "loss": 0.0797, + "step": 1325 + }, + { + "epoch": 0.0601255956887721, + "grad_norm": 0.03750215098261833, + "learning_rate": 3.964426478257554e-05, + "loss": 0.0697, + "step": 1350 + }, + { + "epoch": 0.06123903264597159, + "grad_norm": 15.773834228515625, + "learning_rate": 3.963100844732939e-05, + "loss": 0.0839, + "step": 1375 + }, + { + "epoch": 0.06235246960317107, + "grad_norm": 78.63215637207031, + "learning_rate": 3.9617511911986966e-05, + "loss": 0.1368, + "step": 1400 + }, + { + "epoch": 0.06346590656037056, + "grad_norm": 0.0823257565498352, + "learning_rate": 3.960377534168851e-05, + "loss": 0.1027, + "step": 1425 + }, + { + "epoch": 0.06457934351757004, + "grad_norm": 56.04353332519531, + "learning_rate": 3.9589798904511234e-05, + "loss": 0.0504, + "step": 1450 + }, + { + "epoch": 0.06569278047476952, + "grad_norm": 2.2531938552856445, + "learning_rate": 3.957558277146732e-05, + "loss": 0.1318, + "step": 1475 + }, + { + "epoch": 0.066806217431969, + "grad_norm": 0.08561531454324722, + "learning_rate": 3.9561127116501816e-05, + "loss": 0.0387, + "step": 1500 + }, + { + "epoch": 0.06791965438916849, + "grad_norm": 0.03525933623313904, + "learning_rate": 3.954643211649048e-05, + "loss": 0.1259, + "step": 1525 + }, + { + "epoch": 0.06903309134636797, + "grad_norm": 26.943761825561523, + "learning_rate": 3.953149795123766e-05, + "loss": 0.1124, + "step": 1550 + }, + { + "epoch": 0.07014652830356745, + "grad_norm": 17.139528274536133, + "learning_rate": 3.951632480347404e-05, + "loss": 0.0953, + "step": 1575 + }, + { + "epoch": 0.07125996526076693, + "grad_norm": 13.074729919433594, + "learning_rate": 3.950091285885447e-05, + "loss": 0.1227, + "step": 1600 + }, + { + "epoch": 0.07237340221796641, + "grad_norm": 0.05087602138519287, + "learning_rate": 3.948526230595562e-05, + "loss": 0.0878, + "step": 1625 + }, + { + "epoch": 0.07348683917516591, + "grad_norm": 11.556482315063477, + "learning_rate": 3.9469373336273745e-05, + "loss": 0.0345, + "step": 1650 + }, + { + "epoch": 0.07460027613236539, + "grad_norm": 0.24067726731300354, + "learning_rate": 3.9453246144222283e-05, + "loss": 0.0578, + "step": 1675 + }, + { + "epoch": 0.07571371308956487, + "grad_norm": 1.8344037532806396, + "learning_rate": 3.9436880927129515e-05, + "loss": 0.0968, + "step": 1700 + }, + { + "epoch": 0.07682715004676435, + "grad_norm": 23.347782135009766, + "learning_rate": 3.942027788523614e-05, + "loss": 0.1451, + "step": 1725 + }, + { + "epoch": 0.07794058700396383, + "grad_norm": 45.14012145996094, + "learning_rate": 3.9403437221692806e-05, + "loss": 0.1348, + "step": 1750 + }, + { + "epoch": 0.07905402396116332, + "grad_norm": 35.514225006103516, + "learning_rate": 3.938635914255766e-05, + "loss": 0.147, + "step": 1775 + }, + { + "epoch": 0.0801674609183628, + "grad_norm": 0.09194282442331314, + "learning_rate": 3.9369043856793775e-05, + "loss": 0.0951, + "step": 1800 + }, + { + "epoch": 0.08128089787556228, + "grad_norm": 33.31999969482422, + "learning_rate": 3.9351491576266675e-05, + "loss": 0.0485, + "step": 1825 + }, + { + "epoch": 0.08239433483276178, + "grad_norm": 0.5410212278366089, + "learning_rate": 3.933370251574163e-05, + "loss": 0.0664, + "step": 1850 + }, + { + "epoch": 0.08350777178996126, + "grad_norm": 25.731027603149414, + "learning_rate": 3.931567689288115e-05, + "loss": 0.1271, + "step": 1875 + }, + { + "epoch": 0.08462120874716074, + "grad_norm": 0.052043963223695755, + "learning_rate": 3.929741492824222e-05, + "loss": 0.0841, + "step": 1900 + }, + { + "epoch": 0.08573464570436022, + "grad_norm": 0.07185567915439606, + "learning_rate": 3.9278916845273665e-05, + "loss": 0.1116, + "step": 1925 + }, + { + "epoch": 0.0868480826615597, + "grad_norm": 54.373497009277344, + "learning_rate": 3.926018287031339e-05, + "loss": 0.0396, + "step": 1950 + }, + { + "epoch": 0.08796151961875918, + "grad_norm": 0.1579923927783966, + "learning_rate": 3.9241213232585607e-05, + "loss": 0.1455, + "step": 1975 + }, + { + "epoch": 0.08907495657595867, + "grad_norm": 0.19795724749565125, + "learning_rate": 3.922200816419806e-05, + "loss": 0.1301, + "step": 2000 + }, + { + "epoch": 0.09018839353315815, + "grad_norm": 10.653286933898926, + "learning_rate": 3.9202567900139134e-05, + "loss": 0.0711, + "step": 2025 + }, + { + "epoch": 0.09130183049035763, + "grad_norm": 0.07394363731145859, + "learning_rate": 3.918289267827504e-05, + "loss": 0.0728, + "step": 2050 + }, + { + "epoch": 0.09241526744755713, + "grad_norm": 0.03366103768348694, + "learning_rate": 3.916298273934685e-05, + "loss": 0.1524, + "step": 2075 + }, + { + "epoch": 0.09352870440475661, + "grad_norm": 0.06265176832675934, + "learning_rate": 3.9142838326967584e-05, + "loss": 0.086, + "step": 2100 + }, + { + "epoch": 0.09464214136195609, + "grad_norm": 0.4850364327430725, + "learning_rate": 3.9122459687619224e-05, + "loss": 0.0464, + "step": 2125 + }, + { + "epoch": 0.09575557831915557, + "grad_norm": 0.05570212006568909, + "learning_rate": 3.910184707064968e-05, + "loss": 0.0935, + "step": 2150 + }, + { + "epoch": 0.09686901527635505, + "grad_norm": 0.46077781915664673, + "learning_rate": 3.908100072826977e-05, + "loss": 0.0563, + "step": 2175 + }, + { + "epoch": 0.09798245223355453, + "grad_norm": 0.29431354999542236, + "learning_rate": 3.905992091555012e-05, + "loss": 0.1255, + "step": 2200 + }, + { + "epoch": 0.09909588919075402, + "grad_norm": 12.313212394714355, + "learning_rate": 3.903860789041802e-05, + "loss": 0.1565, + "step": 2225 + }, + { + "epoch": 0.1002093261479535, + "grad_norm": 2.2681148052215576, + "learning_rate": 3.9017061913654314e-05, + "loss": 0.1254, + "step": 2250 + }, + { + "epoch": 0.10132276310515298, + "grad_norm": 0.29880329966545105, + "learning_rate": 3.8995283248890145e-05, + "loss": 0.055, + "step": 2275 + }, + { + "epoch": 0.10243620006235248, + "grad_norm": 171.1862335205078, + "learning_rate": 3.897327216260383e-05, + "loss": 0.0868, + "step": 2300 + }, + { + "epoch": 0.10354963701955196, + "grad_norm": 10.678813934326172, + "learning_rate": 3.895102892411747e-05, + "loss": 0.1091, + "step": 2325 + }, + { + "epoch": 0.10466307397675144, + "grad_norm": 11.668232917785645, + "learning_rate": 3.892855380559376e-05, + "loss": 0.1167, + "step": 2350 + }, + { + "epoch": 0.10577651093395092, + "grad_norm": 0.5463631749153137, + "learning_rate": 3.8905847082032625e-05, + "loss": 0.1234, + "step": 2375 + }, + { + "epoch": 0.1068899478911504, + "grad_norm": 0.034784119576215744, + "learning_rate": 3.888290903126783e-05, + "loss": 0.197, + "step": 2400 + }, + { + "epoch": 0.10800338484834988, + "grad_norm": 0.06954146921634674, + "learning_rate": 3.885973993396362e-05, + "loss": 0.0356, + "step": 2425 + }, + { + "epoch": 0.10911682180554937, + "grad_norm": 0.052475456148386, + "learning_rate": 3.8836340073611245e-05, + "loss": 0.0923, + "step": 2450 + }, + { + "epoch": 0.11023025876274885, + "grad_norm": 0.033263199031353, + "learning_rate": 3.881270973652554e-05, + "loss": 0.1079, + "step": 2475 + }, + { + "epoch": 0.11134369571994833, + "grad_norm": 0.024731379002332687, + "learning_rate": 3.8788849211841386e-05, + "loss": 0.0463, + "step": 2500 + }, + { + "epoch": 0.11245713267714783, + "grad_norm": 0.13399425148963928, + "learning_rate": 3.8764758791510165e-05, + "loss": 0.0258, + "step": 2525 + }, + { + "epoch": 0.11357056963434731, + "grad_norm": 0.03367041051387787, + "learning_rate": 3.874043877029623e-05, + "loss": 0.1288, + "step": 2550 + }, + { + "epoch": 0.11468400659154679, + "grad_norm": 0.12984095513820648, + "learning_rate": 3.8715889445773263e-05, + "loss": 0.1032, + "step": 2575 + }, + { + "epoch": 0.11579744354874627, + "grad_norm": 4.606603622436523, + "learning_rate": 3.869111111832063e-05, + "loss": 0.0668, + "step": 2600 + }, + { + "epoch": 0.11691088050594575, + "grad_norm": 0.2087305188179016, + "learning_rate": 3.866610409111974e-05, + "loss": 0.1088, + "step": 2625 + }, + { + "epoch": 0.11802431746314523, + "grad_norm": 0.05369157716631889, + "learning_rate": 3.864086867015031e-05, + "loss": 0.048, + "step": 2650 + }, + { + "epoch": 0.11913775442034472, + "grad_norm": 0.21587111055850983, + "learning_rate": 3.861540516418663e-05, + "loss": 0.0999, + "step": 2675 + }, + { + "epoch": 0.1202511913775442, + "grad_norm": 0.8712963461875916, + "learning_rate": 3.858971388479375e-05, + "loss": 0.1256, + "step": 2700 + }, + { + "epoch": 0.1213646283347437, + "grad_norm": 0.18013902008533478, + "learning_rate": 3.8563795146323733e-05, + "loss": 0.1284, + "step": 2725 + }, + { + "epoch": 0.12247806529194318, + "grad_norm": 265.93426513671875, + "learning_rate": 3.8537649265911766e-05, + "loss": 0.1636, + "step": 2750 + }, + { + "epoch": 0.12359150224914266, + "grad_norm": 7.281203746795654, + "learning_rate": 3.851127656347228e-05, + "loss": 0.1206, + "step": 2775 + }, + { + "epoch": 0.12470493920634214, + "grad_norm": 0.5756829977035522, + "learning_rate": 3.8484677361695054e-05, + "loss": 0.158, + "step": 2800 + }, + { + "epoch": 0.1258183761635416, + "grad_norm": 67.49952697753906, + "learning_rate": 3.845785198604125e-05, + "loss": 0.0772, + "step": 2825 + }, + { + "epoch": 0.12693181312074112, + "grad_norm": 30.2692813873291, + "learning_rate": 3.843080076473943e-05, + "loss": 0.1661, + "step": 2850 + }, + { + "epoch": 0.1280452500779406, + "grad_norm": 0.4516218602657318, + "learning_rate": 3.840352402878156e-05, + "loss": 0.0842, + "step": 2875 + }, + { + "epoch": 0.12915868703514008, + "grad_norm": 10.178808212280273, + "learning_rate": 3.837602211191894e-05, + "loss": 0.1355, + "step": 2900 + }, + { + "epoch": 0.13027212399233956, + "grad_norm": 0.2553464472293854, + "learning_rate": 3.834829535065812e-05, + "loss": 0.1288, + "step": 2925 + }, + { + "epoch": 0.13138556094953904, + "grad_norm": 15.92351245880127, + "learning_rate": 3.832034408425682e-05, + "loss": 0.1105, + "step": 2950 + }, + { + "epoch": 0.13249899790673852, + "grad_norm": 0.41279423236846924, + "learning_rate": 3.82921686547197e-05, + "loss": 0.0394, + "step": 2975 + }, + { + "epoch": 0.133612434863938, + "grad_norm": 0.03520906716585159, + "learning_rate": 3.826376940679426e-05, + "loss": 0.118, + "step": 3000 + }, + { + "epoch": 0.1347258718211375, + "grad_norm": 156.47885131835938, + "learning_rate": 3.823514668796657e-05, + "loss": 0.0569, + "step": 3025 + }, + { + "epoch": 0.13583930877833697, + "grad_norm": 0.1534019112586975, + "learning_rate": 3.8206300848457054e-05, + "loss": 0.1154, + "step": 3050 + }, + { + "epoch": 0.13695274573553645, + "grad_norm": 14.097362518310547, + "learning_rate": 3.817723224121616e-05, + "loss": 0.0839, + "step": 3075 + }, + { + "epoch": 0.13806618269273593, + "grad_norm": 0.008445854298770428, + "learning_rate": 3.814794122192008e-05, + "loss": 0.0128, + "step": 3100 + }, + { + "epoch": 0.13917961964993542, + "grad_norm": 0.1724538505077362, + "learning_rate": 3.811842814896637e-05, + "loss": 0.099, + "step": 3125 + }, + { + "epoch": 0.1402930566071349, + "grad_norm": 9.441699981689453, + "learning_rate": 3.8088693383469604e-05, + "loss": 0.1023, + "step": 3150 + }, + { + "epoch": 0.14140649356433438, + "grad_norm": 22.488603591918945, + "learning_rate": 3.805873728925691e-05, + "loss": 0.1259, + "step": 3175 + }, + { + "epoch": 0.14251993052153386, + "grad_norm": 1.3374933004379272, + "learning_rate": 3.802856023286354e-05, + "loss": 0.0282, + "step": 3200 + }, + { + "epoch": 0.14363336747873334, + "grad_norm": 0.08540292829275131, + "learning_rate": 3.799816258352839e-05, + "loss": 0.1088, + "step": 3225 + }, + { + "epoch": 0.14474680443593282, + "grad_norm": 0.20437736809253693, + "learning_rate": 3.796754471318948e-05, + "loss": 0.1092, + "step": 3250 + }, + { + "epoch": 0.14586024139313233, + "grad_norm": 9.652408599853516, + "learning_rate": 3.7936706996479383e-05, + "loss": 0.1386, + "step": 3275 + }, + { + "epoch": 0.14697367835033182, + "grad_norm": 0.07140795141458511, + "learning_rate": 3.7905649810720685e-05, + "loss": 0.0751, + "step": 3300 + }, + { + "epoch": 0.1480871153075313, + "grad_norm": 0.33767127990722656, + "learning_rate": 3.78743735359213e-05, + "loss": 0.0276, + "step": 3325 + }, + { + "epoch": 0.14920055226473078, + "grad_norm": 12.312952995300293, + "learning_rate": 3.78428785547699e-05, + "loss": 0.1186, + "step": 3350 + }, + { + "epoch": 0.15031398922193026, + "grad_norm": 0.3609814941883087, + "learning_rate": 3.781116525263117e-05, + "loss": 0.0902, + "step": 3375 + }, + { + "epoch": 0.15142742617912974, + "grad_norm": 2.5911900997161865, + "learning_rate": 3.7779234017541116e-05, + "loss": 0.0772, + "step": 3400 + }, + { + "epoch": 0.15254086313632922, + "grad_norm": 0.28325122594833374, + "learning_rate": 3.7747085240202324e-05, + "loss": 0.0747, + "step": 3425 + }, + { + "epoch": 0.1536543000935287, + "grad_norm": 0.08380604535341263, + "learning_rate": 3.771471931397918e-05, + "loss": 0.0432, + "step": 3450 + }, + { + "epoch": 0.1547677370507282, + "grad_norm": 14.243409156799316, + "learning_rate": 3.768213663489304e-05, + "loss": 0.0615, + "step": 3475 + }, + { + "epoch": 0.15588117400792767, + "grad_norm": 56.65135955810547, + "learning_rate": 3.764933760161739e-05, + "loss": 0.0659, + "step": 3500 + }, + { + "epoch": 0.15699461096512715, + "grad_norm": 0.15652716159820557, + "learning_rate": 3.761632261547297e-05, + "loss": 0.1269, + "step": 3525 + }, + { + "epoch": 0.15810804792232663, + "grad_norm": 7.383078575134277, + "learning_rate": 3.758309208042288e-05, + "loss": 0.1109, + "step": 3550 + }, + { + "epoch": 0.15922148487952612, + "grad_norm": 95.15608978271484, + "learning_rate": 3.754964640306761e-05, + "loss": 0.146, + "step": 3575 + }, + { + "epoch": 0.1603349218367256, + "grad_norm": 0.1545642465353012, + "learning_rate": 3.751598599264008e-05, + "loss": 0.0986, + "step": 3600 + }, + { + "epoch": 0.16144835879392508, + "grad_norm": 0.13380536437034607, + "learning_rate": 3.748211126100063e-05, + "loss": 0.1089, + "step": 3625 + }, + { + "epoch": 0.16256179575112456, + "grad_norm": 22.723392486572266, + "learning_rate": 3.744802262263198e-05, + "loss": 0.1362, + "step": 3650 + }, + { + "epoch": 0.16367523270832404, + "grad_norm": 19.4218807220459, + "learning_rate": 3.741372049463417e-05, + "loss": 0.1095, + "step": 3675 + }, + { + "epoch": 0.16478866966552355, + "grad_norm": 17.95904541015625, + "learning_rate": 3.737920529671942e-05, + "loss": 0.0712, + "step": 3700 + }, + { + "epoch": 0.16590210662272303, + "grad_norm": 1.368179440498352, + "learning_rate": 3.734447745120704e-05, + "loss": 0.1154, + "step": 3725 + }, + { + "epoch": 0.16701554357992252, + "grad_norm": 0.02144320122897625, + "learning_rate": 3.730953738301826e-05, + "loss": 0.0832, + "step": 3750 + }, + { + "epoch": 0.168128980537122, + "grad_norm": 1.7905651330947876, + "learning_rate": 3.727438551967096e-05, + "loss": 0.0824, + "step": 3775 + }, + { + "epoch": 0.16924241749432148, + "grad_norm": 0.22187261283397675, + "learning_rate": 3.7239022291274546e-05, + "loss": 0.1902, + "step": 3800 + }, + { + "epoch": 0.17035585445152096, + "grad_norm": 0.06837914884090424, + "learning_rate": 3.72034481305246e-05, + "loss": 0.0784, + "step": 3825 + }, + { + "epoch": 0.17146929140872044, + "grad_norm": 0.0925784558057785, + "learning_rate": 3.7167663472697645e-05, + "loss": 0.0687, + "step": 3850 + }, + { + "epoch": 0.17258272836591992, + "grad_norm": 0.2474130392074585, + "learning_rate": 3.713166875564577e-05, + "loss": 0.1039, + "step": 3875 + }, + { + "epoch": 0.1736961653231194, + "grad_norm": 0.6533392071723938, + "learning_rate": 3.70954644197913e-05, + "loss": 0.0853, + "step": 3900 + }, + { + "epoch": 0.1748096022803189, + "grad_norm": 13.996048927307129, + "learning_rate": 3.705905090812143e-05, + "loss": 0.0873, + "step": 3925 + }, + { + "epoch": 0.17592303923751837, + "grad_norm": 15.874128341674805, + "learning_rate": 3.7022428666182736e-05, + "loss": 0.0953, + "step": 3950 + }, + { + "epoch": 0.17703647619471785, + "grad_norm": 189.27093505859375, + "learning_rate": 3.69855981420758e-05, + "loss": 0.1689, + "step": 3975 + }, + { + "epoch": 0.17814991315191733, + "grad_norm": 2.6344797611236572, + "learning_rate": 3.694855978644967e-05, + "loss": 0.0615, + "step": 4000 + }, + { + "epoch": 0.17926335010911681, + "grad_norm": 0.16542290151119232, + "learning_rate": 3.6911314052496376e-05, + "loss": 0.0705, + "step": 4025 + }, + { + "epoch": 0.1803767870663163, + "grad_norm": 0.18003667891025543, + "learning_rate": 3.687386139594539e-05, + "loss": 0.0997, + "step": 4050 + }, + { + "epoch": 0.18149022402351578, + "grad_norm": 16.793657302856445, + "learning_rate": 3.683620227505801e-05, + "loss": 0.0762, + "step": 4075 + }, + { + "epoch": 0.18260366098071526, + "grad_norm": 19.98533821105957, + "learning_rate": 3.67983371506218e-05, + "loss": 0.1019, + "step": 4100 + }, + { + "epoch": 0.18371709793791474, + "grad_norm": 11.568763732910156, + "learning_rate": 3.676026648594494e-05, + "loss": 0.0991, + "step": 4125 + }, + { + "epoch": 0.18483053489511425, + "grad_norm": 0.13966935873031616, + "learning_rate": 3.6721990746850516e-05, + "loss": 0.0766, + "step": 4150 + }, + { + "epoch": 0.18594397185231373, + "grad_norm": 9.846770286560059, + "learning_rate": 3.668351040167088e-05, + "loss": 0.1473, + "step": 4175 + }, + { + "epoch": 0.18705740880951321, + "grad_norm": 0.05613240599632263, + "learning_rate": 3.664482592124189e-05, + "loss": 0.0633, + "step": 4200 + }, + { + "epoch": 0.1881708457667127, + "grad_norm": 0.24261285364627838, + "learning_rate": 3.6605937778897145e-05, + "loss": 0.0994, + "step": 4225 + }, + { + "epoch": 0.18928428272391218, + "grad_norm": 0.037253621965646744, + "learning_rate": 3.6566846450462194e-05, + "loss": 0.073, + "step": 4250 + }, + { + "epoch": 0.19039771968111166, + "grad_norm": 0.6551414728164673, + "learning_rate": 3.652755241424875e-05, + "loss": 0.155, + "step": 4275 + }, + { + "epoch": 0.19151115663831114, + "grad_norm": 1.646484136581421, + "learning_rate": 3.6488056151048756e-05, + "loss": 0.0776, + "step": 4300 + }, + { + "epoch": 0.19262459359551062, + "grad_norm": 306.798095703125, + "learning_rate": 3.6448358144128606e-05, + "loss": 0.1294, + "step": 4325 + }, + { + "epoch": 0.1937380305527101, + "grad_norm": 0.2583596110343933, + "learning_rate": 3.640845887922315e-05, + "loss": 0.0403, + "step": 4350 + }, + { + "epoch": 0.1948514675099096, + "grad_norm": 0.03823433443903923, + "learning_rate": 3.636835884452978e-05, + "loss": 0.1147, + "step": 4375 + }, + { + "epoch": 0.19596490446710907, + "grad_norm": 0.09104909747838974, + "learning_rate": 3.6328058530702476e-05, + "loss": 0.0611, + "step": 4400 + }, + { + "epoch": 0.19707834142430855, + "grad_norm": 13.756211280822754, + "learning_rate": 3.6287558430845775e-05, + "loss": 0.1169, + "step": 4425 + }, + { + "epoch": 0.19819177838150803, + "grad_norm": 0.013652303256094456, + "learning_rate": 3.624685904050873e-05, + "loss": 0.0528, + "step": 4450 + }, + { + "epoch": 0.19930521533870751, + "grad_norm": 0.36829623579978943, + "learning_rate": 3.620596085767887e-05, + "loss": 0.0996, + "step": 4475 + }, + { + "epoch": 0.200418652295907, + "grad_norm": 0.16198916733264923, + "learning_rate": 3.616486438277612e-05, + "loss": 0.074, + "step": 4500 + }, + { + "epoch": 0.20153208925310648, + "grad_norm": 0.01803933084011078, + "learning_rate": 3.6123570118646635e-05, + "loss": 0.1303, + "step": 4525 + }, + { + "epoch": 0.20264552621030596, + "grad_norm": 11.962908744812012, + "learning_rate": 3.608207857055667e-05, + "loss": 0.1169, + "step": 4550 + }, + { + "epoch": 0.20375896316750547, + "grad_norm": 0.08494125306606293, + "learning_rate": 3.60403902461864e-05, + "loss": 0.1349, + "step": 4575 + }, + { + "epoch": 0.20487240012470495, + "grad_norm": 0.7877364158630371, + "learning_rate": 3.599850565562372e-05, + "loss": 0.108, + "step": 4600 + }, + { + "epoch": 0.20598583708190443, + "grad_norm": 90.93081665039062, + "learning_rate": 3.595642531135796e-05, + "loss": 0.1446, + "step": 4625 + }, + { + "epoch": 0.20709927403910391, + "grad_norm": 0.3855713903903961, + "learning_rate": 3.591414972827368e-05, + "loss": 0.0711, + "step": 4650 + }, + { + "epoch": 0.2082127109963034, + "grad_norm": 0.07253967225551605, + "learning_rate": 3.587167942364429e-05, + "loss": 0.0954, + "step": 4675 + }, + { + "epoch": 0.20932614795350288, + "grad_norm": 11.73831558227539, + "learning_rate": 3.58290149171258e-05, + "loss": 0.0758, + "step": 4700 + }, + { + "epoch": 0.21043958491070236, + "grad_norm": 0.1233116090297699, + "learning_rate": 3.5786156730750414e-05, + "loss": 0.0584, + "step": 4725 + }, + { + "epoch": 0.21155302186790184, + "grad_norm": 0.07078699767589569, + "learning_rate": 3.5743105388920145e-05, + "loss": 0.0915, + "step": 4750 + }, + { + "epoch": 0.21266645882510132, + "grad_norm": 19.74798583984375, + "learning_rate": 3.5699861418400415e-05, + "loss": 0.1626, + "step": 4775 + }, + { + "epoch": 0.2137798957823008, + "grad_norm": 0.04195041209459305, + "learning_rate": 3.565642534831361e-05, + "loss": 0.0012, + "step": 4800 + }, + { + "epoch": 0.2148933327395003, + "grad_norm": 75.83612060546875, + "learning_rate": 3.5612797710132586e-05, + "loss": 0.1313, + "step": 4825 + }, + { + "epoch": 0.21600676969669977, + "grad_norm": 0.07214100658893585, + "learning_rate": 3.556897903767419e-05, + "loss": 0.1026, + "step": 4850 + }, + { + "epoch": 0.21712020665389925, + "grad_norm": 0.12980283796787262, + "learning_rate": 3.552496986709269e-05, + "loss": 0.0828, + "step": 4875 + }, + { + "epoch": 0.21823364361109873, + "grad_norm": 0.013344530016183853, + "learning_rate": 3.5480770736873275e-05, + "loss": 0.083, + "step": 4900 + }, + { + "epoch": 0.2193470805682982, + "grad_norm": 3.2380993366241455, + "learning_rate": 3.5436382187825425e-05, + "loss": 0.1372, + "step": 4925 + }, + { + "epoch": 0.2204605175254977, + "grad_norm": 0.045397862792015076, + "learning_rate": 3.5391804763076275e-05, + "loss": 0.0526, + "step": 4950 + }, + { + "epoch": 0.22157395448269718, + "grad_norm": 0.09334590286016464, + "learning_rate": 3.534703900806404e-05, + "loss": 0.1112, + "step": 4975 + }, + { + "epoch": 0.22268739143989666, + "grad_norm": 0.02267163060605526, + "learning_rate": 3.530208547053124e-05, + "loss": 0.121, + "step": 5000 + }, + { + "epoch": 0.22380082839709617, + "grad_norm": 0.24515791237354279, + "learning_rate": 3.5256944700518106e-05, + "loss": 0.0859, + "step": 5025 + }, + { + "epoch": 0.22491426535429565, + "grad_norm": 0.4689369201660156, + "learning_rate": 3.521161725035579e-05, + "loss": 0.107, + "step": 5050 + }, + { + "epoch": 0.22602770231149513, + "grad_norm": 0.13337279856204987, + "learning_rate": 3.5166103674659584e-05, + "loss": 0.071, + "step": 5075 + }, + { + "epoch": 0.22714113926869461, + "grad_norm": 67.71549224853516, + "learning_rate": 3.512040453032219e-05, + "loss": 0.0726, + "step": 5100 + }, + { + "epoch": 0.2282545762258941, + "grad_norm": 0.20219382643699646, + "learning_rate": 3.507452037650689e-05, + "loss": 0.1498, + "step": 5125 + }, + { + "epoch": 0.22936801318309358, + "grad_norm": 0.007606521248817444, + "learning_rate": 3.502845177464068e-05, + "loss": 0.0418, + "step": 5150 + }, + { + "epoch": 0.23048145014029306, + "grad_norm": 0.3652168810367584, + "learning_rate": 3.49821992884074e-05, + "loss": 0.0834, + "step": 5175 + }, + { + "epoch": 0.23159488709749254, + "grad_norm": 11.301214218139648, + "learning_rate": 3.49357634837409e-05, + "loss": 0.0726, + "step": 5200 + }, + { + "epoch": 0.23270832405469202, + "grad_norm": 9.451020240783691, + "learning_rate": 3.4889144928818015e-05, + "loss": 0.1142, + "step": 5225 + }, + { + "epoch": 0.2338217610118915, + "grad_norm": 0.5295103192329407, + "learning_rate": 3.4842344194051715e-05, + "loss": 0.099, + "step": 5250 + }, + { + "epoch": 0.234935197969091, + "grad_norm": 0.1344202160835266, + "learning_rate": 3.479536185208404e-05, + "loss": 0.1088, + "step": 5275 + }, + { + "epoch": 0.23604863492629047, + "grad_norm": 0.0730552077293396, + "learning_rate": 3.474819847777915e-05, + "loss": 0.0902, + "step": 5300 + }, + { + "epoch": 0.23716207188348995, + "grad_norm": 22.340232849121094, + "learning_rate": 3.470085464821626e-05, + "loss": 0.0695, + "step": 5325 + }, + { + "epoch": 0.23827550884068943, + "grad_norm": 0.012485889717936516, + "learning_rate": 3.46533309426826e-05, + "loss": 0.1368, + "step": 5350 + }, + { + "epoch": 0.2393889457978889, + "grad_norm": 0.07602645456790924, + "learning_rate": 3.460562794266631e-05, + "loss": 0.0963, + "step": 5375 + }, + { + "epoch": 0.2405023827550884, + "grad_norm": 0.028351513668894768, + "learning_rate": 3.455774623184933e-05, + "loss": 0.0761, + "step": 5400 + }, + { + "epoch": 0.24161581971228788, + "grad_norm": 0.04493757709860802, + "learning_rate": 3.450968639610028e-05, + "loss": 0.0967, + "step": 5425 + }, + { + "epoch": 0.2427292566694874, + "grad_norm": 0.04149246960878372, + "learning_rate": 3.4461449023467254e-05, + "loss": 0.0748, + "step": 5450 + }, + { + "epoch": 0.24384269362668687, + "grad_norm": 0.18419454991817474, + "learning_rate": 3.441303470417063e-05, + "loss": 0.0576, + "step": 5475 + }, + { + "epoch": 0.24495613058388635, + "grad_norm": 5.215043067932129, + "learning_rate": 3.4364444030595893e-05, + "loss": 0.0965, + "step": 5500 + }, + { + "epoch": 0.24606956754108583, + "grad_norm": 0.19195443391799927, + "learning_rate": 3.4315677597286344e-05, + "loss": 0.0912, + "step": 5525 + }, + { + "epoch": 0.2471830044982853, + "grad_norm": 0.12233854830265045, + "learning_rate": 3.4266736000935816e-05, + "loss": 0.0968, + "step": 5550 + }, + { + "epoch": 0.2482964414554848, + "grad_norm": 0.017935922369360924, + "learning_rate": 3.4217619840381416e-05, + "loss": 0.0432, + "step": 5575 + }, + { + "epoch": 0.24940987841268428, + "grad_norm": 0.3018956780433655, + "learning_rate": 3.4168329716596164e-05, + "loss": 0.08, + "step": 5600 + }, + { + "epoch": 0.25052331536988376, + "grad_norm": 71.5376968383789, + "learning_rate": 3.411886623268166e-05, + "loss": 0.1331, + "step": 5625 + }, + { + "epoch": 0.2516367523270832, + "grad_norm": 0.12258598953485489, + "learning_rate": 3.406922999386069e-05, + "loss": 0.074, + "step": 5650 + }, + { + "epoch": 0.2527501892842827, + "grad_norm": 20.980422973632812, + "learning_rate": 3.401942160746981e-05, + "loss": 0.1432, + "step": 5675 + }, + { + "epoch": 0.25386362624148223, + "grad_norm": 0.22452101111412048, + "learning_rate": 3.396944168295195e-05, + "loss": 0.0806, + "step": 5700 + }, + { + "epoch": 0.2549770631986817, + "grad_norm": 1.4218170642852783, + "learning_rate": 3.391929083184895e-05, + "loss": 0.0626, + "step": 5725 + }, + { + "epoch": 0.2560905001558812, + "grad_norm": 10.604058265686035, + "learning_rate": 3.3868969667794025e-05, + "loss": 0.106, + "step": 5750 + }, + { + "epoch": 0.25720393711308065, + "grad_norm": 0.07359682023525238, + "learning_rate": 3.381847880650433e-05, + "loss": 0.0403, + "step": 5775 + }, + { + "epoch": 0.25831737407028016, + "grad_norm": 0.10821941494941711, + "learning_rate": 3.3767818865773376e-05, + "loss": 0.0533, + "step": 5800 + }, + { + "epoch": 0.2594308110274796, + "grad_norm": 8.280586242675781, + "learning_rate": 3.37169904654635e-05, + "loss": 0.0945, + "step": 5825 + }, + { + "epoch": 0.2605442479846791, + "grad_norm": 1.2354644536972046, + "learning_rate": 3.366599422749825e-05, + "loss": 0.1221, + "step": 5850 + }, + { + "epoch": 0.2616576849418786, + "grad_norm": 1.7320414781570435, + "learning_rate": 3.361483077585482e-05, + "loss": 0.0921, + "step": 5875 + }, + { + "epoch": 0.2627711218990781, + "grad_norm": 1.0141493082046509, + "learning_rate": 3.356350073655636e-05, + "loss": 0.0727, + "step": 5900 + }, + { + "epoch": 0.26388455885627754, + "grad_norm": 8.444538116455078, + "learning_rate": 3.3512004737664376e-05, + "loss": 0.1648, + "step": 5925 + }, + { + "epoch": 0.26499799581347705, + "grad_norm": 7.873695373535156, + "learning_rate": 3.346034340927097e-05, + "loss": 0.0527, + "step": 5950 + }, + { + "epoch": 0.2661114327706765, + "grad_norm": 1.0561110973358154, + "learning_rate": 3.340851738349122e-05, + "loss": 0.0841, + "step": 5975 + }, + { + "epoch": 0.267224869727876, + "grad_norm": 0.1629607230424881, + "learning_rate": 3.335652729445538e-05, + "loss": 0.0611, + "step": 6000 + }, + { + "epoch": 0.26833830668507547, + "grad_norm": 0.042299434542655945, + "learning_rate": 3.330437377830113e-05, + "loss": 0.1361, + "step": 6025 + }, + { + "epoch": 0.269451743642275, + "grad_norm": 0.07904113829135895, + "learning_rate": 3.325205747316582e-05, + "loss": 0.0895, + "step": 6050 + }, + { + "epoch": 0.27056518059947443, + "grad_norm": 0.04022669047117233, + "learning_rate": 3.3199579019178644e-05, + "loss": 0.0667, + "step": 6075 + }, + { + "epoch": 0.27167861755667394, + "grad_norm": 94.2168960571289, + "learning_rate": 3.31469390584528e-05, + "loss": 0.1373, + "step": 6100 + }, + { + "epoch": 0.27279205451387345, + "grad_norm": 23.157451629638672, + "learning_rate": 3.309413823507764e-05, + "loss": 0.0442, + "step": 6125 + }, + { + "epoch": 0.2739054914710729, + "grad_norm": 0.027611056342720985, + "learning_rate": 3.3041177195110816e-05, + "loss": 0.0808, + "step": 6150 + }, + { + "epoch": 0.2750189284282724, + "grad_norm": 3.2276134490966797, + "learning_rate": 3.298805658657031e-05, + "loss": 0.132, + "step": 6175 + }, + { + "epoch": 0.27613236538547187, + "grad_norm": 0.03948564827442169, + "learning_rate": 3.2934777059426584e-05, + "loss": 0.1221, + "step": 6200 + }, + { + "epoch": 0.2772458023426714, + "grad_norm": 0.2750093638896942, + "learning_rate": 3.288133926559456e-05, + "loss": 0.103, + "step": 6225 + }, + { + "epoch": 0.27835923929987083, + "grad_norm": 0.9560427069664001, + "learning_rate": 3.2827743858925685e-05, + "loss": 0.0519, + "step": 6250 + }, + { + "epoch": 0.27947267625707034, + "grad_norm": 0.11851734668016434, + "learning_rate": 3.2773991495199913e-05, + "loss": 0.1086, + "step": 6275 + }, + { + "epoch": 0.2805861132142698, + "grad_norm": 7.182758808135986, + "learning_rate": 3.272008283211769e-05, + "loss": 0.056, + "step": 6300 + }, + { + "epoch": 0.2816995501714693, + "grad_norm": 0.04427009075880051, + "learning_rate": 3.266601852929189e-05, + "loss": 0.0858, + "step": 6325 + }, + { + "epoch": 0.28281298712866876, + "grad_norm": 0.735414445400238, + "learning_rate": 3.261179924823978e-05, + "loss": 0.0891, + "step": 6350 + }, + { + "epoch": 0.28392642408586827, + "grad_norm": 0.01727837324142456, + "learning_rate": 3.255742565237487e-05, + "loss": 0.0403, + "step": 6375 + }, + { + "epoch": 0.2850398610430677, + "grad_norm": 8.384565353393555, + "learning_rate": 3.250289840699885e-05, + "loss": 0.1248, + "step": 6400 + }, + { + "epoch": 0.28615329800026723, + "grad_norm": 19.30202865600586, + "learning_rate": 3.244821817929342e-05, + "loss": 0.078, + "step": 6425 + }, + { + "epoch": 0.2872667349574667, + "grad_norm": 0.05216224864125252, + "learning_rate": 3.239338563831213e-05, + "loss": 0.1063, + "step": 6450 + }, + { + "epoch": 0.2883801719146662, + "grad_norm": 0.3387221097946167, + "learning_rate": 3.23384014549722e-05, + "loss": 0.0174, + "step": 6475 + }, + { + "epoch": 0.28949360887186565, + "grad_norm": 0.050413765013217926, + "learning_rate": 3.228326630204632e-05, + "loss": 0.1149, + "step": 6500 + }, + { + "epoch": 0.29060704582906516, + "grad_norm": 0.05692288279533386, + "learning_rate": 3.2227980854154376e-05, + "loss": 0.0229, + "step": 6525 + }, + { + "epoch": 0.29172048278626467, + "grad_norm": 0.2883279323577881, + "learning_rate": 3.217254578775525e-05, + "loss": 0.0801, + "step": 6550 + }, + { + "epoch": 0.2928339197434641, + "grad_norm": 0.028399428352713585, + "learning_rate": 3.2116961781138525e-05, + "loss": 0.0652, + "step": 6575 + }, + { + "epoch": 0.29394735670066363, + "grad_norm": 18.74675750732422, + "learning_rate": 3.206122951441615e-05, + "loss": 0.1657, + "step": 6600 + }, + { + "epoch": 0.2950607936578631, + "grad_norm": 0.3412401080131531, + "learning_rate": 3.200534966951418e-05, + "loss": 0.1038, + "step": 6625 + }, + { + "epoch": 0.2961742306150626, + "grad_norm": 0.29099151492118835, + "learning_rate": 3.194932293016437e-05, + "loss": 0.0817, + "step": 6650 + }, + { + "epoch": 0.29728766757226205, + "grad_norm": 0.16885584592819214, + "learning_rate": 3.189314998189586e-05, + "loss": 0.0756, + "step": 6675 + }, + { + "epoch": 0.29840110452946156, + "grad_norm": 0.11662675440311432, + "learning_rate": 3.1836831512026746e-05, + "loss": 0.1049, + "step": 6700 + }, + { + "epoch": 0.299514541486661, + "grad_norm": 10.235050201416016, + "learning_rate": 3.1780368209655715e-05, + "loss": 0.0625, + "step": 6725 + }, + { + "epoch": 0.3006279784438605, + "grad_norm": 0.013582950457930565, + "learning_rate": 3.1723760765653566e-05, + "loss": 0.0811, + "step": 6750 + }, + { + "epoch": 0.30174141540106, + "grad_norm": 0.028899747878313065, + "learning_rate": 3.1667009872654804e-05, + "loss": 0.1065, + "step": 6775 + }, + { + "epoch": 0.3028548523582595, + "grad_norm": 0.2777616083621979, + "learning_rate": 3.1610116225049107e-05, + "loss": 0.0882, + "step": 6800 + }, + { + "epoch": 0.30396828931545894, + "grad_norm": 0.2561856806278229, + "learning_rate": 3.155308051897287e-05, + "loss": 0.0876, + "step": 6825 + }, + { + "epoch": 0.30508172627265845, + "grad_norm": 0.0948658362030983, + "learning_rate": 3.149590345230072e-05, + "loss": 0.153, + "step": 6850 + }, + { + "epoch": 0.3061951632298579, + "grad_norm": 0.28858181834220886, + "learning_rate": 3.143858572463688e-05, + "loss": 0.125, + "step": 6875 + }, + { + "epoch": 0.3073086001870574, + "grad_norm": 1.5904790163040161, + "learning_rate": 3.1381128037306706e-05, + "loss": 0.0845, + "step": 6900 + }, + { + "epoch": 0.30842203714425687, + "grad_norm": 7.804311275482178, + "learning_rate": 3.1323531093348036e-05, + "loss": 0.0496, + "step": 6925 + }, + { + "epoch": 0.3095354741014564, + "grad_norm": 1.7563951015472412, + "learning_rate": 3.126579559750265e-05, + "loss": 0.1077, + "step": 6950 + }, + { + "epoch": 0.3106489110586559, + "grad_norm": 11.621973991394043, + "learning_rate": 3.120792225620759e-05, + "loss": 0.0678, + "step": 6975 + }, + { + "epoch": 0.31176234801585534, + "grad_norm": 54.6142692565918, + "learning_rate": 3.1149911777586533e-05, + "loss": 0.1449, + "step": 7000 + }, + { + "epoch": 0.31287578497305485, + "grad_norm": 0.6821468472480774, + "learning_rate": 3.109176487144116e-05, + "loss": 0.0686, + "step": 7025 + }, + { + "epoch": 0.3139892219302543, + "grad_norm": 0.13249638676643372, + "learning_rate": 3.103348224924244e-05, + "loss": 0.1183, + "step": 7050 + }, + { + "epoch": 0.3151026588874538, + "grad_norm": 0.16593268513679504, + "learning_rate": 3.09750646241219e-05, + "loss": 0.126, + "step": 7075 + }, + { + "epoch": 0.31621609584465327, + "grad_norm": 0.07781495153903961, + "learning_rate": 3.091651271086297e-05, + "loss": 0.0662, + "step": 7100 + }, + { + "epoch": 0.3173295328018528, + "grad_norm": 14.361419677734375, + "learning_rate": 3.085782722589217e-05, + "loss": 0.049, + "step": 7125 + }, + { + "epoch": 0.31844296975905223, + "grad_norm": 0.07331876456737518, + "learning_rate": 3.0799008887270376e-05, + "loss": 0.1351, + "step": 7150 + }, + { + "epoch": 0.31955640671625174, + "grad_norm": 0.1390572488307953, + "learning_rate": 3.074005841468403e-05, + "loss": 0.0748, + "step": 7175 + }, + { + "epoch": 0.3206698436734512, + "grad_norm": 0.049495305866003036, + "learning_rate": 3.068097652943633e-05, + "loss": 0.1004, + "step": 7200 + }, + { + "epoch": 0.3217832806306507, + "grad_norm": 12.76275634765625, + "learning_rate": 3.0621763954438417e-05, + "loss": 0.1007, + "step": 7225 + }, + { + "epoch": 0.32289671758785016, + "grad_norm": 13.835783958435059, + "learning_rate": 3.0562421414200495e-05, + "loss": 0.0994, + "step": 7250 + }, + { + "epoch": 0.32401015454504967, + "grad_norm": 11.560086250305176, + "learning_rate": 3.0502949634823004e-05, + "loss": 0.1399, + "step": 7275 + }, + { + "epoch": 0.3251235915022491, + "grad_norm": 125.53492736816406, + "learning_rate": 3.0443349343987728e-05, + "loss": 0.0669, + "step": 7300 + }, + { + "epoch": 0.32623702845944863, + "grad_norm": 0.15544213354587555, + "learning_rate": 3.0383621270948888e-05, + "loss": 0.0814, + "step": 7325 + }, + { + "epoch": 0.3273504654166481, + "grad_norm": 0.24035079777240753, + "learning_rate": 3.032376614652419e-05, + "loss": 0.1366, + "step": 7350 + }, + { + "epoch": 0.3284639023738476, + "grad_norm": 0.06057662144303322, + "learning_rate": 3.0263784703085935e-05, + "loss": 0.0574, + "step": 7375 + }, + { + "epoch": 0.3295773393310471, + "grad_norm": 0.25817441940307617, + "learning_rate": 3.0203677674552022e-05, + "loss": 0.0275, + "step": 7400 + }, + { + "epoch": 0.33069077628824656, + "grad_norm": 0.24544256925582886, + "learning_rate": 3.014344579637698e-05, + "loss": 0.0903, + "step": 7425 + }, + { + "epoch": 0.33180421324544607, + "grad_norm": 17.432865142822266, + "learning_rate": 3.0083089805542963e-05, + "loss": 0.1184, + "step": 7450 + }, + { + "epoch": 0.3329176502026455, + "grad_norm": 16.43193244934082, + "learning_rate": 3.0022610440550733e-05, + "loss": 0.1503, + "step": 7475 + }, + { + "epoch": 0.33403108715984503, + "grad_norm": 0.01428154855966568, + "learning_rate": 2.9962008441410645e-05, + "loss": 0.1138, + "step": 7500 + }, + { + "epoch": 0.3351445241170445, + "grad_norm": 0.045094311237335205, + "learning_rate": 2.9901284549633554e-05, + "loss": 0.0755, + "step": 7525 + }, + { + "epoch": 0.336257961074244, + "grad_norm": 16.375850677490234, + "learning_rate": 2.9840439508221774e-05, + "loss": 0.1069, + "step": 7550 + }, + { + "epoch": 0.33737139803144345, + "grad_norm": 0.2958405911922455, + "learning_rate": 2.9779474061659974e-05, + "loss": 0.1006, + "step": 7575 + }, + { + "epoch": 0.33848483498864296, + "grad_norm": 0.21808381378650665, + "learning_rate": 2.9718388955906063e-05, + "loss": 0.1078, + "step": 7600 + }, + { + "epoch": 0.3395982719458424, + "grad_norm": 0.09185987710952759, + "learning_rate": 2.9657184938382087e-05, + "loss": 0.0668, + "step": 7625 + }, + { + "epoch": 0.3407117089030419, + "grad_norm": 0.903226912021637, + "learning_rate": 2.9595862757965043e-05, + "loss": 0.1102, + "step": 7650 + }, + { + "epoch": 0.3418251458602414, + "grad_norm": 0.2668018937110901, + "learning_rate": 2.9534423164977765e-05, + "loss": 0.1009, + "step": 7675 + }, + { + "epoch": 0.3429385828174409, + "grad_norm": 0.26890435814857483, + "learning_rate": 2.9472866911179695e-05, + "loss": 0.06, + "step": 7700 + }, + { + "epoch": 0.34405201977464034, + "grad_norm": 17.915388107299805, + "learning_rate": 2.941119474975772e-05, + "loss": 0.1073, + "step": 7725 + }, + { + "epoch": 0.34516545673183985, + "grad_norm": 0.05032012239098549, + "learning_rate": 2.934940743531694e-05, + "loss": 0.0772, + "step": 7750 + }, + { + "epoch": 0.3462788936890393, + "grad_norm": 0.38257360458374023, + "learning_rate": 2.928750572387144e-05, + "loss": 0.1137, + "step": 7775 + }, + { + "epoch": 0.3473923306462388, + "grad_norm": 2.620697498321533, + "learning_rate": 2.9225490372835043e-05, + "loss": 0.1189, + "step": 7800 + }, + { + "epoch": 0.34850576760343827, + "grad_norm": 1.1211553812026978, + "learning_rate": 2.9163362141012024e-05, + "loss": 0.0608, + "step": 7825 + }, + { + "epoch": 0.3496192045606378, + "grad_norm": 11.626333236694336, + "learning_rate": 2.9101121788587846e-05, + "loss": 0.0781, + "step": 7850 + }, + { + "epoch": 0.3507326415178373, + "grad_norm": 0.05085618793964386, + "learning_rate": 2.9038770077119865e-05, + "loss": 0.0555, + "step": 7875 + }, + { + "epoch": 0.35184607847503674, + "grad_norm": 0.007242363411933184, + "learning_rate": 2.8976307769527974e-05, + "loss": 0.0769, + "step": 7900 + }, + { + "epoch": 0.35295951543223625, + "grad_norm": 0.018498949706554413, + "learning_rate": 2.8913735630085305e-05, + "loss": 0.0516, + "step": 7925 + }, + { + "epoch": 0.3540729523894357, + "grad_norm": 0.01721395179629326, + "learning_rate": 2.885105442440887e-05, + "loss": 0.1599, + "step": 7950 + }, + { + "epoch": 0.3551863893466352, + "grad_norm": 12.357562065124512, + "learning_rate": 2.878826491945018e-05, + "loss": 0.0521, + "step": 7975 + }, + { + "epoch": 0.35629982630383467, + "grad_norm": 0.11994218826293945, + "learning_rate": 2.872536788348587e-05, + "loss": 0.0558, + "step": 8000 + }, + { + "epoch": 0.3574132632610342, + "grad_norm": 6.6094536781311035, + "learning_rate": 2.8662364086108302e-05, + "loss": 0.1597, + "step": 8025 + }, + { + "epoch": 0.35852670021823363, + "grad_norm": 1.3313615322113037, + "learning_rate": 2.8599254298216136e-05, + "loss": 0.1593, + "step": 8050 + }, + { + "epoch": 0.35964013717543314, + "grad_norm": 0.2207033932209015, + "learning_rate": 2.853603929200491e-05, + "loss": 0.0781, + "step": 8075 + }, + { + "epoch": 0.3607535741326326, + "grad_norm": 0.11759678274393082, + "learning_rate": 2.847271984095759e-05, + "loss": 0.0513, + "step": 8100 + }, + { + "epoch": 0.3618670110898321, + "grad_norm": 0.1604037880897522, + "learning_rate": 2.84092967198351e-05, + "loss": 0.1056, + "step": 8125 + }, + { + "epoch": 0.36298044804703156, + "grad_norm": 0.05803081393241882, + "learning_rate": 2.834577070466684e-05, + "loss": 0.0579, + "step": 8150 + }, + { + "epoch": 0.36409388500423107, + "grad_norm": 0.8341618180274963, + "learning_rate": 2.8282142572741205e-05, + "loss": 0.0779, + "step": 8175 + }, + { + "epoch": 0.3652073219614305, + "grad_norm": 0.24928626418113708, + "learning_rate": 2.821841310259606e-05, + "loss": 0.1118, + "step": 8200 + }, + { + "epoch": 0.36632075891863003, + "grad_norm": 92.14579010009766, + "learning_rate": 2.8154583074009216e-05, + "loss": 0.0581, + "step": 8225 + }, + { + "epoch": 0.3674341958758295, + "grad_norm": 0.010704785585403442, + "learning_rate": 2.80906532679889e-05, + "loss": 0.0959, + "step": 8250 + }, + { + "epoch": 0.368547632833029, + "grad_norm": 0.05830851569771767, + "learning_rate": 2.802662446676418e-05, + "loss": 0.0383, + "step": 8275 + }, + { + "epoch": 0.3696610697902285, + "grad_norm": 14.7666597366333, + "learning_rate": 2.796249745377541e-05, + "loss": 0.0381, + "step": 8300 + }, + { + "epoch": 0.37077450674742796, + "grad_norm": 0.33050060272216797, + "learning_rate": 2.7898273013664657e-05, + "loss": 0.1475, + "step": 8325 + }, + { + "epoch": 0.37188794370462747, + "grad_norm": 0.006766165606677532, + "learning_rate": 2.7833951932266045e-05, + "loss": 0.1048, + "step": 8350 + }, + { + "epoch": 0.3730013806618269, + "grad_norm": 57.38689422607422, + "learning_rate": 2.7769534996596196e-05, + "loss": 0.0475, + "step": 8375 + }, + { + "epoch": 0.37411481761902643, + "grad_norm": 0.013443752191960812, + "learning_rate": 2.7705022994844588e-05, + "loss": 0.0451, + "step": 8400 + }, + { + "epoch": 0.3752282545762259, + "grad_norm": 21.72231674194336, + "learning_rate": 2.7640416716363896e-05, + "loss": 0.1466, + "step": 8425 + }, + { + "epoch": 0.3763416915334254, + "grad_norm": 7.488071918487549, + "learning_rate": 2.7575716951660335e-05, + "loss": 0.1379, + "step": 8450 + }, + { + "epoch": 0.37745512849062485, + "grad_norm": 0.05108782649040222, + "learning_rate": 2.7510924492384003e-05, + "loss": 0.0835, + "step": 8475 + }, + { + "epoch": 0.37856856544782436, + "grad_norm": 1.109173059463501, + "learning_rate": 2.7446040131319188e-05, + "loss": 0.1243, + "step": 8500 + }, + { + "epoch": 0.3796820024050238, + "grad_norm": 1.1221072673797607, + "learning_rate": 2.7381064662374655e-05, + "loss": 0.1334, + "step": 8525 + }, + { + "epoch": 0.3807954393622233, + "grad_norm": 16.271162033081055, + "learning_rate": 2.7315998880573943e-05, + "loss": 0.0482, + "step": 8550 + }, + { + "epoch": 0.3819088763194228, + "grad_norm": 0.41395649313926697, + "learning_rate": 2.7250843582045653e-05, + "loss": 0.0814, + "step": 8575 + }, + { + "epoch": 0.3830223132766223, + "grad_norm": 0.26396995782852173, + "learning_rate": 2.7185599564013666e-05, + "loss": 0.0503, + "step": 8600 + }, + { + "epoch": 0.38413575023382174, + "grad_norm": 0.15254704654216766, + "learning_rate": 2.7120267624787435e-05, + "loss": 0.0801, + "step": 8625 + }, + { + "epoch": 0.38524918719102125, + "grad_norm": 0.46369659900665283, + "learning_rate": 2.7054848563752175e-05, + "loss": 0.1062, + "step": 8650 + }, + { + "epoch": 0.3863626241482207, + "grad_norm": 0.3597426116466522, + "learning_rate": 2.698934318135912e-05, + "loss": 0.1508, + "step": 8675 + }, + { + "epoch": 0.3874760611054202, + "grad_norm": 0.08787465840578079, + "learning_rate": 2.6923752279115693e-05, + "loss": 0.0501, + "step": 8700 + }, + { + "epoch": 0.3885894980626197, + "grad_norm": 14.235735893249512, + "learning_rate": 2.6858076659575724e-05, + "loss": 0.0919, + "step": 8725 + }, + { + "epoch": 0.3897029350198192, + "grad_norm": 0.059386011213064194, + "learning_rate": 2.6792317126329632e-05, + "loss": 0.0746, + "step": 8750 + }, + { + "epoch": 0.3908163719770187, + "grad_norm": 0.08174929022789001, + "learning_rate": 2.672647448399457e-05, + "loss": 0.0747, + "step": 8775 + }, + { + "epoch": 0.39192980893421814, + "grad_norm": 109.75809478759766, + "learning_rate": 2.66605495382046e-05, + "loss": 0.0878, + "step": 8800 + }, + { + "epoch": 0.39304324589141765, + "grad_norm": 21.897994995117188, + "learning_rate": 2.659454309560082e-05, + "loss": 0.043, + "step": 8825 + }, + { + "epoch": 0.3941566828486171, + "grad_norm": 0.1924513429403305, + "learning_rate": 2.652845596382152e-05, + "loss": 0.0445, + "step": 8850 + }, + { + "epoch": 0.3952701198058166, + "grad_norm": 10.962334632873535, + "learning_rate": 2.6462288951492255e-05, + "loss": 0.1423, + "step": 8875 + }, + { + "epoch": 0.39638355676301607, + "grad_norm": 1.092352271080017, + "learning_rate": 2.639604286821601e-05, + "loss": 0.0513, + "step": 8900 + }, + { + "epoch": 0.3974969937202156, + "grad_norm": 0.07695068418979645, + "learning_rate": 2.632971852456323e-05, + "loss": 0.0651, + "step": 8925 + }, + { + "epoch": 0.39861043067741503, + "grad_norm": 0.3159739077091217, + "learning_rate": 2.6263316732061967e-05, + "loss": 0.0808, + "step": 8950 + }, + { + "epoch": 0.39972386763461454, + "grad_norm": 0.11245528608560562, + "learning_rate": 2.619683830318789e-05, + "loss": 0.0989, + "step": 8975 + }, + { + "epoch": 0.400837304591814, + "grad_norm": 0.005162264686077833, + "learning_rate": 2.6130284051354394e-05, + "loss": 0.1038, + "step": 9000 + }, + { + "epoch": 0.4019507415490135, + "grad_norm": 0.07931370288133621, + "learning_rate": 2.6063654790902605e-05, + "loss": 0.1288, + "step": 9025 + }, + { + "epoch": 0.40306417850621296, + "grad_norm": 0.1483098566532135, + "learning_rate": 2.599695133709145e-05, + "loss": 0.0582, + "step": 9050 + }, + { + "epoch": 0.40417761546341247, + "grad_norm": 5.938066482543945, + "learning_rate": 2.593017450608766e-05, + "loss": 0.1444, + "step": 9075 + }, + { + "epoch": 0.4052910524206119, + "grad_norm": 0.01066511683166027, + "learning_rate": 2.586332511495579e-05, + "loss": 0.0234, + "step": 9100 + }, + { + "epoch": 0.40640448937781143, + "grad_norm": 0.09881523251533508, + "learning_rate": 2.5796403981648232e-05, + "loss": 0.1064, + "step": 9125 + }, + { + "epoch": 0.40751792633501094, + "grad_norm": 61.86650466918945, + "learning_rate": 2.5729411924995176e-05, + "loss": 0.0709, + "step": 9150 + }, + { + "epoch": 0.4086313632922104, + "grad_norm": 0.13846437633037567, + "learning_rate": 2.5662349764694644e-05, + "loss": 0.0989, + "step": 9175 + }, + { + "epoch": 0.4097448002494099, + "grad_norm": 0.09358134865760803, + "learning_rate": 2.5595218321302404e-05, + "loss": 0.1336, + "step": 9200 + }, + { + "epoch": 0.41085823720660936, + "grad_norm": 0.14855454862117767, + "learning_rate": 2.5528018416221957e-05, + "loss": 0.1036, + "step": 9225 + }, + { + "epoch": 0.41197167416380887, + "grad_norm": 1.0118060111999512, + "learning_rate": 2.5460750871694496e-05, + "loss": 0.1035, + "step": 9250 + }, + { + "epoch": 0.4130851111210083, + "grad_norm": 0.2535758316516876, + "learning_rate": 2.5393416510788833e-05, + "loss": 0.0889, + "step": 9275 + }, + { + "epoch": 0.41419854807820783, + "grad_norm": 13.825139045715332, + "learning_rate": 2.5326016157391307e-05, + "loss": 0.1798, + "step": 9300 + }, + { + "epoch": 0.4153119850354073, + "grad_norm": 0.3431245684623718, + "learning_rate": 2.5258550636195746e-05, + "loss": 0.0251, + "step": 9325 + }, + { + "epoch": 0.4164254219926068, + "grad_norm": 0.01225372590124607, + "learning_rate": 2.519102077269335e-05, + "loss": 0.0519, + "step": 9350 + }, + { + "epoch": 0.41753885894980625, + "grad_norm": 0.027721190825104713, + "learning_rate": 2.512342739316259e-05, + "loss": 0.0958, + "step": 9375 + }, + { + "epoch": 0.41865229590700576, + "grad_norm": 0.017482172697782516, + "learning_rate": 2.5055771324659105e-05, + "loss": 0.0938, + "step": 9400 + }, + { + "epoch": 0.4197657328642052, + "grad_norm": 47.62452697753906, + "learning_rate": 2.4988053395005592e-05, + "loss": 0.0447, + "step": 9425 + }, + { + "epoch": 0.4208791698214047, + "grad_norm": 20.639039993286133, + "learning_rate": 2.4920274432781647e-05, + "loss": 0.0708, + "step": 9450 + }, + { + "epoch": 0.4219926067786042, + "grad_norm": 0.03869742900133133, + "learning_rate": 2.485243526731366e-05, + "loss": 0.0563, + "step": 9475 + }, + { + "epoch": 0.4231060437358037, + "grad_norm": 0.23518817126750946, + "learning_rate": 2.478453672866464e-05, + "loss": 0.0858, + "step": 9500 + }, + { + "epoch": 0.42421948069300314, + "grad_norm": 0.17357154190540314, + "learning_rate": 2.4716579647624084e-05, + "loss": 0.0794, + "step": 9525 + }, + { + "epoch": 0.42533291765020265, + "grad_norm": 0.01160862110555172, + "learning_rate": 2.464856485569779e-05, + "loss": 0.0798, + "step": 9550 + }, + { + "epoch": 0.42644635460740216, + "grad_norm": 0.3391244411468506, + "learning_rate": 2.4580493185097692e-05, + "loss": 0.0106, + "step": 9575 + }, + { + "epoch": 0.4275597915646016, + "grad_norm": 0.02090360037982464, + "learning_rate": 2.4512365468731692e-05, + "loss": 0.0975, + "step": 9600 + }, + { + "epoch": 0.4286732285218011, + "grad_norm": 2.563877820968628, + "learning_rate": 2.444418254019343e-05, + "loss": 0.0701, + "step": 9625 + }, + { + "epoch": 0.4297866654790006, + "grad_norm": 0.10990472882986069, + "learning_rate": 2.437594523375213e-05, + "loss": 0.141, + "step": 9650 + }, + { + "epoch": 0.4309001024362001, + "grad_norm": 0.3325424790382385, + "learning_rate": 2.430765438434235e-05, + "loss": 0.0948, + "step": 9675 + }, + { + "epoch": 0.43201353939339954, + "grad_norm": 0.5756065249443054, + "learning_rate": 2.423931082755381e-05, + "loss": 0.1001, + "step": 9700 + }, + { + "epoch": 0.43312697635059905, + "grad_norm": 8.402749061584473, + "learning_rate": 2.417091539962112e-05, + "loss": 0.0925, + "step": 9725 + }, + { + "epoch": 0.4342404133077985, + "grad_norm": 21.99260711669922, + "learning_rate": 2.4102468937413593e-05, + "loss": 0.0568, + "step": 9750 + }, + { + "epoch": 0.435353850264998, + "grad_norm": 0.5746713876724243, + "learning_rate": 2.4033972278424965e-05, + "loss": 0.0348, + "step": 9775 + }, + { + "epoch": 0.43646728722219746, + "grad_norm": 0.02806955948472023, + "learning_rate": 2.3965426260763196e-05, + "loss": 0.0285, + "step": 9800 + }, + { + "epoch": 0.437580724179397, + "grad_norm": 0.06320275366306305, + "learning_rate": 2.3896831723140145e-05, + "loss": 0.0852, + "step": 9825 + }, + { + "epoch": 0.4386941611365964, + "grad_norm": 0.05184788629412651, + "learning_rate": 2.382818950486139e-05, + "loss": 0.1056, + "step": 9850 + }, + { + "epoch": 0.43980759809379594, + "grad_norm": 9.47110366821289, + "learning_rate": 2.375950044581589e-05, + "loss": 0.0809, + "step": 9875 + }, + { + "epoch": 0.4409210350509954, + "grad_norm": 0.024401208385825157, + "learning_rate": 2.369076538646575e-05, + "loss": 0.0441, + "step": 9900 + }, + { + "epoch": 0.4420344720081949, + "grad_norm": 0.12478948384523392, + "learning_rate": 2.3621985167835913e-05, + "loss": 0.0856, + "step": 9925 + }, + { + "epoch": 0.44314790896539435, + "grad_norm": 8.50674819946289, + "learning_rate": 2.3553160631503897e-05, + "loss": 0.1729, + "step": 9950 + }, + { + "epoch": 0.44426134592259386, + "grad_norm": 4.947354316711426, + "learning_rate": 2.3484292619589452e-05, + "loss": 0.0641, + "step": 9975 + }, + { + "epoch": 0.4453747828797933, + "grad_norm": 0.1289278119802475, + "learning_rate": 2.341538197474432e-05, + "loss": 0.0291, + "step": 10000 + }, + { + "epoch": 0.44648821983699283, + "grad_norm": 0.03130419924855232, + "learning_rate": 2.334642954014185e-05, + "loss": 0.0388, + "step": 10025 + }, + { + "epoch": 0.44760165679419234, + "grad_norm": 0.13553370535373688, + "learning_rate": 2.3277436159466756e-05, + "loss": 0.0721, + "step": 10050 + }, + { + "epoch": 0.4487150937513918, + "grad_norm": 0.05339996889233589, + "learning_rate": 2.3208402676904735e-05, + "loss": 0.0726, + "step": 10075 + }, + { + "epoch": 0.4498285307085913, + "grad_norm": 0.03551988676190376, + "learning_rate": 2.3139329937132167e-05, + "loss": 0.172, + "step": 10100 + }, + { + "epoch": 0.45094196766579075, + "grad_norm": 0.36548513174057007, + "learning_rate": 2.3070218785305774e-05, + "loss": 0.1802, + "step": 10125 + }, + { + "epoch": 0.45205540462299026, + "grad_norm": 0.018251005560159683, + "learning_rate": 2.3001070067052277e-05, + "loss": 0.0896, + "step": 10150 + }, + { + "epoch": 0.4531688415801897, + "grad_norm": 0.2378775030374527, + "learning_rate": 2.293188462845806e-05, + "loss": 0.1419, + "step": 10175 + }, + { + "epoch": 0.45428227853738923, + "grad_norm": 0.4987199008464813, + "learning_rate": 2.2862663316058783e-05, + "loss": 0.1001, + "step": 10200 + }, + { + "epoch": 0.4553957154945887, + "grad_norm": 0.28313758969306946, + "learning_rate": 2.279340697682908e-05, + "loss": 0.0393, + "step": 10225 + }, + { + "epoch": 0.4565091524517882, + "grad_norm": 0.08381898701190948, + "learning_rate": 2.272411645817214e-05, + "loss": 0.0942, + "step": 10250 + }, + { + "epoch": 0.45762258940898765, + "grad_norm": 0.036367420107126236, + "learning_rate": 2.265479260790938e-05, + "loss": 0.012, + "step": 10275 + }, + { + "epoch": 0.45873602636618716, + "grad_norm": 0.2814151644706726, + "learning_rate": 2.2585436274270038e-05, + "loss": 0.1508, + "step": 10300 + }, + { + "epoch": 0.4598494633233866, + "grad_norm": 0.3467974364757538, + "learning_rate": 2.251604830588083e-05, + "loss": 0.0935, + "step": 10325 + }, + { + "epoch": 0.4609629002805861, + "grad_norm": 0.05292457342147827, + "learning_rate": 2.2446629551755523e-05, + "loss": 0.1146, + "step": 10350 + }, + { + "epoch": 0.4620763372377856, + "grad_norm": 0.5696282386779785, + "learning_rate": 2.2377180861284593e-05, + "loss": 0.0728, + "step": 10375 + }, + { + "epoch": 0.4631897741949851, + "grad_norm": 0.13021673262119293, + "learning_rate": 2.2307703084224794e-05, + "loss": 0.0684, + "step": 10400 + }, + { + "epoch": 0.46430321115218454, + "grad_norm": 0.04874199256300926, + "learning_rate": 2.2238197070688794e-05, + "loss": 0.1286, + "step": 10425 + }, + { + "epoch": 0.46541664810938405, + "grad_norm": 0.1427091658115387, + "learning_rate": 2.2168663671134727e-05, + "loss": 0.0964, + "step": 10450 + }, + { + "epoch": 0.46653008506658356, + "grad_norm": 0.5847116708755493, + "learning_rate": 2.2099103736355846e-05, + "loss": 0.0923, + "step": 10475 + }, + { + "epoch": 0.467643522023783, + "grad_norm": 13.224454879760742, + "learning_rate": 2.2029518117470062e-05, + "loss": 0.1031, + "step": 10500 + }, + { + "epoch": 0.4687569589809825, + "grad_norm": 0.14730241894721985, + "learning_rate": 2.195990766590956e-05, + "loss": 0.0915, + "step": 10525 + }, + { + "epoch": 0.469870395938182, + "grad_norm": 0.5047757029533386, + "learning_rate": 2.1890273233410366e-05, + "loss": 0.1317, + "step": 10550 + }, + { + "epoch": 0.4709838328953815, + "grad_norm": 21.881359100341797, + "learning_rate": 2.1820615672001934e-05, + "loss": 0.0431, + "step": 10575 + }, + { + "epoch": 0.47209726985258094, + "grad_norm": 0.19091269373893738, + "learning_rate": 2.175093583399672e-05, + "loss": 0.0497, + "step": 10600 + }, + { + "epoch": 0.47321070680978045, + "grad_norm": 0.06911573559045792, + "learning_rate": 2.1681234571979734e-05, + "loss": 0.0683, + "step": 10625 + }, + { + "epoch": 0.4743241437669799, + "grad_norm": 0.12921109795570374, + "learning_rate": 2.1611512738798157e-05, + "loss": 0.1125, + "step": 10650 + }, + { + "epoch": 0.4754375807241794, + "grad_norm": 1.3468788862228394, + "learning_rate": 2.154177118755084e-05, + "loss": 0.0542, + "step": 10675 + }, + { + "epoch": 0.47655101768137886, + "grad_norm": 14.072271347045898, + "learning_rate": 2.1472010771577922e-05, + "loss": 0.1039, + "step": 10700 + }, + { + "epoch": 0.4776644546385784, + "grad_norm": 0.018694249913096428, + "learning_rate": 2.1402232344450357e-05, + "loss": 0.078, + "step": 10725 + }, + { + "epoch": 0.4787778915957778, + "grad_norm": 0.015503577888011932, + "learning_rate": 2.133243675995948e-05, + "loss": 0.0599, + "step": 10750 + }, + { + "epoch": 0.47989132855297734, + "grad_norm": 0.015802688896656036, + "learning_rate": 2.1262624872106557e-05, + "loss": 0.1028, + "step": 10775 + }, + { + "epoch": 0.4810047655101768, + "grad_norm": 77.8501968383789, + "learning_rate": 2.1192797535092346e-05, + "loss": 0.102, + "step": 10800 + }, + { + "epoch": 0.4821182024673763, + "grad_norm": 0.7204959392547607, + "learning_rate": 2.112295560330662e-05, + "loss": 0.0704, + "step": 10825 + }, + { + "epoch": 0.48323163942457575, + "grad_norm": 21.50542640686035, + "learning_rate": 2.1053099931317754e-05, + "loss": 0.0671, + "step": 10850 + }, + { + "epoch": 0.48434507638177526, + "grad_norm": 10.961987495422363, + "learning_rate": 2.0983231373862224e-05, + "loss": 0.0689, + "step": 10875 + }, + { + "epoch": 0.4854585133389748, + "grad_norm": 0.4164874255657196, + "learning_rate": 2.0913350785834175e-05, + "loss": 0.0909, + "step": 10900 + }, + { + "epoch": 0.4865719502961742, + "grad_norm": 0.20815160870552063, + "learning_rate": 2.0843459022274973e-05, + "loss": 0.162, + "step": 10925 + }, + { + "epoch": 0.48768538725337374, + "grad_norm": 0.03996884077787399, + "learning_rate": 2.077355693836269e-05, + "loss": 0.0432, + "step": 10950 + }, + { + "epoch": 0.4887988242105732, + "grad_norm": 0.06414720416069031, + "learning_rate": 2.070364538940171e-05, + "loss": 0.1202, + "step": 10975 + }, + { + "epoch": 0.4899122611677727, + "grad_norm": 0.052727632224559784, + "learning_rate": 2.06337252308122e-05, + "loss": 0.0961, + "step": 11000 + }, + { + "epoch": 0.49102569812497215, + "grad_norm": 0.1574171483516693, + "learning_rate": 2.0563797318119696e-05, + "loss": 0.0644, + "step": 11025 + }, + { + "epoch": 0.49213913508217166, + "grad_norm": 0.6176048517227173, + "learning_rate": 2.0493862506944597e-05, + "loss": 0.0232, + "step": 11050 + }, + { + "epoch": 0.4932525720393711, + "grad_norm": 0.07887354493141174, + "learning_rate": 2.0423921652991713e-05, + "loss": 0.0623, + "step": 11075 + }, + { + "epoch": 0.4943660089965706, + "grad_norm": 0.379489541053772, + "learning_rate": 2.0353975612039797e-05, + "loss": 0.075, + "step": 11100 + }, + { + "epoch": 0.4954794459537701, + "grad_norm": 0.22683410346508026, + "learning_rate": 2.0284025239931064e-05, + "loss": 0.068, + "step": 11125 + }, + { + "epoch": 0.4965928829109696, + "grad_norm": 0.014947167597711086, + "learning_rate": 2.021407139256072e-05, + "loss": 0.0706, + "step": 11150 + }, + { + "epoch": 0.49770631986816904, + "grad_norm": 0.08996062725782394, + "learning_rate": 2.01441149258665e-05, + "loss": 0.1368, + "step": 11175 + }, + { + "epoch": 0.49881975682536855, + "grad_norm": 1.830735206604004, + "learning_rate": 2.0074156695818194e-05, + "loss": 0.135, + "step": 11200 + }, + { + "epoch": 0.499933193782568, + "grad_norm": 165.89505004882812, + "learning_rate": 2.0004197558407154e-05, + "loss": 0.1127, + "step": 11225 + }, + { + "epoch": 0.5010466307397675, + "grad_norm": 198.1811065673828, + "learning_rate": 1.9934238369635837e-05, + "loss": 0.0691, + "step": 11250 + }, + { + "epoch": 0.502160067696967, + "grad_norm": 17.284914016723633, + "learning_rate": 1.9864279985507345e-05, + "loss": 0.0968, + "step": 11275 + }, + { + "epoch": 0.5032735046541664, + "grad_norm": 3.0509047508239746, + "learning_rate": 1.9794323262014915e-05, + "loss": 0.0535, + "step": 11300 + }, + { + "epoch": 0.504386941611366, + "grad_norm": 0.1367504894733429, + "learning_rate": 1.972436905513146e-05, + "loss": 0.0834, + "step": 11325 + }, + { + "epoch": 0.5055003785685654, + "grad_norm": 0.118153877556324, + "learning_rate": 1.965441822079913e-05, + "loss": 0.0818, + "step": 11350 + }, + { + "epoch": 0.5066138155257649, + "grad_norm": 0.06398922950029373, + "learning_rate": 1.9584471614918787e-05, + "loss": 0.0542, + "step": 11375 + }, + { + "epoch": 0.5077272524829645, + "grad_norm": 0.03594028949737549, + "learning_rate": 1.951453009333955e-05, + "loss": 0.0065, + "step": 11400 + }, + { + "epoch": 0.5088406894401639, + "grad_norm": 0.29045698046684265, + "learning_rate": 1.9444594511848346e-05, + "loss": 0.0689, + "step": 11425 + }, + { + "epoch": 0.5099541263973634, + "grad_norm": 13.441309928894043, + "learning_rate": 1.9374665726159415e-05, + "loss": 0.0972, + "step": 11450 + }, + { + "epoch": 0.5110675633545628, + "grad_norm": 0.055517617613077164, + "learning_rate": 1.9304744591903837e-05, + "loss": 0.064, + "step": 11475 + }, + { + "epoch": 0.5121810003117624, + "grad_norm": 0.018037894740700722, + "learning_rate": 1.923483196461907e-05, + "loss": 0.1372, + "step": 11500 + }, + { + "epoch": 0.5132944372689618, + "grad_norm": 24.66988754272461, + "learning_rate": 1.9164928699738492e-05, + "loss": 0.0987, + "step": 11525 + }, + { + "epoch": 0.5144078742261613, + "grad_norm": 0.01959937810897827, + "learning_rate": 1.909503565258093e-05, + "loss": 0.1023, + "step": 11550 + }, + { + "epoch": 0.5155213111833608, + "grad_norm": 0.07445913553237915, + "learning_rate": 1.9025153678340173e-05, + "loss": 0.0704, + "step": 11575 + }, + { + "epoch": 0.5166347481405603, + "grad_norm": 0.6288459300994873, + "learning_rate": 1.895528363207453e-05, + "loss": 0.1181, + "step": 11600 + }, + { + "epoch": 0.5177481850977598, + "grad_norm": 0.2090267390012741, + "learning_rate": 1.8885426368696374e-05, + "loss": 0.121, + "step": 11625 + }, + { + "epoch": 0.5188616220549592, + "grad_norm": 0.4164348244667053, + "learning_rate": 1.8815582742961652e-05, + "loss": 0.0685, + "step": 11650 + }, + { + "epoch": 0.5199750590121587, + "grad_norm": 0.7623189091682434, + "learning_rate": 1.874575360945946e-05, + "loss": 0.0942, + "step": 11675 + }, + { + "epoch": 0.5210884959693582, + "grad_norm": 0.10410900413990021, + "learning_rate": 1.8675939822601558e-05, + "loss": 0.1427, + "step": 11700 + }, + { + "epoch": 0.5222019329265577, + "grad_norm": 9.168070793151855, + "learning_rate": 1.860614223661194e-05, + "loss": 0.1384, + "step": 11725 + }, + { + "epoch": 0.5233153698837572, + "grad_norm": 0.12809838354587555, + "learning_rate": 1.853636170551636e-05, + "loss": 0.0664, + "step": 11750 + }, + { + "epoch": 0.5244288068409567, + "grad_norm": 2.214749336242676, + "learning_rate": 1.8466599083131894e-05, + "loss": 0.0525, + "step": 11775 + }, + { + "epoch": 0.5255422437981562, + "grad_norm": 0.1398187279701233, + "learning_rate": 1.8396855223056505e-05, + "loss": 0.1128, + "step": 11800 + }, + { + "epoch": 0.5266556807553556, + "grad_norm": 0.05570574477314949, + "learning_rate": 1.8327130978658565e-05, + "loss": 0.0942, + "step": 11825 + }, + { + "epoch": 0.5277691177125551, + "grad_norm": 0.015549311414361, + "learning_rate": 1.825742720306644e-05, + "loss": 0.0127, + "step": 11850 + }, + { + "epoch": 0.5288825546697546, + "grad_norm": 0.048954132944345474, + "learning_rate": 1.8187744749158055e-05, + "loss": 0.0417, + "step": 11875 + }, + { + "epoch": 0.5299959916269541, + "grad_norm": 2.6331558227539062, + "learning_rate": 1.8118084469550446e-05, + "loss": 0.0608, + "step": 11900 + }, + { + "epoch": 0.5311094285841536, + "grad_norm": 0.16815824806690216, + "learning_rate": 1.8048447216589316e-05, + "loss": 0.0504, + "step": 11925 + }, + { + "epoch": 0.532222865541353, + "grad_norm": 31.2311954498291, + "learning_rate": 1.7978833842338625e-05, + "loss": 0.1608, + "step": 11950 + }, + { + "epoch": 0.5333363024985526, + "grad_norm": 0.21393436193466187, + "learning_rate": 1.7909245198570165e-05, + "loss": 0.087, + "step": 11975 + }, + { + "epoch": 0.534449739455752, + "grad_norm": 0.7251951694488525, + "learning_rate": 1.7839682136753143e-05, + "loss": 0.0518, + "step": 12000 + }, + { + "epoch": 0.5355631764129515, + "grad_norm": 13.856010437011719, + "learning_rate": 1.7770145508043722e-05, + "loss": 0.07, + "step": 12025 + }, + { + "epoch": 0.5366766133701509, + "grad_norm": 0.03707493096590042, + "learning_rate": 1.7700636163274657e-05, + "loss": 0.1027, + "step": 12050 + }, + { + "epoch": 0.5377900503273505, + "grad_norm": 0.33082666993141174, + "learning_rate": 1.763115495294486e-05, + "loss": 0.0741, + "step": 12075 + }, + { + "epoch": 0.53890348728455, + "grad_norm": 1.9731577634811401, + "learning_rate": 1.7561702727209e-05, + "loss": 0.0381, + "step": 12100 + }, + { + "epoch": 0.5400169242417494, + "grad_norm": 0.016690315678715706, + "learning_rate": 1.749228033586709e-05, + "loss": 0.0613, + "step": 12125 + }, + { + "epoch": 0.5411303611989489, + "grad_norm": 11.721412658691406, + "learning_rate": 1.7422888628354095e-05, + "loss": 0.0957, + "step": 12150 + }, + { + "epoch": 0.5422437981561484, + "grad_norm": 0.08547116816043854, + "learning_rate": 1.7353528453729552e-05, + "loss": 0.0852, + "step": 12175 + }, + { + "epoch": 0.5433572351133479, + "grad_norm": 0.03922642022371292, + "learning_rate": 1.728420066066717e-05, + "loss": 0.0678, + "step": 12200 + }, + { + "epoch": 0.5444706720705473, + "grad_norm": 20.77050018310547, + "learning_rate": 1.7214906097444425e-05, + "loss": 0.0576, + "step": 12225 + }, + { + "epoch": 0.5455841090277469, + "grad_norm": 0.051048461347818375, + "learning_rate": 1.7145645611932218e-05, + "loss": 0.0796, + "step": 12250 + }, + { + "epoch": 0.5466975459849464, + "grad_norm": 0.5286572575569153, + "learning_rate": 1.707642005158449e-05, + "loss": 0.0701, + "step": 12275 + }, + { + "epoch": 0.5478109829421458, + "grad_norm": 0.10311714559793472, + "learning_rate": 1.7007230263427834e-05, + "loss": 0.0738, + "step": 12300 + }, + { + "epoch": 0.5489244198993453, + "grad_norm": 98.99889373779297, + "learning_rate": 1.693807709405114e-05, + "loss": 0.067, + "step": 12325 + }, + { + "epoch": 0.5500378568565448, + "grad_norm": 10.721565246582031, + "learning_rate": 1.686896138959524e-05, + "loss": 0.0247, + "step": 12350 + }, + { + "epoch": 0.5511512938137443, + "grad_norm": 0.025939449667930603, + "learning_rate": 1.679988399574258e-05, + "loss": 0.043, + "step": 12375 + }, + { + "epoch": 0.5522647307709437, + "grad_norm": 0.05403141677379608, + "learning_rate": 1.673084575770682e-05, + "loss": 0.0906, + "step": 12400 + }, + { + "epoch": 0.5533781677281432, + "grad_norm": 0.1890357881784439, + "learning_rate": 1.666184752022252e-05, + "loss": 0.1179, + "step": 12425 + }, + { + "epoch": 0.5544916046853428, + "grad_norm": 0.15934480726718903, + "learning_rate": 1.659289012753481e-05, + "loss": 0.0444, + "step": 12450 + }, + { + "epoch": 0.5556050416425422, + "grad_norm": 0.10561876744031906, + "learning_rate": 1.6523974423389068e-05, + "loss": 0.014, + "step": 12475 + }, + { + "epoch": 0.5567184785997417, + "grad_norm": 0.5226536989212036, + "learning_rate": 1.645510125102056e-05, + "loss": 0.0433, + "step": 12500 + }, + { + "epoch": 0.5578319155569411, + "grad_norm": 0.10878469049930573, + "learning_rate": 1.6386271453144144e-05, + "loss": 0.1258, + "step": 12525 + }, + { + "epoch": 0.5589453525141407, + "grad_norm": 0.44833049178123474, + "learning_rate": 1.6317485871943966e-05, + "loss": 0.1303, + "step": 12550 + }, + { + "epoch": 0.5600587894713401, + "grad_norm": 0.02392134629189968, + "learning_rate": 1.624874534906316e-05, + "loss": 0.0989, + "step": 12575 + }, + { + "epoch": 0.5611722264285396, + "grad_norm": 0.05233491584658623, + "learning_rate": 1.6180050725593516e-05, + "loss": 0.0654, + "step": 12600 + }, + { + "epoch": 0.5622856633857392, + "grad_norm": 1.7143317461013794, + "learning_rate": 1.6111402842065213e-05, + "loss": 0.1005, + "step": 12625 + }, + { + "epoch": 0.5633991003429386, + "grad_norm": 0.02804090641438961, + "learning_rate": 1.604280253843655e-05, + "loss": 0.0328, + "step": 12650 + }, + { + "epoch": 0.5645125373001381, + "grad_norm": 0.04471449553966522, + "learning_rate": 1.5974250654083643e-05, + "loss": 0.125, + "step": 12675 + }, + { + "epoch": 0.5656259742573375, + "grad_norm": 0.04020120948553085, + "learning_rate": 1.5905748027790145e-05, + "loss": 0.0254, + "step": 12700 + }, + { + "epoch": 0.5667394112145371, + "grad_norm": 9.090320587158203, + "learning_rate": 1.5837295497737005e-05, + "loss": 0.1001, + "step": 12725 + }, + { + "epoch": 0.5678528481717365, + "grad_norm": 0.017459219321608543, + "learning_rate": 1.5768893901492238e-05, + "loss": 0.0815, + "step": 12750 + }, + { + "epoch": 0.568966285128936, + "grad_norm": 0.14465971291065216, + "learning_rate": 1.5700544076000604e-05, + "loss": 0.0787, + "step": 12775 + }, + { + "epoch": 0.5700797220861354, + "grad_norm": 0.27508795261383057, + "learning_rate": 1.5632246857573434e-05, + "loss": 0.0788, + "step": 12800 + }, + { + "epoch": 0.571193159043335, + "grad_norm": 0.668519914150238, + "learning_rate": 1.5564003081878355e-05, + "loss": 0.0925, + "step": 12825 + }, + { + "epoch": 0.5723065960005345, + "grad_norm": 0.02386564202606678, + "learning_rate": 1.5495813583929105e-05, + "loss": 0.0432, + "step": 12850 + }, + { + "epoch": 0.5734200329577339, + "grad_norm": 0.31950095295906067, + "learning_rate": 1.5427679198075277e-05, + "loss": 0.0472, + "step": 12875 + }, + { + "epoch": 0.5745334699149334, + "grad_norm": 0.013991210609674454, + "learning_rate": 1.5359600757992127e-05, + "loss": 0.1124, + "step": 12900 + }, + { + "epoch": 0.5756469068721329, + "grad_norm": 0.05440308153629303, + "learning_rate": 1.529157909667037e-05, + "loss": 0.0085, + "step": 12925 + }, + { + "epoch": 0.5767603438293324, + "grad_norm": 132.2735137939453, + "learning_rate": 1.5223615046406007e-05, + "loss": 0.0712, + "step": 12950 + }, + { + "epoch": 0.5778737807865318, + "grad_norm": 0.03499956429004669, + "learning_rate": 1.5155709438790105e-05, + "loss": 0.0275, + "step": 12975 + }, + { + "epoch": 0.5789872177437313, + "grad_norm": 13.37109088897705, + "learning_rate": 1.5087863104698653e-05, + "loss": 0.0884, + "step": 13000 + }, + { + "epoch": 0.5801006547009309, + "grad_norm": 0.1880478411912918, + "learning_rate": 1.502007687428237e-05, + "loss": 0.046, + "step": 13025 + }, + { + "epoch": 0.5812140916581303, + "grad_norm": 0.01652064546942711, + "learning_rate": 1.4952351576956587e-05, + "loss": 0.0456, + "step": 13050 + }, + { + "epoch": 0.5823275286153298, + "grad_norm": 0.05549848824739456, + "learning_rate": 1.4884688041391053e-05, + "loss": 0.0455, + "step": 13075 + }, + { + "epoch": 0.5834409655725293, + "grad_norm": 0.1668526977300644, + "learning_rate": 1.4817087095499809e-05, + "loss": 0.0865, + "step": 13100 + }, + { + "epoch": 0.5845544025297288, + "grad_norm": 0.06226930022239685, + "learning_rate": 1.4749549566431092e-05, + "loss": 0.0033, + "step": 13125 + }, + { + "epoch": 0.5856678394869282, + "grad_norm": 0.2953890860080719, + "learning_rate": 1.4682076280557167e-05, + "loss": 0.063, + "step": 13150 + }, + { + "epoch": 0.5867812764441277, + "grad_norm": 0.024164235219359398, + "learning_rate": 1.4614668063464235e-05, + "loss": 0.0394, + "step": 13175 + }, + { + "epoch": 0.5878947134013273, + "grad_norm": 0.04861380532383919, + "learning_rate": 1.4547325739942327e-05, + "loss": 0.1269, + "step": 13200 + }, + { + "epoch": 0.5890081503585267, + "grad_norm": 0.2787506878376007, + "learning_rate": 1.4480050133975242e-05, + "loss": 0.0625, + "step": 13225 + }, + { + "epoch": 0.5901215873157262, + "grad_norm": 0.12541650235652924, + "learning_rate": 1.4412842068730419e-05, + "loss": 0.0756, + "step": 13250 + }, + { + "epoch": 0.5912350242729256, + "grad_norm": 0.06487028300762177, + "learning_rate": 1.4345702366548884e-05, + "loss": 0.0956, + "step": 13275 + }, + { + "epoch": 0.5923484612301252, + "grad_norm": 0.023417694494128227, + "learning_rate": 1.4278631848935192e-05, + "loss": 0.0588, + "step": 13300 + }, + { + "epoch": 0.5934618981873246, + "grad_norm": 0.04044019430875778, + "learning_rate": 1.4211631336547389e-05, + "loss": 0.0581, + "step": 13325 + }, + { + "epoch": 0.5945753351445241, + "grad_norm": 0.13640573620796204, + "learning_rate": 1.4144701649186937e-05, + "loss": 0.0508, + "step": 13350 + }, + { + "epoch": 0.5956887721017236, + "grad_norm": 0.17478786408901215, + "learning_rate": 1.4077843605788701e-05, + "loss": 0.0371, + "step": 13375 + }, + { + "epoch": 0.5968022090589231, + "grad_norm": 0.09428933262825012, + "learning_rate": 1.4011058024410931e-05, + "loss": 0.0125, + "step": 13400 + }, + { + "epoch": 0.5979156460161226, + "grad_norm": 0.01731252484023571, + "learning_rate": 1.3944345722225268e-05, + "loss": 0.0668, + "step": 13425 + }, + { + "epoch": 0.599029082973322, + "grad_norm": 0.017465805634856224, + "learning_rate": 1.38777075155067e-05, + "loss": 0.1256, + "step": 13450 + }, + { + "epoch": 0.6001425199305215, + "grad_norm": 0.028824403882026672, + "learning_rate": 1.3811144219623613e-05, + "loss": 0.1192, + "step": 13475 + }, + { + "epoch": 0.601255956887721, + "grad_norm": 0.2588452696800232, + "learning_rate": 1.3744656649027795e-05, + "loss": 0.0677, + "step": 13500 + }, + { + "epoch": 0.6023693938449205, + "grad_norm": 0.2693833112716675, + "learning_rate": 1.3678245617244503e-05, + "loss": 0.0943, + "step": 13525 + }, + { + "epoch": 0.60348283080212, + "grad_norm": 0.055959224700927734, + "learning_rate": 1.361191193686246e-05, + "loss": 0.0645, + "step": 13550 + }, + { + "epoch": 0.6045962677593195, + "grad_norm": 12.030915260314941, + "learning_rate": 1.354565641952393e-05, + "loss": 0.0828, + "step": 13575 + }, + { + "epoch": 0.605709704716519, + "grad_norm": 153.66726684570312, + "learning_rate": 1.3479479875914834e-05, + "loss": 0.0849, + "step": 13600 + }, + { + "epoch": 0.6068231416737184, + "grad_norm": 0.015734700486063957, + "learning_rate": 1.3413383115754748e-05, + "loss": 0.0872, + "step": 13625 + }, + { + "epoch": 0.6079365786309179, + "grad_norm": 0.16204610466957092, + "learning_rate": 1.3347366947787054e-05, + "loss": 0.1388, + "step": 13650 + }, + { + "epoch": 0.6090500155881174, + "grad_norm": 0.10918817669153214, + "learning_rate": 1.3281432179769023e-05, + "loss": 0.1072, + "step": 13675 + }, + { + "epoch": 0.6101634525453169, + "grad_norm": 0.25966718792915344, + "learning_rate": 1.3215579618461955e-05, + "loss": 0.1107, + "step": 13700 + }, + { + "epoch": 0.6112768895025164, + "grad_norm": 0.11825607717037201, + "learning_rate": 1.3149810069621265e-05, + "loss": 0.0657, + "step": 13725 + }, + { + "epoch": 0.6123903264597158, + "grad_norm": 0.08822792768478394, + "learning_rate": 1.3084124337986666e-05, + "loss": 0.0468, + "step": 13750 + }, + { + "epoch": 0.6135037634169154, + "grad_norm": 0.02553175576031208, + "learning_rate": 1.301852322727228e-05, + "loss": 0.0874, + "step": 13775 + }, + { + "epoch": 0.6146172003741148, + "grad_norm": 0.3989555537700653, + "learning_rate": 1.295300754015687e-05, + "loss": 0.0379, + "step": 13800 + }, + { + "epoch": 0.6157306373313143, + "grad_norm": 0.04876472055912018, + "learning_rate": 1.2887578078273944e-05, + "loss": 0.0972, + "step": 13825 + }, + { + "epoch": 0.6168440742885137, + "grad_norm": 0.776457667350769, + "learning_rate": 1.2822235642201983e-05, + "loss": 0.0757, + "step": 13850 + }, + { + "epoch": 0.6179575112457133, + "grad_norm": 1.26872718334198, + "learning_rate": 1.275698103145465e-05, + "loss": 0.0438, + "step": 13875 + }, + { + "epoch": 0.6190709482029128, + "grad_norm": 7.4887309074401855, + "learning_rate": 1.2691815044471005e-05, + "loss": 0.0944, + "step": 13900 + }, + { + "epoch": 0.6201843851601122, + "grad_norm": 0.061836518347263336, + "learning_rate": 1.262673847860572e-05, + "loss": 0.0507, + "step": 13925 + }, + { + "epoch": 0.6212978221173118, + "grad_norm": 31.68821144104004, + "learning_rate": 1.2561752130119325e-05, + "loss": 0.1315, + "step": 13950 + }, + { + "epoch": 0.6224112590745112, + "grad_norm": 1.8287922143936157, + "learning_rate": 1.2496856794168493e-05, + "loss": 0.0173, + "step": 13975 + }, + { + "epoch": 0.6235246960317107, + "grad_norm": 0.07127226889133453, + "learning_rate": 1.243205326479628e-05, + "loss": 0.1028, + "step": 14000 + }, + { + "epoch": 0.6246381329889101, + "grad_norm": 0.01413736306130886, + "learning_rate": 1.2367342334922419e-05, + "loss": 0.0194, + "step": 14025 + }, + { + "epoch": 0.6257515699461097, + "grad_norm": 0.01650845631957054, + "learning_rate": 1.2302724796333611e-05, + "loss": 0.0083, + "step": 14050 + }, + { + "epoch": 0.6268650069033092, + "grad_norm": 0.01144505012780428, + "learning_rate": 1.223820143967386e-05, + "loss": 0.0715, + "step": 14075 + }, + { + "epoch": 0.6279784438605086, + "grad_norm": 43.129913330078125, + "learning_rate": 1.2173773054434786e-05, + "loss": 0.1128, + "step": 14100 + }, + { + "epoch": 0.6290918808177081, + "grad_norm": 0.40030625462532043, + "learning_rate": 1.2109440428945937e-05, + "loss": 0.0995, + "step": 14125 + }, + { + "epoch": 0.6302053177749076, + "grad_norm": 0.14804595708847046, + "learning_rate": 1.2045204350365194e-05, + "loss": 0.077, + "step": 14150 + }, + { + "epoch": 0.6313187547321071, + "grad_norm": 8.412322044372559, + "learning_rate": 1.1981065604669102e-05, + "loss": 0.0816, + "step": 14175 + }, + { + "epoch": 0.6324321916893065, + "grad_norm": 0.00858822651207447, + "learning_rate": 1.1917024976643274e-05, + "loss": 0.0158, + "step": 14200 + }, + { + "epoch": 0.633545628646506, + "grad_norm": 30.407943725585938, + "learning_rate": 1.1853083249872768e-05, + "loss": 0.033, + "step": 14225 + }, + { + "epoch": 0.6346590656037056, + "grad_norm": 0.0964275524020195, + "learning_rate": 1.1789241206732519e-05, + "loss": 0.0953, + "step": 14250 + }, + { + "epoch": 0.635772502560905, + "grad_norm": 0.0621681734919548, + "learning_rate": 1.1725499628377759e-05, + "loss": 0.0566, + "step": 14275 + }, + { + "epoch": 0.6368859395181045, + "grad_norm": 91.5651626586914, + "learning_rate": 1.1661859294734454e-05, + "loss": 0.1282, + "step": 14300 + }, + { + "epoch": 0.6379993764753039, + "grad_norm": 20.94974708557129, + "learning_rate": 1.1598320984489767e-05, + "loss": 0.079, + "step": 14325 + }, + { + "epoch": 0.6391128134325035, + "grad_norm": 0.06739246100187302, + "learning_rate": 1.153488547508252e-05, + "loss": 0.0762, + "step": 14350 + }, + { + "epoch": 0.6402262503897029, + "grad_norm": 0.02148587629199028, + "learning_rate": 1.1471553542693715e-05, + "loss": 0.0377, + "step": 14375 + }, + { + "epoch": 0.6413396873469024, + "grad_norm": 37.625404357910156, + "learning_rate": 1.140832596223698e-05, + "loss": 0.0565, + "step": 14400 + }, + { + "epoch": 0.642453124304102, + "grad_norm": 0.10515021532773972, + "learning_rate": 1.134520350734913e-05, + "loss": 0.0788, + "step": 14425 + }, + { + "epoch": 0.6435665612613014, + "grad_norm": 0.11388204246759415, + "learning_rate": 1.1282186950380711e-05, + "loss": 0.0452, + "step": 14450 + }, + { + "epoch": 0.6446799982185009, + "grad_norm": 0.4367184340953827, + "learning_rate": 1.1219277062386497e-05, + "loss": 0.0818, + "step": 14475 + }, + { + "epoch": 0.6457934351757003, + "grad_norm": 33.38430404663086, + "learning_rate": 1.1156474613116106e-05, + "loss": 0.0526, + "step": 14500 + }, + { + "epoch": 0.6469068721328999, + "grad_norm": 7.543337345123291, + "learning_rate": 1.1093780371004548e-05, + "loss": 0.1187, + "step": 14525 + }, + { + "epoch": 0.6480203090900993, + "grad_norm": 0.032384540885686874, + "learning_rate": 1.103119510316286e-05, + "loss": 0.0276, + "step": 14550 + }, + { + "epoch": 0.6491337460472988, + "grad_norm": 11.106677055358887, + "learning_rate": 1.0968719575368679e-05, + "loss": 0.0858, + "step": 14575 + }, + { + "epoch": 0.6502471830044982, + "grad_norm": 101.50527954101562, + "learning_rate": 1.0906354552056895e-05, + "loss": 0.0249, + "step": 14600 + }, + { + "epoch": 0.6513606199616978, + "grad_norm": 16.35914421081543, + "learning_rate": 1.0844100796310287e-05, + "loss": 0.023, + "step": 14625 + }, + { + "epoch": 0.6524740569188973, + "grad_norm": 0.3573369085788727, + "learning_rate": 1.078195906985021e-05, + "loss": 0.0342, + "step": 14650 + }, + { + "epoch": 0.6535874938760967, + "grad_norm": 0.01951185241341591, + "learning_rate": 1.0719930133027238e-05, + "loss": 0.0695, + "step": 14675 + }, + { + "epoch": 0.6547009308332962, + "grad_norm": 0.006902535445988178, + "learning_rate": 1.0658014744811892e-05, + "loss": 0.0251, + "step": 14700 + }, + { + "epoch": 0.6558143677904957, + "grad_norm": 17.231019973754883, + "learning_rate": 1.059621366278532e-05, + "loss": 0.1581, + "step": 14725 + }, + { + "epoch": 0.6569278047476952, + "grad_norm": 0.09472651034593582, + "learning_rate": 1.0534527643130082e-05, + "loss": 0.0712, + "step": 14750 + }, + { + "epoch": 0.6580412417048946, + "grad_norm": 0.022969521582126617, + "learning_rate": 1.0472957440620837e-05, + "loss": 0.097, + "step": 14775 + }, + { + "epoch": 0.6591546786620942, + "grad_norm": 0.44400086998939514, + "learning_rate": 1.0411503808615131e-05, + "loss": 0.0358, + "step": 14800 + }, + { + "epoch": 0.6602681156192937, + "grad_norm": 0.04555133730173111, + "learning_rate": 1.0350167499044216e-05, + "loss": 0.0573, + "step": 14825 + }, + { + "epoch": 0.6613815525764931, + "grad_norm": 0.021451586857438087, + "learning_rate": 1.028894926240378e-05, + "loss": 0.0344, + "step": 14850 + }, + { + "epoch": 0.6624949895336926, + "grad_norm": 9.742546081542969, + "learning_rate": 1.0227849847744817e-05, + "loss": 0.0721, + "step": 14875 + }, + { + "epoch": 0.6636084264908921, + "grad_norm": 2.3758695125579834, + "learning_rate": 1.0166870002664432e-05, + "loss": 0.0478, + "step": 14900 + }, + { + "epoch": 0.6647218634480916, + "grad_norm": 0.03618834540247917, + "learning_rate": 1.0106010473296737e-05, + "loss": 0.0711, + "step": 14925 + }, + { + "epoch": 0.665835300405291, + "grad_norm": 0.051780980080366135, + "learning_rate": 1.004527200430366e-05, + "loss": 0.0579, + "step": 14950 + }, + { + "epoch": 0.6669487373624905, + "grad_norm": 0.17397186160087585, + "learning_rate": 9.984655338865874e-06, + "loss": 0.0528, + "step": 14975 + }, + { + "epoch": 0.6680621743196901, + "grad_norm": 0.010688379406929016, + "learning_rate": 9.924161218673689e-06, + "loss": 0.0364, + "step": 15000 + }, + { + "epoch": 0.6691756112768895, + "grad_norm": 0.10326721519231796, + "learning_rate": 9.863790383918007e-06, + "loss": 0.0841, + "step": 15025 + }, + { + "epoch": 0.670289048234089, + "grad_norm": 0.006623438559472561, + "learning_rate": 9.80354357328121e-06, + "loss": 0.1098, + "step": 15050 + }, + { + "epoch": 0.6714024851912884, + "grad_norm": 0.1552818864583969, + "learning_rate": 9.743421523928172e-06, + "loss": 0.0705, + "step": 15075 + }, + { + "epoch": 0.672515922148488, + "grad_norm": 0.34292566776275635, + "learning_rate": 9.683424971497195e-06, + "loss": 0.1153, + "step": 15100 + }, + { + "epoch": 0.6736293591056874, + "grad_norm": 0.5139630436897278, + "learning_rate": 9.623554650091066e-06, + "loss": 0.0384, + "step": 15125 + }, + { + "epoch": 0.6747427960628869, + "grad_norm": 0.44938117265701294, + "learning_rate": 9.563811292268019e-06, + "loss": 0.0338, + "step": 15150 + }, + { + "epoch": 0.6758562330200863, + "grad_norm": 0.04680267721414566, + "learning_rate": 9.504195629032795e-06, + "loss": 0.0282, + "step": 15175 + }, + { + "epoch": 0.6769696699772859, + "grad_norm": 0.7540410757064819, + "learning_rate": 9.444708389827693e-06, + "loss": 0.0763, + "step": 15200 + }, + { + "epoch": 0.6780831069344854, + "grad_norm": 0.027341516688466072, + "learning_rate": 9.385350302523667e-06, + "loss": 0.0413, + "step": 15225 + }, + { + "epoch": 0.6791965438916848, + "grad_norm": 39.69243240356445, + "learning_rate": 9.326122093411378e-06, + "loss": 0.0971, + "step": 15250 + }, + { + "epoch": 0.6803099808488844, + "grad_norm": 0.05220544710755348, + "learning_rate": 9.267024487192338e-06, + "loss": 0.0582, + "step": 15275 + }, + { + "epoch": 0.6814234178060838, + "grad_norm": 0.011502075009047985, + "learning_rate": 9.208058206970045e-06, + "loss": 0.0376, + "step": 15300 + }, + { + "epoch": 0.6825368547632833, + "grad_norm": 0.31533384323120117, + "learning_rate": 9.149223974241113e-06, + "loss": 0.1116, + "step": 15325 + }, + { + "epoch": 0.6836502917204828, + "grad_norm": 0.1892848163843155, + "learning_rate": 9.090522508886457e-06, + "loss": 0.0357, + "step": 15350 + }, + { + "epoch": 0.6847637286776823, + "grad_norm": 0.021438557654619217, + "learning_rate": 9.031954529162477e-06, + "loss": 0.0801, + "step": 15375 + }, + { + "epoch": 0.6858771656348818, + "grad_norm": 0.0905388817191124, + "learning_rate": 8.973520751692304e-06, + "loss": 0.0151, + "step": 15400 + }, + { + "epoch": 0.6869906025920812, + "grad_norm": 0.02208125777542591, + "learning_rate": 8.915221891456974e-06, + "loss": 0.0501, + "step": 15425 + }, + { + "epoch": 0.6881040395492807, + "grad_norm": 0.3321913480758667, + "learning_rate": 8.857058661786722e-06, + "loss": 0.0797, + "step": 15450 + }, + { + "epoch": 0.6892174765064802, + "grad_norm": 0.16185687482357025, + "learning_rate": 8.799031774352231e-06, + "loss": 0.0608, + "step": 15475 + }, + { + "epoch": 0.6903309134636797, + "grad_norm": 0.011803328059613705, + "learning_rate": 8.741141939155961e-06, + "loss": 0.013, + "step": 15500 + }, + { + "epoch": 0.6914443504208792, + "grad_norm": 0.013166255317628384, + "learning_rate": 8.683389864523409e-06, + "loss": 0.0187, + "step": 15525 + }, + { + "epoch": 0.6925577873780786, + "grad_norm": 0.06444378197193146, + "learning_rate": 8.625776257094477e-06, + "loss": 0.0295, + "step": 15550 + }, + { + "epoch": 0.6936712243352782, + "grad_norm": 20.706222534179688, + "learning_rate": 8.568301821814808e-06, + "loss": 0.0995, + "step": 15575 + }, + { + "epoch": 0.6947846612924776, + "grad_norm": 51.754207611083984, + "learning_rate": 8.510967261927199e-06, + "loss": 0.1911, + "step": 15600 + }, + { + "epoch": 0.6958980982496771, + "grad_norm": 0.015366621315479279, + "learning_rate": 8.453773278962932e-06, + "loss": 0.067, + "step": 15625 + }, + { + "epoch": 0.6970115352068765, + "grad_norm": 8.943309783935547, + "learning_rate": 8.396720572733243e-06, + "loss": 0.117, + "step": 15650 + }, + { + "epoch": 0.6981249721640761, + "grad_norm": 0.11915289610624313, + "learning_rate": 8.339809841320732e-06, + "loss": 0.0645, + "step": 15675 + }, + { + "epoch": 0.6992384091212756, + "grad_norm": 0.09671024978160858, + "learning_rate": 8.283041781070847e-06, + "loss": 0.0928, + "step": 15700 + }, + { + "epoch": 0.700351846078475, + "grad_norm": 0.03198957443237305, + "learning_rate": 8.22641708658333e-06, + "loss": 0.1054, + "step": 15725 + }, + { + "epoch": 0.7014652830356746, + "grad_norm": 68.7297592163086, + "learning_rate": 8.169936450703737e-06, + "loss": 0.0969, + "step": 15750 + }, + { + "epoch": 0.702578719992874, + "grad_norm": 0.02619253285229206, + "learning_rate": 8.113600564514979e-06, + "loss": 0.0013, + "step": 15775 + }, + { + "epoch": 0.7036921569500735, + "grad_norm": 0.6842272281646729, + "learning_rate": 8.057410117328823e-06, + "loss": 0.0443, + "step": 15800 + }, + { + "epoch": 0.7048055939072729, + "grad_norm": 132.8560333251953, + "learning_rate": 8.00136579667749e-06, + "loss": 0.0347, + "step": 15825 + }, + { + "epoch": 0.7059190308644725, + "grad_norm": 0.31542474031448364, + "learning_rate": 7.945468288305226e-06, + "loss": 0.0402, + "step": 15850 + }, + { + "epoch": 0.707032467821672, + "grad_norm": 0.1650291383266449, + "learning_rate": 7.889718276159937e-06, + "loss": 0.1061, + "step": 15875 + }, + { + "epoch": 0.7081459047788714, + "grad_norm": 0.1369670331478119, + "learning_rate": 7.834116442384785e-06, + "loss": 0.0693, + "step": 15900 + }, + { + "epoch": 0.7092593417360709, + "grad_norm": 17.667673110961914, + "learning_rate": 7.778663467309865e-06, + "loss": 0.047, + "step": 15925 + }, + { + "epoch": 0.7103727786932704, + "grad_norm": 10.790023803710938, + "learning_rate": 7.723360029443865e-06, + "loss": 0.126, + "step": 15950 + }, + { + "epoch": 0.7114862156504699, + "grad_norm": 0.010408015921711922, + "learning_rate": 7.668206805465797e-06, + "loss": 0.0677, + "step": 15975 + }, + { + "epoch": 0.7125996526076693, + "grad_norm": 15.175890922546387, + "learning_rate": 7.613204470216671e-06, + "loss": 0.0762, + "step": 16000 + }, + { + "epoch": 0.7137130895648688, + "grad_norm": 0.09815490990877151, + "learning_rate": 7.558353696691267e-06, + "loss": 0.033, + "step": 16025 + }, + { + "epoch": 0.7148265265220684, + "grad_norm": 0.23362314701080322, + "learning_rate": 7.503655156029885e-06, + "loss": 0.0456, + "step": 16050 + }, + { + "epoch": 0.7159399634792678, + "grad_norm": 0.04253578558564186, + "learning_rate": 7.449109517510171e-06, + "loss": 0.0295, + "step": 16075 + }, + { + "epoch": 0.7170534004364673, + "grad_norm": 0.08314715325832367, + "learning_rate": 7.3947174485388666e-06, + "loss": 0.0577, + "step": 16100 + }, + { + "epoch": 0.7181668373936668, + "grad_norm": 0.19052563607692719, + "learning_rate": 7.340479614643681e-06, + "loss": 0.0474, + "step": 16125 + }, + { + "epoch": 0.7192802743508663, + "grad_norm": 1.0644758939743042, + "learning_rate": 7.286396679465164e-06, + "loss": 0.1031, + "step": 16150 + }, + { + "epoch": 0.7203937113080657, + "grad_norm": 0.02732575498521328, + "learning_rate": 7.232469304748539e-06, + "loss": 0.0895, + "step": 16175 + }, + { + "epoch": 0.7215071482652652, + "grad_norm": 83.3663101196289, + "learning_rate": 7.178698150335641e-06, + "loss": 0.083, + "step": 16200 + }, + { + "epoch": 0.7226205852224648, + "grad_norm": 0.3653166890144348, + "learning_rate": 7.125083874156819e-06, + "loss": 0.0565, + "step": 16225 + }, + { + "epoch": 0.7237340221796642, + "grad_norm": 0.18628665804862976, + "learning_rate": 7.071627132222931e-06, + "loss": 0.0314, + "step": 16250 + }, + { + "epoch": 0.7248474591368637, + "grad_norm": 0.016880426555871964, + "learning_rate": 7.018328578617253e-06, + "loss": 0.0562, + "step": 16275 + }, + { + "epoch": 0.7259608960940631, + "grad_norm": 0.044409096240997314, + "learning_rate": 6.965188865487522e-06, + "loss": 0.0471, + "step": 16300 + }, + { + "epoch": 0.7270743330512627, + "grad_norm": 21.80831527709961, + "learning_rate": 6.912208643037937e-06, + "loss": 0.1115, + "step": 16325 + }, + { + "epoch": 0.7281877700084621, + "grad_norm": 0.017523668706417084, + "learning_rate": 6.859388559521225e-06, + "loss": 0.0268, + "step": 16350 + }, + { + "epoch": 0.7293012069656616, + "grad_norm": 201.114013671875, + "learning_rate": 6.806729261230676e-06, + "loss": 0.0669, + "step": 16375 + }, + { + "epoch": 0.730414643922861, + "grad_norm": 0.01862688735127449, + "learning_rate": 6.754231392492254e-06, + "loss": 0.0437, + "step": 16400 + }, + { + "epoch": 0.7315280808800606, + "grad_norm": 0.026870861649513245, + "learning_rate": 6.701895595656715e-06, + "loss": 0.0957, + "step": 16425 + }, + { + "epoch": 0.7326415178372601, + "grad_norm": 0.07133554667234421, + "learning_rate": 6.6497225110917515e-06, + "loss": 0.0566, + "step": 16450 + }, + { + "epoch": 0.7337549547944595, + "grad_norm": 48.52125930786133, + "learning_rate": 6.597712777174141e-06, + "loss": 0.0401, + "step": 16475 + }, + { + "epoch": 0.734868391751659, + "grad_norm": 20.256444931030273, + "learning_rate": 6.54586703028194e-06, + "loss": 0.0592, + "step": 16500 + }, + { + "epoch": 0.7359818287088585, + "grad_norm": 22.120195388793945, + "learning_rate": 6.494185904786703e-06, + "loss": 0.1089, + "step": 16525 + }, + { + "epoch": 0.737095265666058, + "grad_norm": 0.08727439492940903, + "learning_rate": 6.442670033045731e-06, + "loss": 0.0479, + "step": 16550 + }, + { + "epoch": 0.7382087026232574, + "grad_norm": 0.47929519414901733, + "learning_rate": 6.391320045394305e-06, + "loss": 0.1064, + "step": 16575 + }, + { + "epoch": 0.739322139580457, + "grad_norm": 0.8217176795005798, + "learning_rate": 6.340136570137991e-06, + "loss": 0.0677, + "step": 16600 + }, + { + "epoch": 0.7404355765376565, + "grad_norm": 16.746612548828125, + "learning_rate": 6.28912023354497e-06, + "loss": 0.0475, + "step": 16625 + }, + { + "epoch": 0.7415490134948559, + "grad_norm": 0.14232224225997925, + "learning_rate": 6.2382716598383355e-06, + "loss": 0.0882, + "step": 16650 + }, + { + "epoch": 0.7426624504520554, + "grad_norm": 0.030962107703089714, + "learning_rate": 6.187591471188481e-06, + "loss": 0.0495, + "step": 16675 + }, + { + "epoch": 0.7437758874092549, + "grad_norm": 0.061680518090724945, + "learning_rate": 6.137080287705481e-06, + "loss": 0.0929, + "step": 16700 + }, + { + "epoch": 0.7448893243664544, + "grad_norm": 17.700639724731445, + "learning_rate": 6.086738727431523e-06, + "loss": 0.0528, + "step": 16725 + }, + { + "epoch": 0.7460027613236538, + "grad_norm": 1.2548288106918335, + "learning_rate": 6.0365674063333046e-06, + "loss": 0.0833, + "step": 16750 + }, + { + "epoch": 0.7471161982808533, + "grad_norm": 0.4032031297683716, + "learning_rate": 5.986566938294532e-06, + "loss": 0.0506, + "step": 16775 + }, + { + "epoch": 0.7482296352380529, + "grad_norm": 0.03656847029924393, + "learning_rate": 5.936737935108383e-06, + "loss": 0.0262, + "step": 16800 + }, + { + "epoch": 0.7493430721952523, + "grad_norm": 0.02097596973180771, + "learning_rate": 5.887081006470061e-06, + "loss": 0.058, + "step": 16825 + }, + { + "epoch": 0.7504565091524518, + "grad_norm": 0.11637075990438461, + "learning_rate": 5.837596759969281e-06, + "loss": 0.0949, + "step": 16850 + }, + { + "epoch": 0.7515699461096512, + "grad_norm": 0.06315034627914429, + "learning_rate": 5.788285801082878e-06, + "loss": 0.0304, + "step": 16875 + }, + { + "epoch": 0.7526833830668508, + "grad_norm": 0.11649344861507416, + "learning_rate": 5.73914873316737e-06, + "loss": 0.0368, + "step": 16900 + }, + { + "epoch": 0.7537968200240502, + "grad_norm": 0.03985821083188057, + "learning_rate": 5.6901861574516115e-06, + "loss": 0.0074, + "step": 16925 + }, + { + "epoch": 0.7549102569812497, + "grad_norm": 0.016696982085704803, + "learning_rate": 5.64139867302939e-06, + "loss": 0.0675, + "step": 16950 + }, + { + "epoch": 0.7560236939384493, + "grad_norm": 0.011424514465034008, + "learning_rate": 5.592786876852127e-06, + "loss": 0.0303, + "step": 16975 + }, + { + "epoch": 0.7571371308956487, + "grad_norm": 15.772963523864746, + "learning_rate": 5.54435136372158e-06, + "loss": 0.0487, + "step": 17000 + }, + { + "epoch": 0.7582505678528482, + "grad_norm": 0.7694092392921448, + "learning_rate": 5.496092726282538e-06, + "loss": 0.0411, + "step": 17025 + }, + { + "epoch": 0.7593640048100476, + "grad_norm": 14.076139450073242, + "learning_rate": 5.448011555015584e-06, + "loss": 0.1227, + "step": 17050 + }, + { + "epoch": 0.7604774417672472, + "grad_norm": 0.1597975343465805, + "learning_rate": 5.400108438229865e-06, + "loss": 0.0503, + "step": 17075 + }, + { + "epoch": 0.7615908787244466, + "grad_norm": 0.08557901531457901, + "learning_rate": 5.352383962055918e-06, + "loss": 0.0788, + "step": 17100 + }, + { + "epoch": 0.7627043156816461, + "grad_norm": 19.060636520385742, + "learning_rate": 5.304838710438458e-06, + "loss": 0.0935, + "step": 17125 + }, + { + "epoch": 0.7638177526388455, + "grad_norm": 0.024234607815742493, + "learning_rate": 5.2574732651292606e-06, + "loss": 0.0575, + "step": 17150 + }, + { + "epoch": 0.7649311895960451, + "grad_norm": 0.010592158883810043, + "learning_rate": 5.210288205680032e-06, + "loss": 0.0992, + "step": 17175 + }, + { + "epoch": 0.7660446265532446, + "grad_norm": 0.08044129610061646, + "learning_rate": 5.163284109435336e-06, + "loss": 0.1277, + "step": 17200 + }, + { + "epoch": 0.767158063510444, + "grad_norm": 15.679009437561035, + "learning_rate": 5.116461551525503e-06, + "loss": 0.0561, + "step": 17225 + }, + { + "epoch": 0.7682715004676435, + "grad_norm": 0.5942630171775818, + "learning_rate": 5.069821104859607e-06, + "loss": 0.0234, + "step": 17250 + }, + { + "epoch": 0.769384937424843, + "grad_norm": 0.06479264050722122, + "learning_rate": 5.023363340118452e-06, + "loss": 0.1106, + "step": 17275 + }, + { + "epoch": 0.7704983743820425, + "grad_norm": 10.735722541809082, + "learning_rate": 4.977088825747609e-06, + "loss": 0.0826, + "step": 17300 + }, + { + "epoch": 0.771611811339242, + "grad_norm": 0.09754932671785355, + "learning_rate": 4.930998127950424e-06, + "loss": 0.0574, + "step": 17325 + }, + { + "epoch": 0.7727252482964414, + "grad_norm": 1.6428059339523315, + "learning_rate": 4.885091810681118e-06, + "loss": 0.0551, + "step": 17350 + }, + { + "epoch": 0.773838685253641, + "grad_norm": 0.011265415698289871, + "learning_rate": 4.839370435637869e-06, + "loss": 0.0587, + "step": 17375 + }, + { + "epoch": 0.7749521222108404, + "grad_norm": 0.5027729272842407, + "learning_rate": 4.793834562255968e-06, + "loss": 0.0549, + "step": 17400 + }, + { + "epoch": 0.7760655591680399, + "grad_norm": 0.034433938562870026, + "learning_rate": 4.748484747700937e-06, + "loss": 0.0857, + "step": 17425 + }, + { + "epoch": 0.7771789961252394, + "grad_norm": 0.39721280336380005, + "learning_rate": 4.703321546861728e-06, + "loss": 0.0136, + "step": 17450 + }, + { + "epoch": 0.7782924330824389, + "grad_norm": 11.144375801086426, + "learning_rate": 4.658345512343954e-06, + "loss": 0.0919, + "step": 17475 + }, + { + "epoch": 0.7794058700396383, + "grad_norm": 0.03096114657819271, + "learning_rate": 4.613557194463083e-06, + "loss": 0.1601, + "step": 17500 + }, + { + "epoch": 0.7805193069968378, + "grad_norm": 0.29175180196762085, + "learning_rate": 4.5689571412377444e-06, + "loss": 0.1204, + "step": 17525 + }, + { + "epoch": 0.7816327439540374, + "grad_norm": 0.077189140021801, + "learning_rate": 4.524545898382999e-06, + "loss": 0.0422, + "step": 17550 + }, + { + "epoch": 0.7827461809112368, + "grad_norm": 0.04853476583957672, + "learning_rate": 4.4803240093036895e-06, + "loss": 0.0418, + "step": 17575 + }, + { + "epoch": 0.7838596178684363, + "grad_norm": 0.25351813435554504, + "learning_rate": 4.436292015087751e-06, + "loss": 0.1125, + "step": 17600 + }, + { + "epoch": 0.7849730548256357, + "grad_norm": 0.33566001057624817, + "learning_rate": 4.392450454499624e-06, + "loss": 0.1044, + "step": 17625 + }, + { + "epoch": 0.7860864917828353, + "grad_norm": 0.33860525488853455, + "learning_rate": 4.348799863973645e-06, + "loss": 0.0398, + "step": 17650 + }, + { + "epoch": 0.7871999287400347, + "grad_norm": 0.019481919705867767, + "learning_rate": 4.305340777607503e-06, + "loss": 0.114, + "step": 17675 + }, + { + "epoch": 0.7883133656972342, + "grad_norm": 0.05631513521075249, + "learning_rate": 4.262073727155673e-06, + "loss": 0.0805, + "step": 17700 + }, + { + "epoch": 0.7894268026544337, + "grad_norm": 0.23303325474262238, + "learning_rate": 4.218999242022936e-06, + "loss": 0.0584, + "step": 17725 + }, + { + "epoch": 0.7905402396116332, + "grad_norm": 0.4254063665866852, + "learning_rate": 4.176117849257884e-06, + "loss": 0.0253, + "step": 17750 + }, + { + "epoch": 0.7916536765688327, + "grad_norm": 0.07280249893665314, + "learning_rate": 4.133430073546496e-06, + "loss": 0.017, + "step": 17775 + }, + { + "epoch": 0.7927671135260321, + "grad_norm": 0.09632007777690887, + "learning_rate": 4.090936437205683e-06, + "loss": 0.0441, + "step": 17800 + }, + { + "epoch": 0.7938805504832316, + "grad_norm": 0.028888842090964317, + "learning_rate": 4.048637460176925e-06, + "loss": 0.0914, + "step": 17825 + }, + { + "epoch": 0.7949939874404311, + "grad_norm": 0.27641311287879944, + "learning_rate": 4.0065336600198886e-06, + "loss": 0.041, + "step": 17850 + }, + { + "epoch": 0.7961074243976306, + "grad_norm": 0.08862696588039398, + "learning_rate": 3.96462555190613e-06, + "loss": 0.0638, + "step": 17875 + }, + { + "epoch": 0.7972208613548301, + "grad_norm": 0.021214209496974945, + "learning_rate": 3.922913648612738e-06, + "loss": 0.0523, + "step": 17900 + }, + { + "epoch": 0.7983342983120296, + "grad_norm": 0.4044612646102905, + "learning_rate": 3.881398460516106e-06, + "loss": 0.0566, + "step": 17925 + }, + { + "epoch": 0.7994477352692291, + "grad_norm": 0.13831877708435059, + "learning_rate": 3.840080495585659e-06, + "loss": 0.0788, + "step": 17950 + }, + { + "epoch": 0.8005611722264285, + "grad_norm": 0.05166686698794365, + "learning_rate": 3.7989602593776597e-06, + "loss": 0.03, + "step": 17975 + }, + { + "epoch": 0.801674609183628, + "grad_norm": 0.5509428381919861, + "learning_rate": 3.758038255029006e-06, + "loss": 0.0587, + "step": 18000 + }, + { + "epoch": 0.8027880461408275, + "grad_norm": 0.0240898709744215, + "learning_rate": 3.717314983251077e-06, + "loss": 0.0805, + "step": 18025 + }, + { + "epoch": 0.803901483098027, + "grad_norm": 0.01613386906683445, + "learning_rate": 3.6767909423236115e-06, + "loss": 0.0789, + "step": 18050 + }, + { + "epoch": 0.8050149200552265, + "grad_norm": 0.010899515822529793, + "learning_rate": 3.6364666280886194e-06, + "loss": 0.0338, + "step": 18075 + }, + { + "epoch": 0.8061283570124259, + "grad_norm": 0.04010821506381035, + "learning_rate": 3.5963425339442925e-06, + "loss": 0.0343, + "step": 18100 + }, + { + "epoch": 0.8072417939696255, + "grad_norm": 0.034061819314956665, + "learning_rate": 3.5564191508389855e-06, + "loss": 0.0384, + "step": 18125 + }, + { + "epoch": 0.8083552309268249, + "grad_norm": 0.018084630370140076, + "learning_rate": 3.516696967265203e-06, + "loss": 0.0813, + "step": 18150 + }, + { + "epoch": 0.8094686678840244, + "grad_norm": 0.07554034888744354, + "learning_rate": 3.4771764692536135e-06, + "loss": 0.0494, + "step": 18175 + }, + { + "epoch": 0.8105821048412238, + "grad_norm": 0.5343236923217773, + "learning_rate": 3.4378581403671294e-06, + "loss": 0.0743, + "step": 18200 + }, + { + "epoch": 0.8116955417984234, + "grad_norm": 10.600082397460938, + "learning_rate": 3.3987424616949617e-06, + "loss": 0.1109, + "step": 18225 + }, + { + "epoch": 0.8128089787556229, + "grad_norm": 0.37871259450912476, + "learning_rate": 3.3598299118467413e-06, + "loss": 0.0192, + "step": 18250 + }, + { + "epoch": 0.8139224157128223, + "grad_norm": 0.024805864319205284, + "learning_rate": 3.321120966946667e-06, + "loss": 0.0009, + "step": 18275 + }, + { + "epoch": 0.8150358526700219, + "grad_norm": 0.11892875283956528, + "learning_rate": 3.2826161006276913e-06, + "loss": 0.071, + "step": 18300 + }, + { + "epoch": 0.8161492896272213, + "grad_norm": 0.11794514954090118, + "learning_rate": 3.2443157840256957e-06, + "loss": 0.0547, + "step": 18325 + }, + { + "epoch": 0.8172627265844208, + "grad_norm": 0.048415035009384155, + "learning_rate": 3.2062204857737476e-06, + "loss": 0.0293, + "step": 18350 + }, + { + "epoch": 0.8183761635416202, + "grad_norm": 0.09221319109201431, + "learning_rate": 3.1683306719963604e-06, + "loss": 0.0468, + "step": 18375 + }, + { + "epoch": 0.8194896004988198, + "grad_norm": 0.009933177381753922, + "learning_rate": 3.130646806303803e-06, + "loss": 0.0501, + "step": 18400 + }, + { + "epoch": 0.8206030374560193, + "grad_norm": 0.2183195799589157, + "learning_rate": 3.0931693497864023e-06, + "loss": 0.0534, + "step": 18425 + }, + { + "epoch": 0.8217164744132187, + "grad_norm": 0.04401267319917679, + "learning_rate": 3.0558987610089132e-06, + "loss": 0.0671, + "step": 18450 + }, + { + "epoch": 0.8228299113704182, + "grad_norm": 0.10095870494842529, + "learning_rate": 3.0188354960049105e-06, + "loss": 0.0267, + "step": 18475 + }, + { + "epoch": 0.8239433483276177, + "grad_norm": 37.297325134277344, + "learning_rate": 2.9819800082712215e-06, + "loss": 0.0325, + "step": 18500 + }, + { + "epoch": 0.8250567852848172, + "grad_norm": 0.0290220957249403, + "learning_rate": 2.945332748762344e-06, + "loss": 0.0447, + "step": 18525 + }, + { + "epoch": 0.8261702222420166, + "grad_norm": 17.21735382080078, + "learning_rate": 2.908894165884957e-06, + "loss": 0.0511, + "step": 18550 + }, + { + "epoch": 0.8272836591992161, + "grad_norm": 0.03269251063466072, + "learning_rate": 2.872664705492416e-06, + "loss": 0.0893, + "step": 18575 + }, + { + "epoch": 0.8283970961564157, + "grad_norm": 0.0077833011746406555, + "learning_rate": 2.8366448108793233e-06, + "loss": 0.0283, + "step": 18600 + }, + { + "epoch": 0.8295105331136151, + "grad_norm": 0.43284299969673157, + "learning_rate": 2.80083492277607e-06, + "loss": 0.0557, + "step": 18625 + }, + { + "epoch": 0.8306239700708146, + "grad_norm": 0.02338644489645958, + "learning_rate": 2.7652354793434666e-06, + "loss": 0.1057, + "step": 18650 + }, + { + "epoch": 0.831737407028014, + "grad_norm": 0.03380841761827469, + "learning_rate": 2.7298469161673733e-06, + "loss": 0.0813, + "step": 18675 + }, + { + "epoch": 0.8328508439852136, + "grad_norm": 0.13972997665405273, + "learning_rate": 2.6946696662533824e-06, + "loss": 0.06, + "step": 18700 + }, + { + "epoch": 0.833964280942413, + "grad_norm": 0.06724318861961365, + "learning_rate": 2.6597041600214947e-06, + "loss": 0.0262, + "step": 18725 + }, + { + "epoch": 0.8350777178996125, + "grad_norm": 0.022787462919950485, + "learning_rate": 2.6249508253008806e-06, + "loss": 0.1003, + "step": 18750 + }, + { + "epoch": 0.8361911548568121, + "grad_norm": 0.010312095284461975, + "learning_rate": 2.5904100873246195e-06, + "loss": 0.1163, + "step": 18775 + }, + { + "epoch": 0.8373045918140115, + "grad_norm": 3.376255989074707, + "learning_rate": 2.5560823687245282e-06, + "loss": 0.0964, + "step": 18800 + }, + { + "epoch": 0.838418028771211, + "grad_norm": 0.20705439150333405, + "learning_rate": 2.5219680895259612e-06, + "loss": 0.0264, + "step": 18825 + }, + { + "epoch": 0.8395314657284104, + "grad_norm": 0.05201965197920799, + "learning_rate": 2.488067667142682e-06, + "loss": 0.0483, + "step": 18850 + }, + { + "epoch": 0.84064490268561, + "grad_norm": 15.602285385131836, + "learning_rate": 2.454381516371753e-06, + "loss": 0.0855, + "step": 18875 + }, + { + "epoch": 0.8417583396428094, + "grad_norm": 14.771347045898438, + "learning_rate": 2.4209100493884763e-06, + "loss": 0.1803, + "step": 18900 + }, + { + "epoch": 0.8428717766000089, + "grad_norm": 0.04606569930911064, + "learning_rate": 2.3876536757413218e-06, + "loss": 0.0739, + "step": 18925 + }, + { + "epoch": 0.8439852135572083, + "grad_norm": 0.016485804691910744, + "learning_rate": 2.3546128023469383e-06, + "loss": 0.0433, + "step": 18950 + }, + { + "epoch": 0.8450986505144079, + "grad_norm": 0.07802044600248337, + "learning_rate": 2.321787833485163e-06, + "loss": 0.0248, + "step": 18975 + }, + { + "epoch": 0.8462120874716074, + "grad_norm": 0.017175985500216484, + "learning_rate": 2.2891791707940913e-06, + "loss": 0.0322, + "step": 19000 + }, + { + "epoch": 0.8473255244288068, + "grad_norm": 0.31442588567733765, + "learning_rate": 2.2567872132651393e-06, + "loss": 0.0566, + "step": 19025 + }, + { + "epoch": 0.8484389613860063, + "grad_norm": 0.009995149448513985, + "learning_rate": 2.2246123572381718e-06, + "loss": 0.0276, + "step": 19050 + }, + { + "epoch": 0.8495523983432058, + "grad_norm": 0.025844572111964226, + "learning_rate": 2.192654996396659e-06, + "loss": 0.079, + "step": 19075 + }, + { + "epoch": 0.8506658353004053, + "grad_norm": 0.1997177004814148, + "learning_rate": 2.1609155217628584e-06, + "loss": 0.0545, + "step": 19100 + }, + { + "epoch": 0.8517792722576047, + "grad_norm": 0.11774219572544098, + "learning_rate": 2.129394321693021e-06, + "loss": 0.0845, + "step": 19125 + }, + { + "epoch": 0.8528927092148043, + "grad_norm": 0.8494458198547363, + "learning_rate": 2.09809178187264e-06, + "loss": 0.0629, + "step": 19150 + }, + { + "epoch": 0.8540061461720038, + "grad_norm": 16.696989059448242, + "learning_rate": 2.067008285311751e-06, + "loss": 0.0306, + "step": 19175 + }, + { + "epoch": 0.8551195831292032, + "grad_norm": 0.04906664416193962, + "learning_rate": 2.0361442123402164e-06, + "loss": 0.0913, + "step": 19200 + }, + { + "epoch": 0.8562330200864027, + "grad_norm": 13.342795372009277, + "learning_rate": 2.005499940603097e-06, + "loss": 0.1194, + "step": 19225 + }, + { + "epoch": 0.8573464570436022, + "grad_norm": 0.04243914783000946, + "learning_rate": 1.9750758450560116e-06, + "loss": 0.0775, + "step": 19250 + }, + { + "epoch": 0.8584598940008017, + "grad_norm": 0.024273626506328583, + "learning_rate": 1.9448722979605692e-06, + "loss": 0.0922, + "step": 19275 + }, + { + "epoch": 0.8595733309580011, + "grad_norm": 15.679910659790039, + "learning_rate": 1.9148896688797956e-06, + "loss": 0.0433, + "step": 19300 + }, + { + "epoch": 0.8606867679152006, + "grad_norm": 0.013732674531638622, + "learning_rate": 1.8851283246736218e-06, + "loss": 0.0515, + "step": 19325 + }, + { + "epoch": 0.8618002048724002, + "grad_norm": 0.02653571590781212, + "learning_rate": 1.855588629494387e-06, + "loss": 0.0831, + "step": 19350 + }, + { + "epoch": 0.8629136418295996, + "grad_norm": 18.192138671875, + "learning_rate": 1.8262709447824001e-06, + "loss": 0.0354, + "step": 19375 + }, + { + "epoch": 0.8640270787867991, + "grad_norm": 1.8473023176193237, + "learning_rate": 1.7971756292614961e-06, + "loss": 0.0811, + "step": 19400 + }, + { + "epoch": 0.8651405157439985, + "grad_norm": 15.879515647888184, + "learning_rate": 1.7683030389346577e-06, + "loss": 0.0214, + "step": 19425 + }, + { + "epoch": 0.8662539527011981, + "grad_norm": 12.173957824707031, + "learning_rate": 1.739653527079659e-06, + "loss": 0.0837, + "step": 19450 + }, + { + "epoch": 0.8673673896583975, + "grad_norm": 12.342336654663086, + "learning_rate": 1.7112274442447475e-06, + "loss": 0.0446, + "step": 19475 + }, + { + "epoch": 0.868480826615597, + "grad_norm": 0.009152592159807682, + "learning_rate": 1.6830251382443408e-06, + "loss": 0.0659, + "step": 19500 + }, + { + "epoch": 0.8695942635727965, + "grad_norm": 0.06887594610452652, + "learning_rate": 1.655046954154782e-06, + "loss": 0.1522, + "step": 19525 + }, + { + "epoch": 0.870707700529996, + "grad_norm": 0.007866369560360909, + "learning_rate": 1.6272932343101122e-06, + "loss": 0.0102, + "step": 19550 + }, + { + "epoch": 0.8718211374871955, + "grad_norm": 0.30307716131210327, + "learning_rate": 1.599764318297894e-06, + "loss": 0.0528, + "step": 19575 + }, + { + "epoch": 0.8729345744443949, + "grad_norm": 0.057492926716804504, + "learning_rate": 1.5724605429550366e-06, + "loss": 0.0162, + "step": 19600 + }, + { + "epoch": 0.8740480114015945, + "grad_norm": 0.0504818893969059, + "learning_rate": 1.545382242363682e-06, + "loss": 0.0132, + "step": 19625 + }, + { + "epoch": 0.875161448358794, + "grad_norm": 0.007792500779032707, + "learning_rate": 1.5185297478471373e-06, + "loss": 0.0239, + "step": 19650 + }, + { + "epoch": 0.8762748853159934, + "grad_norm": 0.04944828525185585, + "learning_rate": 1.4919033879657852e-06, + "loss": 0.0283, + "step": 19675 + }, + { + "epoch": 0.8773883222731929, + "grad_norm": 0.08414191752672195, + "learning_rate": 1.4655034885130893e-06, + "loss": 0.0719, + "step": 19700 + }, + { + "epoch": 0.8785017592303924, + "grad_norm": 11.17496395111084, + "learning_rate": 1.4393303725115914e-06, + "loss": 0.0667, + "step": 19725 + }, + { + "epoch": 0.8796151961875919, + "grad_norm": 110.51337432861328, + "learning_rate": 1.4133843602089847e-06, + "loss": 0.0893, + "step": 19750 + }, + { + "epoch": 0.8807286331447913, + "grad_norm": 0.24963940680027008, + "learning_rate": 1.387665769074167e-06, + "loss": 0.0753, + "step": 19775 + }, + { + "epoch": 0.8818420701019908, + "grad_norm": 0.39186498522758484, + "learning_rate": 1.362174913793366e-06, + "loss": 0.0504, + "step": 19800 + }, + { + "epoch": 0.8829555070591903, + "grad_norm": 0.00791653897613287, + "learning_rate": 1.3369121062662948e-06, + "loss": 0.0943, + "step": 19825 + }, + { + "epoch": 0.8840689440163898, + "grad_norm": 0.08412867039442062, + "learning_rate": 1.3118776556023338e-06, + "loss": 0.0733, + "step": 19850 + }, + { + "epoch": 0.8851823809735893, + "grad_norm": 90.87469482421875, + "learning_rate": 1.2870718681167427e-06, + "loss": 0.02, + "step": 19875 + }, + { + "epoch": 0.8862958179307887, + "grad_norm": 11.460103988647461, + "learning_rate": 1.2624950473269148e-06, + "loss": 0.0946, + "step": 19900 + }, + { + "epoch": 0.8874092548879883, + "grad_norm": 0.0059192064218223095, + "learning_rate": 1.238147493948665e-06, + "loss": 0.0185, + "step": 19925 + }, + { + "epoch": 0.8885226918451877, + "grad_norm": 0.22380521893501282, + "learning_rate": 1.2140295058925533e-06, + "loss": 0.0008, + "step": 19950 + }, + { + "epoch": 0.8896361288023872, + "grad_norm": 0.2047863006591797, + "learning_rate": 1.1901413782602322e-06, + "loss": 0.0572, + "step": 19975 + }, + { + "epoch": 0.8907495657595866, + "grad_norm": 0.008318849839270115, + "learning_rate": 1.166483403340839e-06, + "loss": 0.0353, + "step": 20000 + }, + { + "epoch": 0.8918630027167862, + "grad_norm": 0.06174292787909508, + "learning_rate": 1.1430558706074212e-06, + "loss": 0.0302, + "step": 20025 + }, + { + "epoch": 0.8929764396739857, + "grad_norm": 0.0325322151184082, + "learning_rate": 1.1198590667134002e-06, + "loss": 0.0282, + "step": 20050 + }, + { + "epoch": 0.8940898766311851, + "grad_norm": 0.09468420594930649, + "learning_rate": 1.096893275489046e-06, + "loss": 0.0445, + "step": 20075 + }, + { + "epoch": 0.8952033135883847, + "grad_norm": 17.444026947021484, + "learning_rate": 1.0741587779380192e-06, + "loss": 0.062, + "step": 20100 + }, + { + "epoch": 0.8963167505455841, + "grad_norm": 0.02685241959989071, + "learning_rate": 1.0516558522339348e-06, + "loss": 0.0764, + "step": 20125 + }, + { + "epoch": 0.8974301875027836, + "grad_norm": 0.0349482037127018, + "learning_rate": 1.029384773716946e-06, + "loss": 0.0352, + "step": 20150 + }, + { + "epoch": 0.898543624459983, + "grad_norm": 0.06942135840654373, + "learning_rate": 1.0073458148903858e-06, + "loss": 0.0864, + "step": 20175 + }, + { + "epoch": 0.8996570614171826, + "grad_norm": 0.1278313398361206, + "learning_rate": 9.85539245417424e-07, + "loss": 0.0696, + "step": 20200 + }, + { + "epoch": 0.9007704983743821, + "grad_norm": 0.14868886768817902, + "learning_rate": 9.639653321177823e-07, + "loss": 0.0709, + "step": 20225 + }, + { + "epoch": 0.9018839353315815, + "grad_norm": 11.872529029846191, + "learning_rate": 9.426243389644463e-07, + "loss": 0.0779, + "step": 20250 + }, + { + "epoch": 0.902997372288781, + "grad_norm": 0.015093258582055569, + "learning_rate": 9.215165270804616e-07, + "loss": 0.1371, + "step": 20275 + }, + { + "epoch": 0.9041108092459805, + "grad_norm": 0.31670305132865906, + "learning_rate": 9.006421547357158e-07, + "loss": 0.0841, + "step": 20300 + }, + { + "epoch": 0.90522424620318, + "grad_norm": 47.879825592041016, + "learning_rate": 8.800014773438015e-07, + "loss": 0.0608, + "step": 20325 + }, + { + "epoch": 0.9063376831603794, + "grad_norm": 11.414122581481934, + "learning_rate": 8.595947474588651e-07, + "loss": 0.1249, + "step": 20350 + }, + { + "epoch": 0.9074511201175789, + "grad_norm": 0.02606755867600441, + "learning_rate": 8.394222147725362e-07, + "loss": 0.0606, + "step": 20375 + }, + { + "epoch": 0.9085645570747785, + "grad_norm": 0.014529196545481682, + "learning_rate": 8.194841261108655e-07, + "loss": 0.0362, + "step": 20400 + }, + { + "epoch": 0.9096779940319779, + "grad_norm": 11.685364723205566, + "learning_rate": 7.997807254313117e-07, + "loss": 0.0832, + "step": 20425 + }, + { + "epoch": 0.9107914309891774, + "grad_norm": 0.1444302648305893, + "learning_rate": 7.803122538197371e-07, + "loss": 0.0661, + "step": 20450 + }, + { + "epoch": 0.9119048679463769, + "grad_norm": 0.026668483391404152, + "learning_rate": 7.610789494874793e-07, + "loss": 0.0679, + "step": 20475 + }, + { + "epoch": 0.9130183049035764, + "grad_norm": 0.5154994130134583, + "learning_rate": 7.420810477684325e-07, + "loss": 0.0381, + "step": 20500 + }, + { + "epoch": 0.9141317418607758, + "grad_norm": 0.7168388366699219, + "learning_rate": 7.233187811161513e-07, + "loss": 0.0598, + "step": 20525 + }, + { + "epoch": 0.9152451788179753, + "grad_norm": 0.0393916517496109, + "learning_rate": 7.047923791010269e-07, + "loss": 0.0228, + "step": 20550 + }, + { + "epoch": 0.9163586157751749, + "grad_norm": 0.11829303205013275, + "learning_rate": 6.865020684074686e-07, + "loss": 0.0124, + "step": 20575 + }, + { + "epoch": 0.9174720527323743, + "grad_norm": 0.14193271100521088, + "learning_rate": 6.684480728311315e-07, + "loss": 0.0468, + "step": 20600 + }, + { + "epoch": 0.9185854896895738, + "grad_norm": 0.0144277885556221, + "learning_rate": 6.506306132761797e-07, + "loss": 0.0809, + "step": 20625 + }, + { + "epoch": 0.9196989266467732, + "grad_norm": 0.4156523644924164, + "learning_rate": 6.330499077525765e-07, + "loss": 0.0279, + "step": 20650 + }, + { + "epoch": 0.9208123636039728, + "grad_norm": 8.571426391601562, + "learning_rate": 6.157061713734269e-07, + "loss": 0.0859, + "step": 20675 + }, + { + "epoch": 0.9219258005611722, + "grad_norm": 0.005042167380452156, + "learning_rate": 5.985996163523422e-07, + "loss": 0.0348, + "step": 20700 + }, + { + "epoch": 0.9230392375183717, + "grad_norm": 0.023421961814165115, + "learning_rate": 5.817304520008371e-07, + "loss": 0.0634, + "step": 20725 + }, + { + "epoch": 0.9241526744755711, + "grad_norm": 0.03484127297997475, + "learning_rate": 5.650988847257743e-07, + "loss": 0.0419, + "step": 20750 + }, + { + "epoch": 0.9252661114327707, + "grad_norm": 0.03696205094456673, + "learning_rate": 5.487051180268443e-07, + "loss": 0.0773, + "step": 20775 + }, + { + "epoch": 0.9263795483899702, + "grad_norm": 0.3316960334777832, + "learning_rate": 5.325493524940628e-07, + "loss": 0.0473, + "step": 20800 + }, + { + "epoch": 0.9274929853471696, + "grad_norm": 0.13512980937957764, + "learning_rate": 5.166317858053261e-07, + "loss": 0.0007, + "step": 20825 + }, + { + "epoch": 0.9286064223043691, + "grad_norm": 0.04074832424521446, + "learning_rate": 5.009526127239883e-07, + "loss": 0.0653, + "step": 20850 + }, + { + "epoch": 0.9297198592615686, + "grad_norm": 76.54032135009766, + "learning_rate": 4.855120250964773e-07, + "loss": 0.0371, + "step": 20875 + }, + { + "epoch": 0.9308332962187681, + "grad_norm": 0.2280624508857727, + "learning_rate": 4.7031021184995585e-07, + "loss": 0.0385, + "step": 20900 + }, + { + "epoch": 0.9319467331759675, + "grad_norm": 0.10697930306196213, + "learning_rate": 4.553473589899926e-07, + "loss": 0.0673, + "step": 20925 + }, + { + "epoch": 0.9330601701331671, + "grad_norm": 0.012013726867735386, + "learning_rate": 4.4062364959830626e-07, + "loss": 0.0798, + "step": 20950 + }, + { + "epoch": 0.9341736070903666, + "grad_norm": 0.011254810728132725, + "learning_rate": 4.2613926383051173e-07, + "loss": 0.0832, + "step": 20975 + }, + { + "epoch": 0.935287044047566, + "grad_norm": 0.25895559787750244, + "learning_rate": 4.1189437891392183e-07, + "loss": 0.054, + "step": 21000 + }, + { + "epoch": 0.9364004810047655, + "grad_norm": 0.27853846549987793, + "learning_rate": 3.9788916914537566e-07, + "loss": 0.0545, + "step": 21025 + }, + { + "epoch": 0.937513917961965, + "grad_norm": 0.04974246025085449, + "learning_rate": 3.8412380588910503e-07, + "loss": 0.0477, + "step": 21050 + }, + { + "epoch": 0.9386273549191645, + "grad_norm": 0.305239200592041, + "learning_rate": 3.7059845757464686e-07, + "loss": 0.0209, + "step": 21075 + }, + { + "epoch": 0.939740791876364, + "grad_norm": 0.027433935552835464, + "learning_rate": 3.573132896947673e-07, + "loss": 0.1036, + "step": 21100 + }, + { + "epoch": 0.9408542288335634, + "grad_norm": 38.015682220458984, + "learning_rate": 3.4426846480345224e-07, + "loss": 0.0664, + "step": 21125 + }, + { + "epoch": 0.941967665790763, + "grad_norm": 17.695173263549805, + "learning_rate": 3.3146414251390423e-07, + "loss": 0.0429, + "step": 21150 + }, + { + "epoch": 0.9430811027479624, + "grad_norm": 1.5212093591690063, + "learning_rate": 3.189004794966022e-07, + "loss": 0.0012, + "step": 21175 + }, + { + "epoch": 0.9441945397051619, + "grad_norm": 13.568913459777832, + "learning_rate": 3.0657762947737815e-07, + "loss": 0.0847, + "step": 21200 + }, + { + "epoch": 0.9453079766623613, + "grad_norm": 0.4400452971458435, + "learning_rate": 2.9449574323552996e-07, + "loss": 0.0533, + "step": 21225 + }, + { + "epoch": 0.9464214136195609, + "grad_norm": 0.020213905721902847, + "learning_rate": 2.826549686019875e-07, + "loss": 0.0222, + "step": 21250 + }, + { + "epoch": 0.9475348505767603, + "grad_norm": 0.015498008579015732, + "learning_rate": 2.710554504575047e-07, + "loss": 0.0243, + "step": 21275 + }, + { + "epoch": 0.9486482875339598, + "grad_norm": 79.78962707519531, + "learning_rate": 2.596973307308726e-07, + "loss": 0.0592, + "step": 21300 + }, + { + "epoch": 0.9497617244911594, + "grad_norm": 0.05232081189751625, + "learning_rate": 2.485807483971958e-07, + "loss": 0.0283, + "step": 21325 + }, + { + "epoch": 0.9508751614483588, + "grad_norm": 0.22005562484264374, + "learning_rate": 2.3770583947619218e-07, + "loss": 0.0032, + "step": 21350 + }, + { + "epoch": 0.9519885984055583, + "grad_norm": 0.5018379092216492, + "learning_rate": 2.270727370305159e-07, + "loss": 0.0475, + "step": 21375 + }, + { + "epoch": 0.9531020353627577, + "grad_norm": 0.07405375689268112, + "learning_rate": 2.1668157116414346e-07, + "loss": 0.0568, + "step": 21400 + }, + { + "epoch": 0.9542154723199573, + "grad_norm": 13.536460876464844, + "learning_rate": 2.0653246902077263e-07, + "loss": 0.0938, + "step": 21425 + }, + { + "epoch": 0.9553289092771567, + "grad_norm": 0.15782712399959564, + "learning_rate": 1.9662555478227484e-07, + "loss": 0.0847, + "step": 21450 + }, + { + "epoch": 0.9564423462343562, + "grad_norm": 0.08711089193820953, + "learning_rate": 1.8696094966716537e-07, + "loss": 0.0362, + "step": 21475 + }, + { + "epoch": 0.9575557831915557, + "grad_norm": 0.024433070793747902, + "learning_rate": 1.7753877192913104e-07, + "loss": 0.0958, + "step": 21500 + }, + { + "epoch": 0.9586692201487552, + "grad_norm": 0.07410872727632523, + "learning_rate": 1.683591368555737e-07, + "loss": 0.0631, + "step": 21525 + }, + { + "epoch": 0.9597826571059547, + "grad_norm": 0.058046724647283554, + "learning_rate": 1.5942215676620908e-07, + "loss": 0.0167, + "step": 21550 + }, + { + "epoch": 0.9608960940631541, + "grad_norm": 0.18262982368469238, + "learning_rate": 1.5072794101168132e-07, + "loss": 0.0668, + "step": 21575 + }, + { + "epoch": 0.9620095310203536, + "grad_norm": 0.008218363858759403, + "learning_rate": 1.4227659597223719e-07, + "loss": 0.0456, + "step": 21600 + }, + { + "epoch": 0.9631229679775531, + "grad_norm": 13.682443618774414, + "learning_rate": 1.340682250564096e-07, + "loss": 0.0474, + "step": 21625 + }, + { + "epoch": 0.9642364049347526, + "grad_norm": 0.005989130586385727, + "learning_rate": 1.2610292869977392e-07, + "loss": 0.0474, + "step": 21650 + }, + { + "epoch": 0.965349841891952, + "grad_norm": 0.015712061896920204, + "learning_rate": 1.1838080436369359e-07, + "loss": 0.0892, + "step": 21675 + }, + { + "epoch": 0.9664632788491515, + "grad_norm": 0.28373441100120544, + "learning_rate": 1.1090194653414543e-07, + "loss": 0.0158, + "step": 21700 + }, + { + "epoch": 0.9675767158063511, + "grad_norm": 0.08839291334152222, + "learning_rate": 1.0366644672056059e-07, + "loss": 0.0526, + "step": 21725 + }, + { + "epoch": 0.9686901527635505, + "grad_norm": 0.43165767192840576, + "learning_rate": 9.667439345469875e-08, + "loss": 0.0482, + "step": 21750 + }, + { + "epoch": 0.96980358972075, + "grad_norm": 0.2389592081308365, + "learning_rate": 8.992587228957128e-08, + "loss": 0.0577, + "step": 21775 + }, + { + "epoch": 0.9709170266779495, + "grad_norm": 0.033785369247198105, + "learning_rate": 8.342096579839087e-08, + "loss": 0.0506, + "step": 21800 + }, + { + "epoch": 0.972030463635149, + "grad_norm": 0.030277960002422333, + "learning_rate": 7.715975357356131e-08, + "loss": 0.0483, + "step": 21825 + }, + { + "epoch": 0.9731439005923485, + "grad_norm": 0.016112027689814568, + "learning_rate": 7.11423122257049e-08, + "loss": 0.0405, + "step": 21850 + }, + { + "epoch": 0.9742573375495479, + "grad_norm": 23.555667877197266, + "learning_rate": 6.536871538272538e-08, + "loss": 0.043, + "step": 21875 + }, + { + "epoch": 0.9753707745067475, + "grad_norm": 0.23819568753242493, + "learning_rate": 5.983903368890653e-08, + "loss": 0.0304, + "step": 21900 + }, + { + "epoch": 0.9764842114639469, + "grad_norm": 17.99365997314453, + "learning_rate": 5.45533348040439e-08, + "loss": 0.0104, + "step": 21925 + }, + { + "epoch": 0.9775976484211464, + "grad_norm": 0.010882832109928131, + "learning_rate": 4.9511683402627683e-08, + "loss": 0.0709, + "step": 21950 + }, + { + "epoch": 0.9787110853783458, + "grad_norm": 3.7435531616210938, + "learning_rate": 4.471414117303896e-08, + "loss": 0.07, + "step": 21975 + }, + { + "epoch": 0.9798245223355454, + "grad_norm": 0.04117279127240181, + "learning_rate": 4.0160766816796925e-08, + "loss": 0.0327, + "step": 22000 + }, + { + "epoch": 0.9809379592927449, + "grad_norm": 0.08280256390571594, + "learning_rate": 3.585161604785503e-08, + "loss": 0.0667, + "step": 22025 + }, + { + "epoch": 0.9820513962499443, + "grad_norm": 0.022770730778574944, + "learning_rate": 3.17867415918971e-08, + "loss": 0.085, + "step": 22050 + }, + { + "epoch": 0.9831648332071438, + "grad_norm": 0.05374102294445038, + "learning_rate": 2.7966193185706702e-08, + "loss": 0.06, + "step": 22075 + }, + { + "epoch": 0.9842782701643433, + "grad_norm": 10.886147499084473, + "learning_rate": 2.4390017576561008e-08, + "loss": 0.0439, + "step": 22100 + }, + { + "epoch": 0.9853917071215428, + "grad_norm": 0.22477033734321594, + "learning_rate": 2.1058258521642337e-08, + "loss": 0.0363, + "step": 22125 + }, + { + "epoch": 0.9865051440787422, + "grad_norm": 0.01279439590871334, + "learning_rate": 1.797095678752303e-08, + "loss": 0.0624, + "step": 22150 + }, + { + "epoch": 0.9876185810359418, + "grad_norm": 0.14910484850406647, + "learning_rate": 1.51281501496503e-08, + "loss": 0.0406, + "step": 22175 + }, + { + "epoch": 0.9887320179931413, + "grad_norm": 13.914883613586426, + "learning_rate": 1.2529873391895486e-08, + "loss": 0.0638, + "step": 22200 + }, + { + "epoch": 0.9898454549503407, + "grad_norm": 0.388899564743042, + "learning_rate": 1.0176158306118844e-08, + "loss": 0.091, + "step": 22225 + }, + { + "epoch": 0.9909588919075402, + "grad_norm": 19.062267303466797, + "learning_rate": 8.06703369178985e-09, + "loss": 0.081, + "step": 22250 + }, + { + "epoch": 0.9920723288647397, + "grad_norm": 0.013312091119587421, + "learning_rate": 6.202525355627487e-09, + "loss": 0.0281, + "step": 22275 + }, + { + "epoch": 0.9931857658219392, + "grad_norm": 0.02743523009121418, + "learning_rate": 4.582656111289385e-09, + "loss": 0.1062, + "step": 22300 + }, + { + "epoch": 0.9942992027791386, + "grad_norm": 0.01560298539698124, + "learning_rate": 3.2074457790876036e-09, + "loss": 0.1068, + "step": 22325 + }, + { + "epoch": 0.9954126397363381, + "grad_norm": 45.84857177734375, + "learning_rate": 2.0769111857510405e-09, + "loss": 0.0872, + "step": 22350 + }, + { + "epoch": 0.9965260766935377, + "grad_norm": 0.06956358253955841, + "learning_rate": 1.1910661642189348e-09, + "loss": 0.1165, + "step": 22375 + }, + { + "epoch": 0.9976395136507371, + "grad_norm": 0.021001625806093216, + "learning_rate": 5.499215534676694e-10, + "loss": 0.0312, + "step": 22400 + }, + { + "epoch": 0.9987529506079366, + "grad_norm": 0.00945682730525732, + "learning_rate": 1.5348519838198628e-10, + "loss": 0.0877, + "step": 22425 + }, + { + "epoch": 0.999866387565136, + "grad_norm": 25.856142044067383, + "learning_rate": 1.761949657286266e-12, + "loss": 0.0302, + "step": 22450 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.9805008477892265, + "eval_auc": 0.9956209792829493, + "eval_f1": 0.9869591765526867, + "eval_loss": 0.10316640883684158, + "eval_precision": 0.9820335893763833, + "eval_recall": 0.9919344233551045, + "eval_runtime": 4675.5639, + "eval_samples_per_second": 6.559, + "eval_steps_per_second": 0.273, + "step": 22453 + } + ], + "logging_steps": 25, + "max_steps": 22453, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.01 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.204190364178894e+19, + "train_batch_size": 12, + "trial_name": null, + "trial_params": null +}