{ "best_metric": null, "best_model_checkpoint": null, "epoch": 39.95179987797437, "eval_steps": 100.0, "global_step": 32760, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.6101281269066504, "grad_norm": 50.660884857177734, "learning_rate": 1.188e-06, "loss": 40.3051, "step": 500 }, { "epoch": 1.0, "eval_cer": 1.1284080132764343, "eval_loss": 14.975272178649902, "eval_runtime": 87.9488, "eval_samples_per_second": 77.773, "eval_steps_per_second": 9.722, "eval_wer": 1.0, "step": 820 }, { "epoch": 1.2196461256863942, "grad_norm": 66.72471618652344, "learning_rate": 2.3880000000000003e-06, "loss": 15.5316, "step": 1000 }, { "epoch": 1.8297742525930445, "grad_norm": 58.356754302978516, "learning_rate": 3.588e-06, "loss": 12.3062, "step": 1500 }, { "epoch": 2.0, "eval_cer": 1.1284376481744902, "eval_loss": 9.953412055969238, "eval_runtime": 86.9725, "eval_samples_per_second": 78.646, "eval_steps_per_second": 9.831, "eval_wer": 1.0, "step": 1640 }, { "epoch": 2.4392922513727884, "grad_norm": 37.668678283691406, "learning_rate": 4.788e-06, "loss": 9.0113, "step": 2000 }, { "epoch": 3.0, "eval_cer": 1.1283783783783783, "eval_loss": 4.837707996368408, "eval_runtime": 90.6105, "eval_samples_per_second": 75.488, "eval_steps_per_second": 9.436, "eval_wer": 1.0, "step": 2460 }, { "epoch": 3.048810250152532, "grad_norm": 10.56650161743164, "learning_rate": 5.988e-06, "loss": 5.8841, "step": 2500 }, { "epoch": 3.6589383770591826, "grad_norm": 1.526659369468689, "learning_rate": 7.1880000000000005e-06, "loss": 3.9478, "step": 3000 }, { "epoch": 4.0, "eval_cer": 1.1284080132764343, "eval_loss": 3.201613426208496, "eval_runtime": 92.4896, "eval_samples_per_second": 73.954, "eval_steps_per_second": 9.244, "eval_wer": 1.0, "step": 3280 }, { "epoch": 4.268456375838926, "grad_norm": 1.9021031856536865, "learning_rate": 8.388e-06, "loss": 3.2121, "step": 3500 }, { "epoch": 4.878584502745577, "grad_norm": 2.086733102798462, "learning_rate": 9.588e-06, "loss": 2.7396, "step": 4000 }, { "epoch": 5.0, "eval_cer": 1.1284376481744902, "eval_loss": 2.4831321239471436, "eval_runtime": 89.7632, "eval_samples_per_second": 76.201, "eval_steps_per_second": 9.525, "eval_wer": 1.0, "step": 4100 }, { "epoch": 5.48810250152532, "grad_norm": 4.460277557373047, "learning_rate": 1.0787999999999999e-05, "loss": 2.2509, "step": 4500 }, { "epoch": 6.0, "eval_cer": 1.1053816974869606, "eval_loss": 0.9197890758514404, "eval_runtime": 86.71, "eval_samples_per_second": 78.884, "eval_steps_per_second": 9.86, "eval_wer": 0.9998538011695907, "step": 4920 }, { "epoch": 6.097620500305064, "grad_norm": 3.249329090118408, "learning_rate": 1.1988000000000001e-05, "loss": 1.2958, "step": 5000 }, { "epoch": 6.707748627211714, "grad_norm": 3.6012656688690186, "learning_rate": 1.3188e-05, "loss": 0.6886, "step": 5500 }, { "epoch": 7.0, "eval_cer": 1.0272641062114747, "eval_loss": 0.3222425580024719, "eval_runtime": 89.592, "eval_samples_per_second": 76.346, "eval_steps_per_second": 9.543, "eval_wer": 0.9991228070175439, "step": 5740 }, { "epoch": 7.317266625991458, "grad_norm": 3.2955262660980225, "learning_rate": 1.4388000000000002e-05, "loss": 0.439, "step": 6000 }, { "epoch": 7.927394752898109, "grad_norm": 25.04276466369629, "learning_rate": 1.5588e-05, "loss": 0.318, "step": 6500 }, { "epoch": 8.0, "eval_cer": 1.0191441441441442, "eval_loss": 0.16009780764579773, "eval_runtime": 88.8354, "eval_samples_per_second": 76.996, "eval_steps_per_second": 9.625, "eval_wer": 0.9991228070175439, "step": 6560 }, { "epoch": 8.536912751677852, "grad_norm": 2.714284896850586, "learning_rate": 1.6788e-05, "loss": 0.255, "step": 7000 }, { "epoch": 9.0, "eval_cer": 1.0176327643432908, "eval_loss": 0.09021047502756119, "eval_runtime": 89.3568, "eval_samples_per_second": 76.547, "eval_steps_per_second": 9.568, "eval_wer": 0.9988304093567252, "step": 7380 }, { "epoch": 9.146430750457595, "grad_norm": 4.7839813232421875, "learning_rate": 1.7988e-05, "loss": 0.2207, "step": 7500 }, { "epoch": 9.756558877364247, "grad_norm": 2.4737026691436768, "learning_rate": 1.9188e-05, "loss": 0.1835, "step": 8000 }, { "epoch": 10.0, "eval_cer": 1.019736842105263, "eval_loss": 0.07156170159578323, "eval_runtime": 88.4936, "eval_samples_per_second": 77.294, "eval_steps_per_second": 9.662, "eval_wer": 0.9991228070175439, "step": 8200 }, { "epoch": 10.36607687614399, "grad_norm": 6.031515121459961, "learning_rate": 2.0388e-05, "loss": 0.1691, "step": 8500 }, { "epoch": 10.97620500305064, "grad_norm": 3.6490681171417236, "learning_rate": 2.1588e-05, "loss": 0.16, "step": 9000 }, { "epoch": 11.0, "eval_cer": 1.017514224751067, "eval_loss": 0.04918373003602028, "eval_runtime": 88.5165, "eval_samples_per_second": 77.274, "eval_steps_per_second": 9.659, "eval_wer": 0.9988304093567252, "step": 9020 }, { "epoch": 11.585723001830385, "grad_norm": 4.373748779296875, "learning_rate": 2.2788000000000003e-05, "loss": 0.1444, "step": 9500 }, { "epoch": 12.0, "eval_cer": 1.016151019440493, "eval_loss": 0.03436645492911339, "eval_runtime": 88.2172, "eval_samples_per_second": 77.536, "eval_steps_per_second": 9.692, "eval_wer": 0.9989766081871345, "step": 9840 }, { "epoch": 12.195241000610128, "grad_norm": 0.14861944317817688, "learning_rate": 2.3988e-05, "loss": 0.1382, "step": 10000 }, { "epoch": 12.805369127516778, "grad_norm": 4.033771991729736, "learning_rate": 2.5188e-05, "loss": 0.1307, "step": 10500 }, { "epoch": 13.0, "eval_cer": 1.0176327643432908, "eval_loss": 0.038768209517002106, "eval_runtime": 95.1356, "eval_samples_per_second": 71.897, "eval_steps_per_second": 8.987, "eval_wer": 0.9988304093567252, "step": 10660 }, { "epoch": 13.414887126296522, "grad_norm": 2.4692184925079346, "learning_rate": 2.63856e-05, "loss": 0.1346, "step": 11000 }, { "epoch": 14.0, "eval_cer": 1.0166548127074442, "eval_loss": 0.0276629775762558, "eval_runtime": 87.0897, "eval_samples_per_second": 78.54, "eval_steps_per_second": 9.817, "eval_wer": 0.9989766081871345, "step": 11480 }, { "epoch": 14.024405125076266, "grad_norm": 3.765444278717041, "learning_rate": 2.7585600000000002e-05, "loss": 0.1287, "step": 11500 }, { "epoch": 14.634533251982916, "grad_norm": 1.3012793064117432, "learning_rate": 2.87856e-05, "loss": 0.1257, "step": 12000 }, { "epoch": 15.0, "eval_cer": 1.0164473684210527, "eval_loss": 0.02670404687523842, "eval_runtime": 92.4139, "eval_samples_per_second": 74.015, "eval_steps_per_second": 9.252, "eval_wer": 0.9988304093567252, "step": 12300 }, { "epoch": 15.24405125076266, "grad_norm": 6.9578728675842285, "learning_rate": 2.99856e-05, "loss": 0.1276, "step": 12500 }, { "epoch": 15.854179377669311, "grad_norm": 3.3747990131378174, "learning_rate": 2.9956013070043084e-05, "loss": 0.1157, "step": 13000 }, { "epoch": 16.0, "eval_cer": 1.0165955429113325, "eval_loss": 0.02758130058646202, "eval_runtime": 89.1614, "eval_samples_per_second": 76.715, "eval_steps_per_second": 9.589, "eval_wer": 0.9989766081871345, "step": 13120 }, { "epoch": 16.463697376449055, "grad_norm": 2.6571855545043945, "learning_rate": 2.9822174136311704e-05, "loss": 0.1104, "step": 13500 }, { "epoch": 17.0, "eval_cer": 1.0171586059743956, "eval_loss": 0.03412469103932381, "eval_runtime": 88.3916, "eval_samples_per_second": 77.383, "eval_steps_per_second": 9.673, "eval_wer": 0.9988304093567252, "step": 13940 }, { "epoch": 17.0732153752288, "grad_norm": 0.15126635134220123, "learning_rate": 2.9599814696946643e-05, "loss": 0.1152, "step": 14000 }, { "epoch": 17.683343502135447, "grad_norm": 1.5255239009857178, "learning_rate": 2.9289379955813937e-05, "loss": 0.1086, "step": 14500 }, { "epoch": 18.0, "eval_cer": 1.016743717401612, "eval_loss": 0.023562652990221977, "eval_runtime": 99.8828, "eval_samples_per_second": 68.48, "eval_steps_per_second": 8.56, "eval_wer": 0.9988304093567252, "step": 14760 }, { "epoch": 18.29286150091519, "grad_norm": 4.014769554138184, "learning_rate": 2.8893091974003682e-05, "loss": 0.1007, "step": 15000 }, { "epoch": 18.902989627821842, "grad_norm": 1.9409995079040527, "learning_rate": 2.84143727148899e-05, "loss": 0.1072, "step": 15500 }, { "epoch": 19.0, "eval_cer": 1.0175438596491229, "eval_loss": 0.030639806762337685, "eval_runtime": 84.6618, "eval_samples_per_second": 80.792, "eval_steps_per_second": 10.099, "eval_wer": 0.9988304093567252, "step": 15580 }, { "epoch": 19.512507626601586, "grad_norm": 13.441529273986816, "learning_rate": 2.7855379321676933e-05, "loss": 0.128, "step": 16000 }, { "epoch": 20.0, "eval_cer": 1.0160324798482694, "eval_loss": 0.024129284545779228, "eval_runtime": 90.1768, "eval_samples_per_second": 75.851, "eval_steps_per_second": 9.481, "eval_wer": 0.9988304093567252, "step": 16400 }, { "epoch": 20.12202562538133, "grad_norm": 2.291337490081787, "learning_rate": 2.7218107759869366e-05, "loss": 0.1062, "step": 16500 }, { "epoch": 20.73215375228798, "grad_norm": 4.577757835388184, "learning_rate": 2.650742754426605e-05, "loss": 0.0987, "step": 17000 }, { "epoch": 21.0, "eval_cer": 1.0165066382171646, "eval_loss": 0.02422538958489895, "eval_runtime": 97.4522, "eval_samples_per_second": 70.188, "eval_steps_per_second": 8.774, "eval_wer": 0.9988304093567252, "step": 17220 }, { "epoch": 21.341671751067725, "grad_norm": 6.208291530609131, "learning_rate": 2.5727608573195923e-05, "loss": 0.1035, "step": 17500 }, { "epoch": 21.951799877974373, "grad_norm": 9.813406944274902, "learning_rate": 2.4883336143432908e-05, "loss": 0.1031, "step": 18000 }, { "epoch": 22.0, "eval_cer": 1.0159435751541015, "eval_loss": 0.02291141264140606, "eval_runtime": 85.7962, "eval_samples_per_second": 79.724, "eval_steps_per_second": 9.965, "eval_wer": 0.9989766081871345, "step": 18040 }, { "epoch": 22.561317876754117, "grad_norm": 0.14426590502262115, "learning_rate": 2.3979682800065307e-05, "loss": 0.0903, "step": 18500 }, { "epoch": 23.0, "eval_cer": 1.016239924134661, "eval_loss": 0.02606978453695774, "eval_runtime": 103.6933, "eval_samples_per_second": 65.964, "eval_steps_per_second": 8.245, "eval_wer": 0.9988304093567252, "step": 18860 }, { "epoch": 23.17083587553386, "grad_norm": 33.88288497924805, "learning_rate": 2.3024043139715204e-05, "loss": 0.0964, "step": 19000 }, { "epoch": 23.780964002440513, "grad_norm": 2.688063859939575, "learning_rate": 2.2018330526045242e-05, "loss": 0.0895, "step": 19500 }, { "epoch": 24.0, "eval_cer": 1.0176920341394025, "eval_loss": 0.03217785060405731, "eval_runtime": 89.5142, "eval_samples_per_second": 76.412, "eval_steps_per_second": 9.552, "eval_wer": 0.9988304093567252, "step": 19680 }, { "epoch": 24.390482001220256, "grad_norm": 1.0760211944580078, "learning_rate": 2.0970450483020733e-05, "loss": 0.0844, "step": 20000 }, { "epoch": 25.0, "grad_norm": 0.8288090229034424, "learning_rate": 1.9886698867971603e-05, "loss": 0.0835, "step": 20500 }, { "epoch": 25.0, "eval_cer": 1.0152027027027026, "eval_loss": 0.018799621611833572, "eval_runtime": 88.0548, "eval_samples_per_second": 77.679, "eval_steps_per_second": 9.71, "eval_wer": 0.9988304093567252, "step": 20500 }, { "epoch": 25.610128126906652, "grad_norm": 2.2765700817108154, "learning_rate": 1.877358706127469e-05, "loss": 0.0744, "step": 21000 }, { "epoch": 26.0, "eval_cer": 1.0149063537221432, "eval_loss": 0.01787102408707142, "eval_runtime": 95.8003, "eval_samples_per_second": 71.399, "eval_steps_per_second": 8.925, "eval_wer": 0.9988304093567252, "step": 21320 }, { "epoch": 26.219646125686396, "grad_norm": 2.9212796688079834, "learning_rate": 1.7637802844774755e-05, "loss": 0.0738, "step": 21500 }, { "epoch": 26.829774252593044, "grad_norm": 3.6706600189208984, "learning_rate": 1.6486170220352805e-05, "loss": 0.0728, "step": 22000 }, { "epoch": 27.0, "eval_cer": 1.014521100047416, "eval_loss": 0.010682709515094757, "eval_runtime": 90.6475, "eval_samples_per_second": 75.457, "eval_steps_per_second": 9.432, "eval_wer": 0.9988304093567252, "step": 22140 }, { "epoch": 27.439292251372787, "grad_norm": 1.4717997312545776, "learning_rate": 1.5325608410059234e-05, "loss": 0.0704, "step": 22500 }, { "epoch": 28.0, "eval_cer": 1.0149063537221432, "eval_loss": 0.016120394691824913, "eval_runtime": 91.4543, "eval_samples_per_second": 74.791, "eval_steps_per_second": 9.349, "eval_wer": 0.9988304093567252, "step": 22960 }, { "epoch": 28.04881025015253, "grad_norm": 0.24731288850307465, "learning_rate": 1.4163090284146517e-05, "loss": 0.0651, "step": 23000 }, { "epoch": 28.658938377059183, "grad_norm": 1.4096115827560425, "learning_rate": 1.3007905796568247e-05, "loss": 0.068, "step": 23500 }, { "epoch": 29.0, "eval_cer": 1.0149656235182551, "eval_loss": 0.014005111530423164, "eval_runtime": 85.3622, "eval_samples_per_second": 80.129, "eval_steps_per_second": 10.016, "eval_wer": 0.9988304093567252, "step": 23780 }, { "epoch": 29.268456375838927, "grad_norm": 1.2507057189941406, "learning_rate": 1.1862367835867989e-05, "loss": 0.062, "step": 24000 }, { "epoch": 29.878584502745575, "grad_norm": 0.42576301097869873, "learning_rate": 1.073568135019168e-05, "loss": 0.0635, "step": 24500 }, { "epoch": 30.0, "eval_cer": 1.0147878141299194, "eval_loss": 0.017921432852745056, "eval_runtime": 86.6261, "eval_samples_per_second": 78.96, "eval_steps_per_second": 9.87, "eval_wer": 0.9988304093567252, "step": 24600 }, { "epoch": 30.48810250152532, "grad_norm": 1.0933780670166016, "learning_rate": 9.634615680568962e-06, "loss": 0.0606, "step": 25000 }, { "epoch": 31.0, "eval_cer": 1.0146100047415836, "eval_loss": 0.016976628452539444, "eval_runtime": 85.312, "eval_samples_per_second": 80.176, "eval_steps_per_second": 10.022, "eval_wer": 0.9988304093567252, "step": 25420 }, { "epoch": 31.097620500305062, "grad_norm": 2.098126173019409, "learning_rate": 8.56578623342252e-06, "loss": 0.0588, "step": 25500 }, { "epoch": 31.707748627211714, "grad_norm": 0.9397180080413818, "learning_rate": 7.535614733981355e-06, "loss": 0.0549, "step": 26000 }, { "epoch": 32.0, "eval_cer": 1.0146100047415836, "eval_loss": 0.012712378054857254, "eval_runtime": 78.6951, "eval_samples_per_second": 86.918, "eval_steps_per_second": 10.865, "eval_wer": 0.9988304093567252, "step": 26240 }, { "epoch": 32.31726662599146, "grad_norm": 3.3270745277404785, "learning_rate": 6.550290643366546e-06, "loss": 0.0545, "step": 26500 }, { "epoch": 32.92739475289811, "grad_norm": 1.39926016330719, "learning_rate": 5.615733971162722e-06, "loss": 0.0557, "step": 27000 }, { "epoch": 33.0, "eval_cer": 1.0145803698435278, "eval_loss": 0.011217311024665833, "eval_runtime": 76.5204, "eval_samples_per_second": 89.388, "eval_steps_per_second": 11.173, "eval_wer": 0.9988304093567252, "step": 27060 }, { "epoch": 33.53691275167785, "grad_norm": 0.6240960955619812, "learning_rate": 4.737559706904321e-06, "loss": 0.0525, "step": 27500 }, { "epoch": 34.0, "eval_cer": 1.0145507349454719, "eval_loss": 0.013999322429299355, "eval_runtime": 76.0714, "eval_samples_per_second": 89.915, "eval_steps_per_second": 11.239, "eval_wer": 0.9988304093567252, "step": 27880 }, { "epoch": 34.1464307504576, "grad_norm": 0.5480217933654785, "learning_rate": 3.924180691546633e-06, "loss": 0.0509, "step": 28000 }, { "epoch": 34.75655887736425, "grad_norm": 0.09108582139015198, "learning_rate": 3.1739539781329047e-06, "loss": 0.0478, "step": 28500 }, { "epoch": 35.0, "eval_cer": 1.0145507349454719, "eval_loss": 0.012541352771222591, "eval_runtime": 75.8184, "eval_samples_per_second": 90.216, "eval_steps_per_second": 11.277, "eval_wer": 0.9988304093567252, "step": 28700 }, { "epoch": 35.36607687614399, "grad_norm": 5.189497470855713, "learning_rate": 2.4947803389966218e-06, "loss": 0.051, "step": 29000 }, { "epoch": 35.97620500305064, "grad_norm": 1.7753878831863403, "learning_rate": 1.8907403751213792e-06, "loss": 0.0475, "step": 29500 }, { "epoch": 36.0, "eval_cer": 1.0145803698435278, "eval_loss": 0.012446345761418343, "eval_runtime": 75.8714, "eval_samples_per_second": 90.153, "eval_steps_per_second": 11.269, "eval_wer": 0.9988304093567252, "step": 29520 }, { "epoch": 36.58572300183038, "grad_norm": 1.131529688835144, "learning_rate": 1.3654632704576153e-06, "loss": 0.0455, "step": 30000 }, { "epoch": 37.0, "eval_cer": 1.0141951161688003, "eval_loss": 0.011377622373402119, "eval_runtime": 76.0985, "eval_samples_per_second": 89.884, "eval_steps_per_second": 11.235, "eval_wer": 0.9988304093567252, "step": 30340 }, { "epoch": 37.195241000610125, "grad_norm": 0.012475825846195221, "learning_rate": 9.229080945215807e-07, "loss": 0.0502, "step": 30500 }, { "epoch": 37.80536912751678, "grad_norm": 0.05113031342625618, "learning_rate": 5.639609229205172e-07, "loss": 0.0444, "step": 31000 }, { "epoch": 38.0, "eval_cer": 1.0142247510668563, "eval_loss": 0.011828911490738392, "eval_runtime": 94.6786, "eval_samples_per_second": 72.244, "eval_steps_per_second": 9.031, "eval_wer": 0.9988304093567252, "step": 31160 }, { "epoch": 38.41488712629652, "grad_norm": 2.288602113723755, "learning_rate": 2.9174814715380303e-07, "loss": 0.0476, "step": 31500 }, { "epoch": 39.0, "eval_cer": 1.0141951161688003, "eval_loss": 0.011857305653393269, "eval_runtime": 74.477, "eval_samples_per_second": 91.84, "eval_steps_per_second": 11.48, "eval_wer": 0.9988304093567252, "step": 31980 }, { "epoch": 39.024405125076264, "grad_norm": 1.6855770349502563, "learning_rate": 1.0790527198271116e-07, "loss": 0.0451, "step": 32000 }, { "epoch": 39.634533251982916, "grad_norm": 2.309912919998169, "learning_rate": 1.3536859442666582e-08, "loss": 0.0464, "step": 32500 }, { "epoch": 39.95179987797437, "eval_cer": 1.0143432906590801, "eval_loss": 0.011460067704319954, "eval_runtime": 89.8642, "eval_samples_per_second": 76.115, "eval_steps_per_second": 9.514, "eval_wer": 0.9988304093567252, "step": 32760 }, { "epoch": 39.95179987797437, "step": 32760, "total_flos": 2.3262081232451936e+19, "train_loss": 1.5701773451070355, "train_runtime": 35107.1781, "train_samples_per_second": 59.748, "train_steps_per_second": 0.933 } ], "logging_steps": 500, "max_steps": 32760, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.3262081232451936e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }