{ "best_metric": null, "best_model_checkpoint": null, "epoch": 39.95179987797437, "eval_steps": 100.0, "global_step": 32760, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.6101281269066504, "grad_norm": 55.342838287353516, "learning_rate": 1.1904e-06, "loss": 29.232, "step": 500 }, { "epoch": 1.0, "eval_cer": 1.1283783783783783, "eval_loss": 11.350740432739258, "eval_runtime": 78.3217, "eval_samples_per_second": 87.332, "eval_steps_per_second": 10.917, "eval_wer": 1.0, "step": 820 }, { "epoch": 1.2196461256863942, "grad_norm": 47.82231903076172, "learning_rate": 2.3880000000000003e-06, "loss": 11.653, "step": 1000 }, { "epoch": 1.8297742525930445, "grad_norm": 40.30710220336914, "learning_rate": 3.588e-06, "loss": 9.4623, "step": 1500 }, { "epoch": 2.0, "eval_cer": 1.1284080132764343, "eval_loss": 7.82855224609375, "eval_runtime": 75.8772, "eval_samples_per_second": 90.146, "eval_steps_per_second": 11.268, "eval_wer": 1.0, "step": 1640 }, { "epoch": 2.4392922513727884, "grad_norm": 20.09738540649414, "learning_rate": 4.788e-06, "loss": 7.1817, "step": 2000 }, { "epoch": 3.0, "eval_cer": 1.1284080132764343, "eval_loss": 4.291374683380127, "eval_runtime": 70.7133, "eval_samples_per_second": 96.729, "eval_steps_per_second": 12.091, "eval_wer": 1.0, "step": 2460 }, { "epoch": 3.048810250152532, "grad_norm": 11.131582260131836, "learning_rate": 5.988e-06, "loss": 4.9943, "step": 2500 }, { "epoch": 3.6589383770591826, "grad_norm": 3.2968924045562744, "learning_rate": 7.1880000000000005e-06, "loss": 3.6746, "step": 3000 }, { "epoch": 4.0, "eval_cer": 1.1284080132764343, "eval_loss": 2.9958813190460205, "eval_runtime": 70.8535, "eval_samples_per_second": 96.537, "eval_steps_per_second": 12.067, "eval_wer": 1.0, "step": 3280 }, { "epoch": 4.268456375838926, "grad_norm": 1.5761828422546387, "learning_rate": 8.388e-06, "loss": 3.0354, "step": 3500 }, { "epoch": 4.878584502745577, "grad_norm": 2.042921543121338, "learning_rate": 9.588e-06, "loss": 2.6344, "step": 4000 }, { "epoch": 5.0, "eval_cer": 1.1284376481744902, "eval_loss": 2.379577159881592, "eval_runtime": 71.7131, "eval_samples_per_second": 95.38, "eval_steps_per_second": 11.923, "eval_wer": 1.0, "step": 4100 }, { "epoch": 5.48810250152532, "grad_norm": 4.554098606109619, "learning_rate": 1.0787999999999999e-05, "loss": 2.224, "step": 4500 }, { "epoch": 6.0, "eval_cer": 1.0735538169748695, "eval_loss": 1.1762758493423462, "eval_runtime": 74.8637, "eval_samples_per_second": 91.366, "eval_steps_per_second": 11.421, "eval_wer": 1.0, "step": 4920 }, { "epoch": 6.097620500305064, "grad_norm": 3.2407827377319336, "learning_rate": 1.1988000000000001e-05, "loss": 1.5426, "step": 5000 }, { "epoch": 6.707748627211714, "grad_norm": 8.200541496276855, "learning_rate": 1.3188e-05, "loss": 0.8482, "step": 5500 }, { "epoch": 7.0, "eval_cer": 0.9927098150782361, "eval_loss": 0.46800941228866577, "eval_runtime": 71.6177, "eval_samples_per_second": 95.507, "eval_steps_per_second": 11.938, "eval_wer": 0.9998538011695907, "step": 5740 }, { "epoch": 7.317266625991458, "grad_norm": 3.0007307529449463, "learning_rate": 1.4388000000000002e-05, "loss": 0.5869, "step": 6000 }, { "epoch": 7.927394752898109, "grad_norm": 4.457747459411621, "learning_rate": 1.5588e-05, "loss": 0.4357, "step": 6500 }, { "epoch": 8.0, "eval_cer": 0.9734471313418682, "eval_loss": 0.27095767855644226, "eval_runtime": 76.0314, "eval_samples_per_second": 89.963, "eval_steps_per_second": 11.245, "eval_wer": 0.9992690058479532, "step": 6560 }, { "epoch": 8.536912751677852, "grad_norm": 6.115583419799805, "learning_rate": 1.6788e-05, "loss": 0.3568, "step": 7000 }, { "epoch": 9.0, "eval_cer": 1.0188477951635846, "eval_loss": 0.1256408840417862, "eval_runtime": 71.254, "eval_samples_per_second": 95.995, "eval_steps_per_second": 11.999, "eval_wer": 0.999561403508772, "step": 7380 }, { "epoch": 9.146430750457595, "grad_norm": 4.245608806610107, "learning_rate": 1.7988e-05, "loss": 0.2776, "step": 7500 }, { "epoch": 9.756558877364247, "grad_norm": 5.325539588928223, "learning_rate": 1.9188e-05, "loss": 0.2374, "step": 8000 }, { "epoch": 10.0, "eval_cer": 1.0219001896633475, "eval_loss": 0.14763644337654114, "eval_runtime": 73.2557, "eval_samples_per_second": 93.372, "eval_steps_per_second": 11.671, "eval_wer": 0.9989766081871345, "step": 8200 }, { "epoch": 10.36607687614399, "grad_norm": 5.118485450744629, "learning_rate": 2.0388e-05, "loss": 0.2199, "step": 8500 }, { "epoch": 10.97620500305064, "grad_norm": 7.805136203765869, "learning_rate": 2.1588e-05, "loss": 0.1941, "step": 9000 }, { "epoch": 11.0, "eval_cer": 1.018669985775249, "eval_loss": 0.059657976031303406, "eval_runtime": 72.2722, "eval_samples_per_second": 94.642, "eval_steps_per_second": 11.83, "eval_wer": 0.9989766081871345, "step": 9020 }, { "epoch": 11.585723001830385, "grad_norm": 8.909073829650879, "learning_rate": 2.2788000000000003e-05, "loss": 0.1829, "step": 9500 }, { "epoch": 12.0, "eval_cer": 1.0188477951635846, "eval_loss": 0.046978700906038284, "eval_runtime": 70.6222, "eval_samples_per_second": 96.853, "eval_steps_per_second": 12.107, "eval_wer": 0.9989766081871345, "step": 9840 }, { "epoch": 12.195241000610128, "grad_norm": 2.5103747844696045, "learning_rate": 2.3988e-05, "loss": 0.1802, "step": 10000 }, { "epoch": 12.805369127516778, "grad_norm": 3.005159378051758, "learning_rate": 2.5188e-05, "loss": 0.1701, "step": 10500 }, { "epoch": 13.0, "eval_cer": 1.018136557610242, "eval_loss": 0.05100085958838463, "eval_runtime": 70.6009, "eval_samples_per_second": 96.883, "eval_steps_per_second": 12.11, "eval_wer": 0.9989766081871345, "step": 10660 }, { "epoch": 13.414887126296522, "grad_norm": 8.558382987976074, "learning_rate": 2.6388000000000002e-05, "loss": 0.1603, "step": 11000 }, { "epoch": 14.0, "eval_cer": 1.0192626837363679, "eval_loss": 0.037522342056035995, "eval_runtime": 70.9174, "eval_samples_per_second": 96.45, "eval_steps_per_second": 12.056, "eval_wer": 0.9988304093567252, "step": 11480 }, { "epoch": 14.024405125076266, "grad_norm": 0.5282774567604065, "learning_rate": 2.7588e-05, "loss": 0.1579, "step": 11500 }, { "epoch": 14.634533251982916, "grad_norm": 6.527963161468506, "learning_rate": 2.8788e-05, "loss": 0.1564, "step": 12000 }, { "epoch": 15.0, "eval_cer": 1.0064011379800855, "eval_loss": 0.0865325778722763, "eval_runtime": 72.0213, "eval_samples_per_second": 94.972, "eval_steps_per_second": 11.871, "eval_wer": 0.9991228070175439, "step": 12300 }, { "epoch": 15.24405125076266, "grad_norm": 3.7430195808410645, "learning_rate": 2.9988e-05, "loss": 0.1632, "step": 12500 }, { "epoch": 15.854179377669311, "grad_norm": 3.735182762145996, "learning_rate": 2.9956013070043084e-05, "loss": 0.1555, "step": 13000 }, { "epoch": 16.0, "eval_cer": 1.0171882408724513, "eval_loss": 0.0239888783544302, "eval_runtime": 79.7324, "eval_samples_per_second": 85.787, "eval_steps_per_second": 10.723, "eval_wer": 0.9989766081871345, "step": 13120 }, { "epoch": 16.463697376449055, "grad_norm": 2.248210906982422, "learning_rate": 2.9822174136311704e-05, "loss": 0.1446, "step": 13500 }, { "epoch": 17.0, "eval_cer": 1.0184329065908013, "eval_loss": 0.03877296671271324, "eval_runtime": 70.4641, "eval_samples_per_second": 97.071, "eval_steps_per_second": 12.134, "eval_wer": 0.9989766081871345, "step": 13940 }, { "epoch": 17.0732153752288, "grad_norm": 1.9990071058273315, "learning_rate": 2.9599280835811145e-05, "loss": 0.1432, "step": 14000 }, { "epoch": 17.683343502135447, "grad_norm": 1.6087830066680908, "learning_rate": 2.9289379955813937e-05, "loss": 0.1374, "step": 14500 }, { "epoch": 18.0, "eval_cer": 1.0229077761972498, "eval_loss": 0.07236260920763016, "eval_runtime": 72.8414, "eval_samples_per_second": 93.903, "eval_steps_per_second": 11.738, "eval_wer": 0.9991228070175439, "step": 14760 }, { "epoch": 18.29286150091519, "grad_norm": 1.2661323547363281, "learning_rate": 2.8893091974003682e-05, "loss": 0.1271, "step": 15000 }, { "epoch": 18.902989627821842, "grad_norm": 2.638475179672241, "learning_rate": 2.841333172308954e-05, "loss": 0.1358, "step": 15500 }, { "epoch": 19.0, "eval_cer": 1.016802987197724, "eval_loss": 0.029252657666802406, "eval_runtime": 79.4655, "eval_samples_per_second": 86.075, "eval_steps_per_second": 10.759, "eval_wer": 0.9988304093567252, "step": 15580 }, { "epoch": 19.512507626601586, "grad_norm": 3.171802043914795, "learning_rate": 2.785298169149414e-05, "loss": 0.126, "step": 16000 }, { "epoch": 20.0, "eval_cer": 1.0173364153627311, "eval_loss": 0.02503075823187828, "eval_runtime": 82.6447, "eval_samples_per_second": 82.764, "eval_steps_per_second": 10.345, "eval_wer": 0.9989766081871345, "step": 16400 }, { "epoch": 20.12202562538133, "grad_norm": 2.895329236984253, "learning_rate": 2.7215408565964914e-05, "loss": 0.1259, "step": 16500 }, { "epoch": 20.73215375228798, "grad_norm": 8.28530502319336, "learning_rate": 2.650444300389672e-05, "loss": 0.1238, "step": 17000 }, { "epoch": 21.0, "eval_cer": 1.017366050260787, "eval_loss": 0.03371915966272354, "eval_runtime": 80.9247, "eval_samples_per_second": 84.523, "eval_steps_per_second": 10.565, "eval_wer": 0.9991228070175439, "step": 17220 }, { "epoch": 21.341671751067725, "grad_norm": 10.950956344604492, "learning_rate": 2.5724356618032884e-05, "loss": 0.1193, "step": 17500 }, { "epoch": 21.951799877974373, "grad_norm": 2.780343770980835, "learning_rate": 2.4879836311824927e-05, "loss": 0.1106, "step": 18000 }, { "epoch": 22.0, "eval_cer": 1.0159139402560455, "eval_loss": 0.018149759620428085, "eval_runtime": 71.3853, "eval_samples_per_second": 95.818, "eval_steps_per_second": 11.977, "eval_wer": 0.9988304093567252, "step": 18040 }, { "epoch": 22.561317876754117, "grad_norm": 1.4264533519744873, "learning_rate": 2.397595611964874e-05, "loss": 0.1124, "step": 18500 }, { "epoch": 23.0, "eval_cer": 1.0165066382171646, "eval_loss": 0.019898999482393265, "eval_runtime": 81.6329, "eval_samples_per_second": 83.79, "eval_steps_per_second": 10.474, "eval_wer": 0.9989766081871345, "step": 18860 }, { "epoch": 23.17083587553386, "grad_norm": 3.757054328918457, "learning_rate": 2.302011238680703e-05, "loss": 0.1079, "step": 19000 }, { "epoch": 23.780964002440513, "grad_norm": 6.899264812469482, "learning_rate": 2.2014218885552525e-05, "loss": 0.104, "step": 19500 }, { "epoch": 24.0, "eval_cer": 1.015054528212423, "eval_loss": 0.01506368163973093, "eval_runtime": 72.1117, "eval_samples_per_second": 94.853, "eval_steps_per_second": 11.857, "eval_wer": 0.9989766081871345, "step": 19680 }, { "epoch": 24.390482001220256, "grad_norm": 1.1912193298339844, "learning_rate": 2.096618265844089e-05, "loss": 0.1025, "step": 20000 }, { "epoch": 25.0, "grad_norm": 2.540283679962158, "learning_rate": 1.988230050118496e-05, "loss": 0.1039, "step": 20500 }, { "epoch": 25.0, "eval_cer": 1.0149359886201992, "eval_loss": 0.013627970591187477, "eval_runtime": 73.5874, "eval_samples_per_second": 92.951, "eval_steps_per_second": 11.619, "eval_wer": 0.9989766081871345, "step": 20500 }, { "epoch": 25.610128126906652, "grad_norm": 1.817094326019287, "learning_rate": 1.876908457848333e-05, "loss": 0.094, "step": 21000 }, { "epoch": 26.0, "eval_cer": 1.0147878141299194, "eval_loss": 0.013174526393413544, "eval_runtime": 78.6671, "eval_samples_per_second": 86.949, "eval_steps_per_second": 10.869, "eval_wer": 0.9988304093567252, "step": 21320 }, { "epoch": 26.219646125686396, "grad_norm": 1.6206278800964355, "learning_rate": 1.7633223297728993e-05, "loss": 0.0919, "step": 21500 }, { "epoch": 26.829774252593044, "grad_norm": 0.5614987015724182, "learning_rate": 1.6481541123819273e-05, "loss": 0.0921, "step": 22000 }, { "epoch": 27.0, "eval_cer": 1.0157954006638217, "eval_loss": 0.017084894701838493, "eval_runtime": 76.9298, "eval_samples_per_second": 88.912, "eval_steps_per_second": 11.114, "eval_wer": 0.9989766081871345, "step": 22140 }, { "epoch": 27.439292251372787, "grad_norm": 1.8238394260406494, "learning_rate": 1.532095757650705e-05, "loss": 0.0832, "step": 22500 }, { "epoch": 28.0, "eval_cer": 1.0146396396396395, "eval_loss": 0.012186683714389801, "eval_runtime": 72.328, "eval_samples_per_second": 94.569, "eval_steps_per_second": 11.821, "eval_wer": 0.9989766081871345, "step": 22960 }, { "epoch": 28.04881025015253, "grad_norm": 3.1914637088775635, "learning_rate": 1.4160767960306099e-05, "loss": 0.0899, "step": 23000 }, { "epoch": 28.658938377059183, "grad_norm": 1.6626567840576172, "learning_rate": 1.300329518493389e-05, "loss": 0.08, "step": 23500 }, { "epoch": 29.0, "eval_cer": 1.015054528212423, "eval_loss": 0.014703178778290749, "eval_runtime": 69.3866, "eval_samples_per_second": 98.578, "eval_steps_per_second": 12.322, "eval_wer": 0.9988304093567252, "step": 23780 }, { "epoch": 29.268456375838927, "grad_norm": 1.1348814964294434, "learning_rate": 1.1857818981811845e-05, "loss": 0.0822, "step": 24000 }, { "epoch": 29.878584502745575, "grad_norm": 0.9157238602638245, "learning_rate": 1.0731221584071209e-05, "loss": 0.0797, "step": 24500 }, { "epoch": 30.0, "eval_cer": 1.0146692745376955, "eval_loss": 0.01217756699770689, "eval_runtime": 70.4102, "eval_samples_per_second": 97.145, "eval_steps_per_second": 12.143, "eval_wer": 0.9989766081871345, "step": 24600 }, { "epoch": 30.48810250152532, "grad_norm": 2.62127947807312, "learning_rate": 9.630271797484814e-06, "loss": 0.0775, "step": 25000 }, { "epoch": 31.0, "eval_cer": 1.015054528212423, "eval_loss": 0.013129099272191525, "eval_runtime": 69.5638, "eval_samples_per_second": 98.327, "eval_steps_per_second": 12.291, "eval_wer": 0.9989766081871345, "step": 25420 }, { "epoch": 31.097620500305062, "grad_norm": 1.8141522407531738, "learning_rate": 8.561584332228596e-06, "loss": 0.071, "step": 25500 }, { "epoch": 31.707748627211714, "grad_norm": 0.32249507308006287, "learning_rate": 7.533597307465705e-06, "loss": 0.0675, "step": 26000 }, { "epoch": 32.0, "eval_cer": 1.0148174490279753, "eval_loss": 0.01305685006082058, "eval_runtime": 70.307, "eval_samples_per_second": 97.288, "eval_steps_per_second": 12.161, "eval_wer": 0.9991228070175439, "step": 26240 }, { "epoch": 32.31726662599146, "grad_norm": 0.06660401076078415, "learning_rate": 6.550290643366546e-06, "loss": 0.07, "step": 26500 }, { "epoch": 32.92739475289811, "grad_norm": 0.08903522789478302, "learning_rate": 5.617548632301114e-06, "loss": 0.0676, "step": 27000 }, { "epoch": 33.0, "eval_cer": 1.0146100047415836, "eval_loss": 0.017076797783374786, "eval_runtime": 71.0362, "eval_samples_per_second": 96.289, "eval_steps_per_second": 12.036, "eval_wer": 0.9988304093567252, "step": 27060 }, { "epoch": 33.53691275167785, "grad_norm": 4.330328464508057, "learning_rate": 4.739256219207167e-06, "loss": 0.069, "step": 27500 }, { "epoch": 34.0, "eval_cer": 1.0145803698435278, "eval_loss": 0.014736202545464039, "eval_runtime": 73.5978, "eval_samples_per_second": 92.938, "eval_steps_per_second": 11.617, "eval_wer": 0.9991228070175439, "step": 27880 }, { "epoch": 34.1464307504576, "grad_norm": 0.1510840207338333, "learning_rate": 3.922612254686006e-06, "loss": 0.0629, "step": 28000 }, { "epoch": 34.75655887736425, "grad_norm": 0.51495361328125, "learning_rate": 3.1725232868909293e-06, "loss": 0.0588, "step": 28500 }, { "epoch": 35.0, "eval_cer": 1.0146396396396395, "eval_loss": 0.014013656415045261, "eval_runtime": 73.9594, "eval_samples_per_second": 92.483, "eval_steps_per_second": 11.56, "eval_wer": 0.9989766081871345, "step": 28700 }, { "epoch": 35.36607687614399, "grad_norm": 2.207364559173584, "learning_rate": 2.493495989231198e-06, "loss": 0.0623, "step": 29000 }, { "epoch": 35.97620500305064, "grad_norm": 1.6189491748809814, "learning_rate": 1.8896100834437107e-06, "loss": 0.0608, "step": 29500 }, { "epoch": 36.0, "eval_cer": 1.0145803698435278, "eval_loss": 0.015181286260485649, "eval_runtime": 73.2042, "eval_samples_per_second": 93.437, "eval_steps_per_second": 11.68, "eval_wer": 0.9988304093567252, "step": 29520 }, { "epoch": 36.58572300183038, "grad_norm": 3.607675790786743, "learning_rate": 1.3644938278693997e-06, "loss": 0.0598, "step": 30000 }, { "epoch": 37.0, "eval_cer": 1.0146396396396395, "eval_loss": 0.015759674832224846, "eval_runtime": 73.8898, "eval_samples_per_second": 92.57, "eval_steps_per_second": 11.571, "eval_wer": 0.9989766081871345, "step": 30340 }, { "epoch": 37.195241000610125, "grad_norm": 1.4858845472335815, "learning_rate": 9.213022182052699e-07, "loss": 0.0579, "step": 30500 }, { "epoch": 37.80536912751678, "grad_norm": 0.11195004731416702, "learning_rate": 5.626980317060648e-07, "loss": 0.0578, "step": 31000 }, { "epoch": 38.0, "eval_cer": 1.0146989094357515, "eval_loss": 0.014966564252972603, "eval_runtime": 73.4635, "eval_samples_per_second": 93.107, "eval_steps_per_second": 11.638, "eval_wer": 0.9989766081871345, "step": 31160 }, { "epoch": 38.41488712629652, "grad_norm": 1.9088082313537598, "learning_rate": 2.912918111057888e-07, "loss": 0.0563, "step": 31500 }, { "epoch": 39.0, "eval_cer": 1.0147285443338074, "eval_loss": 0.015355916693806648, "eval_runtime": 63.935, "eval_samples_per_second": 106.984, "eval_steps_per_second": 13.373, "eval_wer": 0.9989766081871345, "step": 31980 }, { "epoch": 39.024405125076264, "grad_norm": 3.6777658462524414, "learning_rate": 1.0762696080869105e-07, "loss": 0.059, "step": 32000 }, { "epoch": 39.634533251982916, "grad_norm": 1.0012741088867188, "learning_rate": 1.3438245287707985e-08, "loss": 0.0591, "step": 32500 }, { "epoch": 39.95179987797437, "eval_cer": 1.0146692745376955, "eval_loss": 0.014642004854977131, "eval_runtime": 66.4954, "eval_samples_per_second": 102.864, "eval_steps_per_second": 12.858, "eval_wer": 0.9989766081871345, "step": 32760 }, { "epoch": 39.95179987797437, "step": 32760, "total_flos": 1.7574609702583173e+19, "train_loss": 1.277195220610743, "train_runtime": 29323.9298, "train_samples_per_second": 71.532, "train_steps_per_second": 1.117 } ], "logging_steps": 500, "max_steps": 32760, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7574609702583173e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }