{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.0,
  "eval_steps": 720,
  "global_step": 32391,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.13892747985551543,
      "grad_norm": 0.889398455619812,
      "learning_rate": 0.0007907752153375939,
      "loss": 0.3895,
      "step": 500
    },
    {
      "epoch": 0.2000555709919422,
      "eval_loss": 0.2990996241569519,
      "eval_runtime": 6.0564,
      "eval_samples_per_second": 82.558,
      "eval_steps_per_second": 5.284,
      "step": 720
    },
    {
      "epoch": 0.27785495971103086,
      "grad_norm": 0.5278475880622864,
      "learning_rate": 0.0007815133833472261,
      "loss": 0.2897,
      "step": 1000
    },
    {
      "epoch": 0.4001111419838844,
      "eval_loss": 0.2636842131614685,
      "eval_runtime": 6.0156,
      "eval_samples_per_second": 83.118,
      "eval_steps_per_second": 5.32,
      "step": 1440
    },
    {
      "epoch": 0.41678243956654626,
      "grad_norm": 1.208860158920288,
      "learning_rate": 0.0007722515513568584,
      "loss": 0.2681,
      "step": 1500
    },
    {
      "epoch": 0.5557099194220617,
      "grad_norm": 0.7001394629478455,
      "learning_rate": 0.0007629897193664907,
      "loss": 0.2536,
      "step": 2000
    },
    {
      "epoch": 0.6001667129758266,
      "eval_loss": 0.25593382120132446,
      "eval_runtime": 6.0644,
      "eval_samples_per_second": 82.449,
      "eval_steps_per_second": 5.277,
      "step": 2160
    },
    {
      "epoch": 0.6946373992775771,
      "grad_norm": 0.7066700458526611,
      "learning_rate": 0.000753727887376123,
      "loss": 0.2453,
      "step": 2500
    },
    {
      "epoch": 0.8002222839677688,
      "eval_loss": 0.24263954162597656,
      "eval_runtime": 6.2582,
      "eval_samples_per_second": 79.896,
      "eval_steps_per_second": 5.113,
      "step": 2880
    },
    {
      "epoch": 0.8335648791330925,
      "grad_norm": 0.8669331073760986,
      "learning_rate": 0.000744484579049736,
      "loss": 0.2378,
      "step": 3000
    },
    {
      "epoch": 0.972492358988608,
      "grad_norm": 0.752050518989563,
      "learning_rate": 0.0007352412707233491,
      "loss": 0.2335,
      "step": 3500
    },
    {
      "epoch": 1.000277854959711,
      "eval_loss": 0.24230095744132996,
      "eval_runtime": 6.2445,
      "eval_samples_per_second": 80.07,
      "eval_steps_per_second": 5.124,
      "step": 3600
    },
    {
      "epoch": 1.1114198388441234,
      "grad_norm": 0.5285800695419312,
      "learning_rate": 0.0007259794387329814,
      "loss": 0.2214,
      "step": 4000
    },
    {
      "epoch": 1.2003334259516532,
      "eval_loss": 0.23442834615707397,
      "eval_runtime": 6.0486,
      "eval_samples_per_second": 82.664,
      "eval_steps_per_second": 5.291,
      "step": 4320
    },
    {
      "epoch": 1.2503473186996388,
      "grad_norm": 0.7171725630760193,
      "learning_rate": 0.0007167176067426138,
      "loss": 0.2204,
      "step": 4500
    },
    {
      "epoch": 1.3892747985551543,
      "grad_norm": 0.6683080792427063,
      "learning_rate": 0.000707455774752246,
      "loss": 0.217,
      "step": 5000
    },
    {
      "epoch": 1.4003889969435954,
      "eval_loss": 0.2214350551366806,
      "eval_runtime": 6.0608,
      "eval_samples_per_second": 82.498,
      "eval_steps_per_second": 5.28,
      "step": 5040
    },
    {
      "epoch": 1.5282022784106695,
      "grad_norm": 0.8774034380912781,
      "learning_rate": 0.0006981939427618783,
      "loss": 0.2154,
      "step": 5500
    },
    {
      "epoch": 1.6004445679355377,
      "eval_loss": 0.2222467064857483,
      "eval_runtime": 6.2569,
      "eval_samples_per_second": 79.912,
      "eval_steps_per_second": 5.114,
      "step": 5760
    },
    {
      "epoch": 1.667129758266185,
      "grad_norm": 0.6040942668914795,
      "learning_rate": 0.0006889506344354914,
      "loss": 0.2133,
      "step": 6000
    },
    {
      "epoch": 1.8005001389274797,
      "eval_loss": 0.2248920053243637,
      "eval_runtime": 6.2381,
      "eval_samples_per_second": 80.153,
      "eval_steps_per_second": 5.13,
      "step": 6480
    },
    {
      "epoch": 1.8060572381217006,
      "grad_norm": 0.8217343688011169,
      "learning_rate": 0.0006796888024451237,
      "loss": 0.2108,
      "step": 6500
    },
    {
      "epoch": 1.9449847179772157,
      "grad_norm": 0.6363208889961243,
      "learning_rate": 0.0006704269704547561,
      "loss": 0.2095,
      "step": 7000
    },
    {
      "epoch": 2.000555709919422,
      "eval_loss": 0.2111140936613083,
      "eval_runtime": 6.4306,
      "eval_samples_per_second": 77.753,
      "eval_steps_per_second": 4.976,
      "step": 7200
    },
    {
      "epoch": 2.0839121978327313,
      "grad_norm": 0.5115044116973877,
      "learning_rate": 0.0006611651384643883,
      "loss": 0.1984,
      "step": 7500
    },
    {
      "epoch": 2.2006112809113643,
      "eval_loss": 0.21921053528785706,
      "eval_runtime": 6.2111,
      "eval_samples_per_second": 80.501,
      "eval_steps_per_second": 5.152,
      "step": 7920
    },
    {
      "epoch": 2.222839677688247,
      "grad_norm": 0.7524166703224182,
      "learning_rate": 0.0006519218301380013,
      "loss": 0.1981,
      "step": 8000
    },
    {
      "epoch": 2.361767157543762,
      "grad_norm": 0.7301591038703918,
      "learning_rate": 0.0006426599981476336,
      "loss": 0.1973,
      "step": 8500
    },
    {
      "epoch": 2.4006668519033063,
      "eval_loss": 0.2109183967113495,
      "eval_runtime": 6.2385,
      "eval_samples_per_second": 80.147,
      "eval_steps_per_second": 5.129,
      "step": 8640
    },
    {
      "epoch": 2.5006946373992776,
      "grad_norm": 0.6623874306678772,
      "learning_rate": 0.000633398166157266,
      "loss": 0.1938,
      "step": 9000
    },
    {
      "epoch": 2.600722422895249,
      "eval_loss": 0.19926229119300842,
      "eval_runtime": 6.2567,
      "eval_samples_per_second": 79.914,
      "eval_steps_per_second": 5.115,
      "step": 9360
    },
    {
      "epoch": 2.639622117254793,
      "grad_norm": 0.5896228551864624,
      "learning_rate": 0.0006241363341668982,
      "loss": 0.1917,
      "step": 9500
    },
    {
      "epoch": 2.7785495971103087,
      "grad_norm": 0.5303069353103638,
      "learning_rate": 0.0006148745021765306,
      "loss": 0.1941,
      "step": 10000
    },
    {
      "epoch": 2.800777993887191,
      "eval_loss": 0.20040422677993774,
      "eval_runtime": 6.445,
      "eval_samples_per_second": 77.58,
      "eval_steps_per_second": 4.965,
      "step": 10080
    },
    {
      "epoch": 2.917477076965824,
      "grad_norm": 0.5772528052330017,
      "learning_rate": 0.0006056126701861628,
      "loss": 0.1962,
      "step": 10500
    },
    {
      "epoch": 3.000833564879133,
      "eval_loss": 0.19769687950611115,
      "eval_runtime": 6.251,
      "eval_samples_per_second": 79.987,
      "eval_steps_per_second": 5.119,
      "step": 10800
    },
    {
      "epoch": 3.0564045568213394,
      "grad_norm": 0.8229184746742249,
      "learning_rate": 0.0005963508381957952,
      "loss": 0.1868,
      "step": 11000
    },
    {
      "epoch": 3.1953320366768545,
      "grad_norm": 0.6219042539596558,
      "learning_rate": 0.0005871075298694081,
      "loss": 0.1813,
      "step": 11500
    },
    {
      "epoch": 3.2008891358710754,
      "eval_loss": 0.20137149095535278,
      "eval_runtime": 6.1829,
      "eval_samples_per_second": 80.868,
      "eval_steps_per_second": 5.176,
      "step": 11520
    },
    {
      "epoch": 3.33425951653237,
      "grad_norm": 0.6010075807571411,
      "learning_rate": 0.0005778456978790405,
      "loss": 0.1822,
      "step": 12000
    },
    {
      "epoch": 3.4009447068630174,
      "eval_loss": 0.20252275466918945,
      "eval_runtime": 6.2359,
      "eval_samples_per_second": 80.181,
      "eval_steps_per_second": 5.132,
      "step": 12240
    },
    {
      "epoch": 3.4731869963878856,
      "grad_norm": 0.6477861404418945,
      "learning_rate": 0.0005685838658886728,
      "loss": 0.1814,
      "step": 12500
    },
    {
      "epoch": 3.6010002778549595,
      "eval_loss": 0.19002339243888855,
      "eval_runtime": 6.0426,
      "eval_samples_per_second": 82.745,
      "eval_steps_per_second": 5.296,
      "step": 12960
    },
    {
      "epoch": 3.612114476243401,
      "grad_norm": 0.6676946878433228,
      "learning_rate": 0.0005593220338983052,
      "loss": 0.1808,
      "step": 13000
    },
    {
      "epoch": 3.7510419560989163,
      "grad_norm": 1.1407862901687622,
      "learning_rate": 0.0005500602019079374,
      "loss": 0.1788,
      "step": 13500
    },
    {
      "epoch": 3.801055848846902,
      "eval_loss": 0.18976138532161713,
      "eval_runtime": 6.3955,
      "eval_samples_per_second": 78.18,
      "eval_steps_per_second": 5.004,
      "step": 13680
    },
    {
      "epoch": 3.889969435954432,
      "grad_norm": 0.818808913230896,
      "learning_rate": 0.0005407983699175697,
      "loss": 0.1795,
      "step": 14000
    },
    {
      "epoch": 4.001111419838844,
      "eval_loss": 0.18558593094348907,
      "eval_runtime": 6.8949,
      "eval_samples_per_second": 72.517,
      "eval_steps_per_second": 4.641,
      "step": 14400
    },
    {
      "epoch": 4.0288969158099475,
      "grad_norm": 0.5338103771209717,
      "learning_rate": 0.000531536537927202,
      "loss": 0.1732,
      "step": 14500
    },
    {
      "epoch": 4.167824395665463,
      "grad_norm": 0.6316047310829163,
      "learning_rate": 0.0005222747059368344,
      "loss": 0.1678,
      "step": 15000
    },
    {
      "epoch": 4.201166990830786,
      "eval_loss": 0.19053161144256592,
      "eval_runtime": 6.4789,
      "eval_samples_per_second": 77.173,
      "eval_steps_per_second": 4.939,
      "step": 15120
    },
    {
      "epoch": 4.306751875520978,
      "grad_norm": 0.9015474319458008,
      "learning_rate": 0.0005130313976104474,
      "loss": 0.1693,
      "step": 15500
    },
    {
      "epoch": 4.4012225618227285,
      "eval_loss": 0.19033345580101013,
      "eval_runtime": 6.2453,
      "eval_samples_per_second": 80.061,
      "eval_steps_per_second": 5.124,
      "step": 15840
    },
    {
      "epoch": 4.445679355376494,
      "grad_norm": 0.5398434400558472,
      "learning_rate": 0.0005037695656200797,
      "loss": 0.1705,
      "step": 16000
    },
    {
      "epoch": 4.584606835232009,
      "grad_norm": 0.6211907863616943,
      "learning_rate": 0.0004945077336297119,
      "loss": 0.1675,
      "step": 16500
    },
    {
      "epoch": 4.601278132814671,
      "eval_loss": 0.1857684701681137,
      "eval_runtime": 6.2063,
      "eval_samples_per_second": 80.563,
      "eval_steps_per_second": 5.156,
      "step": 16560
    },
    {
      "epoch": 4.723534315087524,
      "grad_norm": 0.6166074872016907,
      "learning_rate": 0.0004852459016393443,
      "loss": 0.1681,
      "step": 17000
    },
    {
      "epoch": 4.801333703806613,
      "eval_loss": 0.18442219495773315,
      "eval_runtime": 6.2555,
      "eval_samples_per_second": 79.93,
      "eval_steps_per_second": 5.116,
      "step": 17280
    },
    {
      "epoch": 4.86246179494304,
      "grad_norm": 0.5619335174560547,
      "learning_rate": 0.00047600259331295736,
      "loss": 0.1687,
      "step": 17500
    },
    {
      "epoch": 5.001389274798555,
      "grad_norm": 0.7084242701530457,
      "learning_rate": 0.0004667407613225896,
      "loss": 0.1635,
      "step": 18000
    },
    {
      "epoch": 5.001389274798555,
      "eval_loss": 0.1823691427707672,
      "eval_runtime": 6.4719,
      "eval_samples_per_second": 77.257,
      "eval_steps_per_second": 4.944,
      "step": 18000
    },
    {
      "epoch": 5.14031675465407,
      "grad_norm": 0.6466693878173828,
      "learning_rate": 0.0004574789293322219,
      "loss": 0.1537,
      "step": 18500
    },
    {
      "epoch": 5.201444845790498,
      "eval_loss": 0.18211981654167175,
      "eval_runtime": 6.28,
      "eval_samples_per_second": 79.618,
      "eval_steps_per_second": 5.096,
      "step": 18720
    },
    {
      "epoch": 5.279244234509586,
      "grad_norm": 0.760137677192688,
      "learning_rate": 0.00044821709734185424,
      "loss": 0.155,
      "step": 19000
    },
    {
      "epoch": 5.401500416782439,
      "eval_loss": 0.17618674039840698,
      "eval_runtime": 6.2993,
      "eval_samples_per_second": 79.374,
      "eval_steps_per_second": 5.08,
      "step": 19440
    },
    {
      "epoch": 5.418171714365101,
      "grad_norm": 0.6954505443572998,
      "learning_rate": 0.00043897378901546733,
      "loss": 0.155,
      "step": 19500
    },
    {
      "epoch": 5.5570991942206165,
      "grad_norm": 0.5655320882797241,
      "learning_rate": 0.00042971195702509957,
      "loss": 0.1559,
      "step": 20000
    },
    {
      "epoch": 5.601555987774382,
      "eval_loss": 0.18366502225399017,
      "eval_runtime": 6.3293,
      "eval_samples_per_second": 78.998,
      "eval_steps_per_second": 5.056,
      "step": 20160
    },
    {
      "epoch": 5.6960266740761325,
      "grad_norm": 0.5522324442863464,
      "learning_rate": 0.00042045012503473186,
      "loss": 0.1565,
      "step": 20500
    },
    {
      "epoch": 5.801611558766324,
      "eval_loss": 0.1724882870912552,
      "eval_runtime": 6.3076,
      "eval_samples_per_second": 79.269,
      "eval_steps_per_second": 5.073,
      "step": 20880
    },
    {
      "epoch": 5.834954153931648,
      "grad_norm": 0.5771639347076416,
      "learning_rate": 0.0004111882930443642,
      "loss": 0.1548,
      "step": 21000
    },
    {
      "epoch": 5.973881633787163,
      "grad_norm": 0.7794287204742432,
      "learning_rate": 0.0004019264610539965,
      "loss": 0.151,
      "step": 21500
    },
    {
      "epoch": 6.001667129758266,
      "eval_loss": 0.17095144093036652,
      "eval_runtime": 6.2425,
      "eval_samples_per_second": 80.096,
      "eval_steps_per_second": 5.126,
      "step": 21600
    },
    {
      "epoch": 6.112809113642679,
      "grad_norm": 0.755778968334198,
      "learning_rate": 0.0003926646290636288,
      "loss": 0.1444,
      "step": 22000
    },
    {
      "epoch": 6.201722700750208,
      "eval_loss": 0.1705874651670456,
      "eval_runtime": 6.3293,
      "eval_samples_per_second": 78.998,
      "eval_steps_per_second": 5.056,
      "step": 22320
    },
    {
      "epoch": 6.251736593498194,
      "grad_norm": 0.5752654671669006,
      "learning_rate": 0.0003834027970732611,
      "loss": 0.1423,
      "step": 22500
    },
    {
      "epoch": 6.390664073353709,
      "grad_norm": 0.5828496217727661,
      "learning_rate": 0.0003741409650828934,
      "loss": 0.1447,
      "step": 23000
    },
    {
      "epoch": 6.401778271742151,
      "eval_loss": 0.1719101518392563,
      "eval_runtime": 6.3021,
      "eval_samples_per_second": 79.339,
      "eval_steps_per_second": 5.078,
      "step": 23040
    },
    {
      "epoch": 6.529591553209225,
      "grad_norm": 0.5708982944488525,
      "learning_rate": 0.0003648976567565065,
      "loss": 0.1429,
      "step": 23500
    },
    {
      "epoch": 6.601833842734093,
      "eval_loss": 0.17164455354213715,
      "eval_runtime": 6.3252,
      "eval_samples_per_second": 79.048,
      "eval_steps_per_second": 5.059,
      "step": 23760
    },
    {
      "epoch": 6.66851903306474,
      "grad_norm": 0.6801475286483765,
      "learning_rate": 0.0003556358247661388,
      "loss": 0.1428,
      "step": 24000
    },
    {
      "epoch": 6.801889413726035,
      "eval_loss": 0.16734325885772705,
      "eval_runtime": 6.4095,
      "eval_samples_per_second": 78.01,
      "eval_steps_per_second": 4.993,
      "step": 24480
    },
    {
      "epoch": 6.807446512920255,
      "grad_norm": 0.5865324139595032,
      "learning_rate": 0.00034637399277577107,
      "loss": 0.1424,
      "step": 24500
    },
    {
      "epoch": 6.946373992775771,
      "grad_norm": 0.7098519206047058,
      "learning_rate": 0.00033711216078540336,
      "loss": 0.1407,
      "step": 25000
    },
    {
      "epoch": 7.001944984717977,
      "eval_loss": 0.16686856746673584,
      "eval_runtime": 6.4894,
      "eval_samples_per_second": 77.049,
      "eval_steps_per_second": 4.931,
      "step": 25200
    },
    {
      "epoch": 7.085301472631286,
      "grad_norm": 0.8402431607246399,
      "learning_rate": 0.00032785032879503566,
      "loss": 0.1335,
      "step": 25500
    },
    {
      "epoch": 7.20200055570992,
      "eval_loss": 0.16496331989765167,
      "eval_runtime": 6.166,
      "eval_samples_per_second": 81.09,
      "eval_steps_per_second": 5.19,
      "step": 25920
    },
    {
      "epoch": 7.2242289524868015,
      "grad_norm": 0.8292597532272339,
      "learning_rate": 0.0003186070204686487,
      "loss": 0.1297,
      "step": 26000
    },
    {
      "epoch": 7.3631564323423175,
      "grad_norm": 0.8218772411346436,
      "learning_rate": 0.00030934518847828104,
      "loss": 0.131,
      "step": 26500
    },
    {
      "epoch": 7.402056126701861,
      "eval_loss": 0.1669527292251587,
      "eval_runtime": 6.3072,
      "eval_samples_per_second": 79.274,
      "eval_steps_per_second": 5.074,
      "step": 26640
    },
    {
      "epoch": 7.502083912197833,
      "grad_norm": 0.39585080742836,
      "learning_rate": 0.0003000833564879133,
      "loss": 0.1314,
      "step": 27000
    },
    {
      "epoch": 7.602111697693804,
      "eval_loss": 0.1617124080657959,
      "eval_runtime": 6.4904,
      "eval_samples_per_second": 77.037,
      "eval_steps_per_second": 4.93,
      "step": 27360
    },
    {
      "epoch": 7.641011392053348,
      "grad_norm": 0.5454237461090088,
      "learning_rate": 0.00029082152449754563,
      "loss": 0.131,
      "step": 27500
    },
    {
      "epoch": 7.779938871908864,
      "grad_norm": 0.48045065999031067,
      "learning_rate": 0.000281559692507178,
      "loss": 0.1293,
      "step": 28000
    },
    {
      "epoch": 7.802167268685746,
      "eval_loss": 0.1614612638950348,
      "eval_runtime": 6.2421,
      "eval_samples_per_second": 80.101,
      "eval_steps_per_second": 5.126,
      "step": 28080
    },
    {
      "epoch": 7.918866351764379,
      "grad_norm": 0.4793488681316376,
      "learning_rate": 0.0002722978605168102,
      "loss": 0.1309,
      "step": 28500
    },
    {
      "epoch": 8.002222839677689,
      "eval_loss": 0.16146394610404968,
      "eval_runtime": 6.3071,
      "eval_samples_per_second": 79.276,
      "eval_steps_per_second": 5.074,
      "step": 28800
    },
    {
      "epoch": 8.057793831619895,
      "grad_norm": 0.8855862617492676,
      "learning_rate": 0.0002630545521904233,
      "loss": 0.1235,
      "step": 29000
    },
    {
      "epoch": 8.19672131147541,
      "grad_norm": 0.6180191040039062,
      "learning_rate": 0.0002537927202000556,
      "loss": 0.1192,
      "step": 29500
    },
    {
      "epoch": 8.20227841066963,
      "eval_loss": 0.1589631289243698,
      "eval_runtime": 6.2706,
      "eval_samples_per_second": 79.738,
      "eval_steps_per_second": 5.103,
      "step": 29520
    },
    {
      "epoch": 8.335648791330925,
      "grad_norm": 0.6425427198410034,
      "learning_rate": 0.0002445308882096879,
      "loss": 0.1186,
      "step": 30000
    },
    {
      "epoch": 8.402333981661572,
      "eval_loss": 0.1566726714372635,
      "eval_runtime": 6.2113,
      "eval_samples_per_second": 80.499,
      "eval_steps_per_second": 5.152,
      "step": 30240
    },
    {
      "epoch": 8.474576271186441,
      "grad_norm": 0.4892035126686096,
      "learning_rate": 0.0002352690562193202,
      "loss": 0.1191,
      "step": 30500
    },
    {
      "epoch": 8.602389552653515,
      "eval_loss": 0.15965625643730164,
      "eval_runtime": 6.0862,
      "eval_samples_per_second": 82.153,
      "eval_steps_per_second": 5.258,
      "step": 30960
    },
    {
      "epoch": 8.613503751041955,
      "grad_norm": 0.5929815173149109,
      "learning_rate": 0.00022602574789293324,
      "loss": 0.1192,
      "step": 31000
    },
    {
      "epoch": 8.752431230897471,
      "grad_norm": 0.5730671882629395,
      "learning_rate": 0.00021676391590256554,
      "loss": 0.1181,
      "step": 31500
    },
    {
      "epoch": 8.802445123645457,
      "eval_loss": 0.15325884521007538,
      "eval_runtime": 6.072,
      "eval_samples_per_second": 82.346,
      "eval_steps_per_second": 5.27,
      "step": 31680
    },
    {
      "epoch": 8.891358710752987,
      "grad_norm": 1.0519418716430664,
      "learning_rate": 0.00020750208391219786,
      "loss": 0.1181,
      "step": 32000
    }
  ],
  "logging_steps": 500,
  "max_steps": 43188,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 12,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.9164816786419876e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}