{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.0,
  "eval_steps": 720,
  "global_step": 21594,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.13892747985551543,
      "grad_norm": 0.889398455619812,
      "learning_rate": 0.0007907752153375939,
      "loss": 0.3895,
      "step": 500
    },
    {
      "epoch": 0.2000555709919422,
      "eval_loss": 0.2990996241569519,
      "eval_runtime": 6.0564,
      "eval_samples_per_second": 82.558,
      "eval_steps_per_second": 5.284,
      "step": 720
    },
    {
      "epoch": 0.27785495971103086,
      "grad_norm": 0.5278475880622864,
      "learning_rate": 0.0007815133833472261,
      "loss": 0.2897,
      "step": 1000
    },
    {
      "epoch": 0.4001111419838844,
      "eval_loss": 0.2636842131614685,
      "eval_runtime": 6.0156,
      "eval_samples_per_second": 83.118,
      "eval_steps_per_second": 5.32,
      "step": 1440
    },
    {
      "epoch": 0.41678243956654626,
      "grad_norm": 1.208860158920288,
      "learning_rate": 0.0007722515513568584,
      "loss": 0.2681,
      "step": 1500
    },
    {
      "epoch": 0.5557099194220617,
      "grad_norm": 0.7001394629478455,
      "learning_rate": 0.0007629897193664907,
      "loss": 0.2536,
      "step": 2000
    },
    {
      "epoch": 0.6001667129758266,
      "eval_loss": 0.25593382120132446,
      "eval_runtime": 6.0644,
      "eval_samples_per_second": 82.449,
      "eval_steps_per_second": 5.277,
      "step": 2160
    },
    {
      "epoch": 0.6946373992775771,
      "grad_norm": 0.7066700458526611,
      "learning_rate": 0.000753727887376123,
      "loss": 0.2453,
      "step": 2500
    },
    {
      "epoch": 0.8002222839677688,
      "eval_loss": 0.24263954162597656,
      "eval_runtime": 6.2582,
      "eval_samples_per_second": 79.896,
      "eval_steps_per_second": 5.113,
      "step": 2880
    },
    {
      "epoch": 0.8335648791330925,
      "grad_norm": 0.8669331073760986,
      "learning_rate": 0.000744484579049736,
      "loss": 0.2378,
      "step": 3000
    },
    {
      "epoch": 0.972492358988608,
      "grad_norm": 0.752050518989563,
      "learning_rate": 0.0007352412707233491,
      "loss": 0.2335,
      "step": 3500
    },
    {
      "epoch": 1.000277854959711,
      "eval_loss": 0.24230095744132996,
      "eval_runtime": 6.2445,
      "eval_samples_per_second": 80.07,
      "eval_steps_per_second": 5.124,
      "step": 3600
    },
    {
      "epoch": 1.1114198388441234,
      "grad_norm": 0.5285800695419312,
      "learning_rate": 0.0007259794387329814,
      "loss": 0.2214,
      "step": 4000
    },
    {
      "epoch": 1.2003334259516532,
      "eval_loss": 0.23442834615707397,
      "eval_runtime": 6.0486,
      "eval_samples_per_second": 82.664,
      "eval_steps_per_second": 5.291,
      "step": 4320
    },
    {
      "epoch": 1.2503473186996388,
      "grad_norm": 0.7171725630760193,
      "learning_rate": 0.0007167176067426138,
      "loss": 0.2204,
      "step": 4500
    },
    {
      "epoch": 1.3892747985551543,
      "grad_norm": 0.6683080792427063,
      "learning_rate": 0.000707455774752246,
      "loss": 0.217,
      "step": 5000
    },
    {
      "epoch": 1.4003889969435954,
      "eval_loss": 0.2214350551366806,
      "eval_runtime": 6.0608,
      "eval_samples_per_second": 82.498,
      "eval_steps_per_second": 5.28,
      "step": 5040
    },
    {
      "epoch": 1.5282022784106695,
      "grad_norm": 0.8774034380912781,
      "learning_rate": 0.0006981939427618783,
      "loss": 0.2154,
      "step": 5500
    },
    {
      "epoch": 1.6004445679355377,
      "eval_loss": 0.2222467064857483,
      "eval_runtime": 6.2569,
      "eval_samples_per_second": 79.912,
      "eval_steps_per_second": 5.114,
      "step": 5760
    },
    {
      "epoch": 1.667129758266185,
      "grad_norm": 0.6040942668914795,
      "learning_rate": 0.0006889506344354914,
      "loss": 0.2133,
      "step": 6000
    },
    {
      "epoch": 1.8005001389274797,
      "eval_loss": 0.2248920053243637,
      "eval_runtime": 6.2381,
      "eval_samples_per_second": 80.153,
      "eval_steps_per_second": 5.13,
      "step": 6480
    },
    {
      "epoch": 1.8060572381217006,
      "grad_norm": 0.8217343688011169,
      "learning_rate": 0.0006796888024451237,
      "loss": 0.2108,
      "step": 6500
    },
    {
      "epoch": 1.9449847179772157,
      "grad_norm": 0.6363208889961243,
      "learning_rate": 0.0006704269704547561,
      "loss": 0.2095,
      "step": 7000
    },
    {
      "epoch": 2.000555709919422,
      "eval_loss": 0.2111140936613083,
      "eval_runtime": 6.4306,
      "eval_samples_per_second": 77.753,
      "eval_steps_per_second": 4.976,
      "step": 7200
    },
    {
      "epoch": 2.0839121978327313,
      "grad_norm": 0.5115044116973877,
      "learning_rate": 0.0006611651384643883,
      "loss": 0.1984,
      "step": 7500
    },
    {
      "epoch": 2.2006112809113643,
      "eval_loss": 0.21921053528785706,
      "eval_runtime": 6.2111,
      "eval_samples_per_second": 80.501,
      "eval_steps_per_second": 5.152,
      "step": 7920
    },
    {
      "epoch": 2.222839677688247,
      "grad_norm": 0.7524166703224182,
      "learning_rate": 0.0006519218301380013,
      "loss": 0.1981,
      "step": 8000
    },
    {
      "epoch": 2.361767157543762,
      "grad_norm": 0.7301591038703918,
      "learning_rate": 0.0006426599981476336,
      "loss": 0.1973,
      "step": 8500
    },
    {
      "epoch": 2.4006668519033063,
      "eval_loss": 0.2109183967113495,
      "eval_runtime": 6.2385,
      "eval_samples_per_second": 80.147,
      "eval_steps_per_second": 5.129,
      "step": 8640
    },
    {
      "epoch": 2.5006946373992776,
      "grad_norm": 0.6623874306678772,
      "learning_rate": 0.000633398166157266,
      "loss": 0.1938,
      "step": 9000
    },
    {
      "epoch": 2.600722422895249,
      "eval_loss": 0.19926229119300842,
      "eval_runtime": 6.2567,
      "eval_samples_per_second": 79.914,
      "eval_steps_per_second": 5.115,
      "step": 9360
    },
    {
      "epoch": 2.639622117254793,
      "grad_norm": 0.5896228551864624,
      "learning_rate": 0.0006241363341668982,
      "loss": 0.1917,
      "step": 9500
    },
    {
      "epoch": 2.7785495971103087,
      "grad_norm": 0.5303069353103638,
      "learning_rate": 0.0006148745021765306,
      "loss": 0.1941,
      "step": 10000
    },
    {
      "epoch": 2.800777993887191,
      "eval_loss": 0.20040422677993774,
      "eval_runtime": 6.445,
      "eval_samples_per_second": 77.58,
      "eval_steps_per_second": 4.965,
      "step": 10080
    },
    {
      "epoch": 2.917477076965824,
      "grad_norm": 0.5772528052330017,
      "learning_rate": 0.0006056126701861628,
      "loss": 0.1962,
      "step": 10500
    },
    {
      "epoch": 3.000833564879133,
      "eval_loss": 0.19769687950611115,
      "eval_runtime": 6.251,
      "eval_samples_per_second": 79.987,
      "eval_steps_per_second": 5.119,
      "step": 10800
    },
    {
      "epoch": 3.0564045568213394,
      "grad_norm": 0.8229184746742249,
      "learning_rate": 0.0005963508381957952,
      "loss": 0.1868,
      "step": 11000
    },
    {
      "epoch": 3.1953320366768545,
      "grad_norm": 0.6219042539596558,
      "learning_rate": 0.0005871075298694081,
      "loss": 0.1813,
      "step": 11500
    },
    {
      "epoch": 3.2008891358710754,
      "eval_loss": 0.20137149095535278,
      "eval_runtime": 6.1829,
      "eval_samples_per_second": 80.868,
      "eval_steps_per_second": 5.176,
      "step": 11520
    },
    {
      "epoch": 3.33425951653237,
      "grad_norm": 0.6010075807571411,
      "learning_rate": 0.0005778456978790405,
      "loss": 0.1822,
      "step": 12000
    },
    {
      "epoch": 3.4009447068630174,
      "eval_loss": 0.20252275466918945,
      "eval_runtime": 6.2359,
      "eval_samples_per_second": 80.181,
      "eval_steps_per_second": 5.132,
      "step": 12240
    },
    {
      "epoch": 3.4731869963878856,
      "grad_norm": 0.6477861404418945,
      "learning_rate": 0.0005685838658886728,
      "loss": 0.1814,
      "step": 12500
    },
    {
      "epoch": 3.6010002778549595,
      "eval_loss": 0.19002339243888855,
      "eval_runtime": 6.0426,
      "eval_samples_per_second": 82.745,
      "eval_steps_per_second": 5.296,
      "step": 12960
    },
    {
      "epoch": 3.612114476243401,
      "grad_norm": 0.6676946878433228,
      "learning_rate": 0.0005593220338983052,
      "loss": 0.1808,
      "step": 13000
    },
    {
      "epoch": 3.7510419560989163,
      "grad_norm": 1.1407862901687622,
      "learning_rate": 0.0005500602019079374,
      "loss": 0.1788,
      "step": 13500
    },
    {
      "epoch": 3.801055848846902,
      "eval_loss": 0.18976138532161713,
      "eval_runtime": 6.3955,
      "eval_samples_per_second": 78.18,
      "eval_steps_per_second": 5.004,
      "step": 13680
    },
    {
      "epoch": 3.889969435954432,
      "grad_norm": 0.818808913230896,
      "learning_rate": 0.0005407983699175697,
      "loss": 0.1795,
      "step": 14000
    },
    {
      "epoch": 4.001111419838844,
      "eval_loss": 0.18558593094348907,
      "eval_runtime": 6.8949,
      "eval_samples_per_second": 72.517,
      "eval_steps_per_second": 4.641,
      "step": 14400
    },
    {
      "epoch": 4.0288969158099475,
      "grad_norm": 0.5338103771209717,
      "learning_rate": 0.000531536537927202,
      "loss": 0.1732,
      "step": 14500
    },
    {
      "epoch": 4.167824395665463,
      "grad_norm": 0.6316047310829163,
      "learning_rate": 0.0005222747059368344,
      "loss": 0.1678,
      "step": 15000
    },
    {
      "epoch": 4.201166990830786,
      "eval_loss": 0.19053161144256592,
      "eval_runtime": 6.4789,
      "eval_samples_per_second": 77.173,
      "eval_steps_per_second": 4.939,
      "step": 15120
    },
    {
      "epoch": 4.306751875520978,
      "grad_norm": 0.9015474319458008,
      "learning_rate": 0.0005130313976104474,
      "loss": 0.1693,
      "step": 15500
    },
    {
      "epoch": 4.4012225618227285,
      "eval_loss": 0.19033345580101013,
      "eval_runtime": 6.2453,
      "eval_samples_per_second": 80.061,
      "eval_steps_per_second": 5.124,
      "step": 15840
    },
    {
      "epoch": 4.445679355376494,
      "grad_norm": 0.5398434400558472,
      "learning_rate": 0.0005037695656200797,
      "loss": 0.1705,
      "step": 16000
    },
    {
      "epoch": 4.584606835232009,
      "grad_norm": 0.6211907863616943,
      "learning_rate": 0.0004945077336297119,
      "loss": 0.1675,
      "step": 16500
    },
    {
      "epoch": 4.601278132814671,
      "eval_loss": 0.1857684701681137,
      "eval_runtime": 6.2063,
      "eval_samples_per_second": 80.563,
      "eval_steps_per_second": 5.156,
      "step": 16560
    },
    {
      "epoch": 4.723534315087524,
      "grad_norm": 0.6166074872016907,
      "learning_rate": 0.0004852459016393443,
      "loss": 0.1681,
      "step": 17000
    },
    {
      "epoch": 4.801333703806613,
      "eval_loss": 0.18442219495773315,
      "eval_runtime": 6.2555,
      "eval_samples_per_second": 79.93,
      "eval_steps_per_second": 5.116,
      "step": 17280
    },
    {
      "epoch": 4.86246179494304,
      "grad_norm": 0.5619335174560547,
      "learning_rate": 0.00047600259331295736,
      "loss": 0.1687,
      "step": 17500
    },
    {
      "epoch": 5.001389274798555,
      "grad_norm": 0.7084242701530457,
      "learning_rate": 0.0004667407613225896,
      "loss": 0.1635,
      "step": 18000
    },
    {
      "epoch": 5.001389274798555,
      "eval_loss": 0.1823691427707672,
      "eval_runtime": 6.4719,
      "eval_samples_per_second": 77.257,
      "eval_steps_per_second": 4.944,
      "step": 18000
    },
    {
      "epoch": 5.14031675465407,
      "grad_norm": 0.6466693878173828,
      "learning_rate": 0.0004574789293322219,
      "loss": 0.1537,
      "step": 18500
    },
    {
      "epoch": 5.201444845790498,
      "eval_loss": 0.18211981654167175,
      "eval_runtime": 6.28,
      "eval_samples_per_second": 79.618,
      "eval_steps_per_second": 5.096,
      "step": 18720
    },
    {
      "epoch": 5.279244234509586,
      "grad_norm": 0.760137677192688,
      "learning_rate": 0.00044821709734185424,
      "loss": 0.155,
      "step": 19000
    },
    {
      "epoch": 5.401500416782439,
      "eval_loss": 0.17618674039840698,
      "eval_runtime": 6.2993,
      "eval_samples_per_second": 79.374,
      "eval_steps_per_second": 5.08,
      "step": 19440
    },
    {
      "epoch": 5.418171714365101,
      "grad_norm": 0.6954505443572998,
      "learning_rate": 0.00043897378901546733,
      "loss": 0.155,
      "step": 19500
    },
    {
      "epoch": 5.5570991942206165,
      "grad_norm": 0.5655320882797241,
      "learning_rate": 0.00042971195702509957,
      "loss": 0.1559,
      "step": 20000
    },
    {
      "epoch": 5.601555987774382,
      "eval_loss": 0.18366502225399017,
      "eval_runtime": 6.3293,
      "eval_samples_per_second": 78.998,
      "eval_steps_per_second": 5.056,
      "step": 20160
    },
    {
      "epoch": 5.6960266740761325,
      "grad_norm": 0.5522324442863464,
      "learning_rate": 0.00042045012503473186,
      "loss": 0.1565,
      "step": 20500
    },
    {
      "epoch": 5.801611558766324,
      "eval_loss": 0.1724882870912552,
      "eval_runtime": 6.3076,
      "eval_samples_per_second": 79.269,
      "eval_steps_per_second": 5.073,
      "step": 20880
    },
    {
      "epoch": 5.834954153931648,
      "grad_norm": 0.5771639347076416,
      "learning_rate": 0.0004111882930443642,
      "loss": 0.1548,
      "step": 21000
    },
    {
      "epoch": 5.973881633787163,
      "grad_norm": 0.7794287204742432,
      "learning_rate": 0.0004019264610539965,
      "loss": 0.151,
      "step": 21500
    }
  ],
  "logging_steps": 500,
  "max_steps": 43188,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 12,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2773560145338696e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}