{ "best_metric": 5.272278467814128, "best_model_checkpoint": "./results/checkpoint-5496", "epoch": 7.0, "eval_steps": 500, "global_step": 6412, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1091703056768559, "grad_norm": 25.480337142944336, "learning_rate": 2.959061135371179e-05, "loss": 56.923, "step": 100 }, { "epoch": 0.2183406113537118, "grad_norm": 39.87223815917969, "learning_rate": 2.918122270742358e-05, "loss": 46.6475, "step": 200 }, { "epoch": 0.32751091703056767, "grad_norm": 48.05048751831055, "learning_rate": 2.877183406113537e-05, "loss": 33.6867, "step": 300 }, { "epoch": 0.4366812227074236, "grad_norm": 31.941883087158203, "learning_rate": 2.8362445414847164e-05, "loss": 21.1084, "step": 400 }, { "epoch": 0.5458515283842795, "grad_norm": 55.025856018066406, "learning_rate": 2.7953056768558954e-05, "loss": 12.9495, "step": 500 }, { "epoch": 0.6550218340611353, "grad_norm": 34.957523345947266, "learning_rate": 2.7543668122270742e-05, "loss": 10.0745, "step": 600 }, { "epoch": 0.7641921397379913, "grad_norm": 24.020906448364258, "learning_rate": 2.7134279475982533e-05, "loss": 8.3541, "step": 700 }, { "epoch": 0.8733624454148472, "grad_norm": 32.709571838378906, "learning_rate": 2.6724890829694323e-05, "loss": 7.5128, "step": 800 }, { "epoch": 0.982532751091703, "grad_norm": 38.94672393798828, "learning_rate": 2.6315502183406114e-05, "loss": 7.2241, "step": 900 }, { "epoch": 1.0, "eval_avg_mae": 7.529487609863281, "eval_loss": 7.529487609863281, "eval_mae_lex": 6.992014408111572, "eval_mae_sem": 5.432034492492676, "eval_mae_syn": 10.164413452148438, "eval_runtime": 27.1764, "eval_samples_per_second": 269.609, "eval_steps_per_second": 8.426, "step": 916 }, { "epoch": 1.091703056768559, "grad_norm": 26.391666412353516, "learning_rate": 2.5906113537117905e-05, "loss": 6.7699, "step": 1000 }, { "epoch": 1.2008733624454149, "grad_norm": 22.994029998779297, "learning_rate": 2.5496724890829696e-05, "loss": 6.5552, "step": 1100 }, { "epoch": 1.3100436681222707, "grad_norm": 20.722883224487305, "learning_rate": 2.5087336244541486e-05, "loss": 6.5897, "step": 1200 }, { "epoch": 1.4192139737991267, "grad_norm": 32.02668380737305, "learning_rate": 2.4677947598253277e-05, "loss": 6.5073, "step": 1300 }, { "epoch": 1.5283842794759825, "grad_norm": 32.40359115600586, "learning_rate": 2.4268558951965064e-05, "loss": 6.4684, "step": 1400 }, { "epoch": 1.6375545851528384, "grad_norm": 47.73025131225586, "learning_rate": 2.3859170305676855e-05, "loss": 6.3165, "step": 1500 }, { "epoch": 1.7467248908296944, "grad_norm": 47.35511016845703, "learning_rate": 2.344978165938865e-05, "loss": 6.2866, "step": 1600 }, { "epoch": 1.8558951965065502, "grad_norm": 44.51765441894531, "learning_rate": 2.3040393013100437e-05, "loss": 6.3404, "step": 1700 }, { "epoch": 1.965065502183406, "grad_norm": 26.496959686279297, "learning_rate": 2.2631004366812227e-05, "loss": 6.1681, "step": 1800 }, { "epoch": 2.0, "eval_avg_mae": 6.067600250244141, "eval_loss": 6.067600250244141, "eval_mae_lex": 5.595421314239502, "eval_mae_sem": 4.1164045333862305, "eval_mae_syn": 8.490975379943848, "eval_runtime": 27.2193, "eval_samples_per_second": 269.184, "eval_steps_per_second": 8.413, "step": 1832 }, { "epoch": 2.074235807860262, "grad_norm": 36.847415924072266, "learning_rate": 2.2221615720524018e-05, "loss": 5.9928, "step": 1900 }, { "epoch": 2.183406113537118, "grad_norm": 37.08506393432617, "learning_rate": 2.181222707423581e-05, "loss": 5.9648, "step": 2000 }, { "epoch": 2.2925764192139737, "grad_norm": 35.595909118652344, "learning_rate": 2.1402838427947596e-05, "loss": 5.8648, "step": 2100 }, { "epoch": 2.4017467248908297, "grad_norm": 23.82405662536621, "learning_rate": 2.099344978165939e-05, "loss": 5.9043, "step": 2200 }, { "epoch": 2.5109170305676853, "grad_norm": 30.872852325439453, "learning_rate": 2.058406113537118e-05, "loss": 5.8428, "step": 2300 }, { "epoch": 2.6200873362445414, "grad_norm": 42.079261779785156, "learning_rate": 2.0174672489082972e-05, "loss": 5.8529, "step": 2400 }, { "epoch": 2.7292576419213974, "grad_norm": 23.549190521240234, "learning_rate": 1.976528384279476e-05, "loss": 5.8328, "step": 2500 }, { "epoch": 2.8384279475982535, "grad_norm": 32.223079681396484, "learning_rate": 1.935589519650655e-05, "loss": 5.8484, "step": 2600 }, { "epoch": 2.947598253275109, "grad_norm": 25.67125129699707, "learning_rate": 1.894650655021834e-05, "loss": 5.5861, "step": 2700 }, { "epoch": 3.0, "eval_avg_mae": 5.645811716715495, "eval_loss": 5.645811080932617, "eval_mae_lex": 4.994715213775635, "eval_mae_sem": 3.6993861198425293, "eval_mae_syn": 8.24333381652832, "eval_runtime": 27.1472, "eval_samples_per_second": 269.899, "eval_steps_per_second": 8.435, "step": 2748 }, { "epoch": 3.056768558951965, "grad_norm": 31.40544319152832, "learning_rate": 1.8537117903930135e-05, "loss": 5.4937, "step": 2800 }, { "epoch": 3.165938864628821, "grad_norm": 28.67197608947754, "learning_rate": 1.8127729257641922e-05, "loss": 5.5573, "step": 2900 }, { "epoch": 3.2751091703056767, "grad_norm": 26.671180725097656, "learning_rate": 1.7718340611353713e-05, "loss": 5.6147, "step": 3000 }, { "epoch": 3.3842794759825328, "grad_norm": 32.73609924316406, "learning_rate": 1.7308951965065504e-05, "loss": 5.2704, "step": 3100 }, { "epoch": 3.493449781659389, "grad_norm": 26.268295288085938, "learning_rate": 1.689956331877729e-05, "loss": 5.4946, "step": 3200 }, { "epoch": 3.6026200873362444, "grad_norm": 24.3873233795166, "learning_rate": 1.649017467248908e-05, "loss": 5.5757, "step": 3300 }, { "epoch": 3.7117903930131004, "grad_norm": 26.872316360473633, "learning_rate": 1.6080786026200872e-05, "loss": 5.3305, "step": 3400 }, { "epoch": 3.8209606986899565, "grad_norm": 31.78321647644043, "learning_rate": 1.5671397379912666e-05, "loss": 5.4091, "step": 3500 }, { "epoch": 3.930131004366812, "grad_norm": 37.95060729980469, "learning_rate": 1.5262008733624454e-05, "loss": 5.464, "step": 3600 }, { "epoch": 4.0, "eval_avg_mae": 5.841625213623047, "eval_loss": 5.841624736785889, "eval_mae_lex": 5.572142124176025, "eval_mae_sem": 3.7872631549835205, "eval_mae_syn": 8.1654691696167, "eval_runtime": 27.1382, "eval_samples_per_second": 269.988, "eval_steps_per_second": 8.438, "step": 3664 }, { "epoch": 4.039301310043668, "grad_norm": 37.76191329956055, "learning_rate": 1.4852620087336245e-05, "loss": 5.4072, "step": 3700 }, { "epoch": 4.148471615720524, "grad_norm": 33.29827117919922, "learning_rate": 1.4443231441048035e-05, "loss": 5.0985, "step": 3800 }, { "epoch": 4.25764192139738, "grad_norm": 47.478206634521484, "learning_rate": 1.4033842794759826e-05, "loss": 5.1541, "step": 3900 }, { "epoch": 4.366812227074236, "grad_norm": 31.66642189025879, "learning_rate": 1.3624454148471617e-05, "loss": 5.106, "step": 4000 }, { "epoch": 4.475982532751091, "grad_norm": 27.389015197753906, "learning_rate": 1.3215065502183406e-05, "loss": 5.1793, "step": 4100 }, { "epoch": 4.585152838427947, "grad_norm": 26.702226638793945, "learning_rate": 1.2805676855895198e-05, "loss": 5.0975, "step": 4200 }, { "epoch": 4.6943231441048034, "grad_norm": 31.537691116333008, "learning_rate": 1.2396288209606987e-05, "loss": 4.9613, "step": 4300 }, { "epoch": 4.8034934497816595, "grad_norm": 26.946945190429688, "learning_rate": 1.1986899563318778e-05, "loss": 5.1799, "step": 4400 }, { "epoch": 4.9126637554585155, "grad_norm": 27.92361068725586, "learning_rate": 1.1577510917030569e-05, "loss": 5.0393, "step": 4500 }, { "epoch": 5.0, "eval_avg_mae": 5.481770197550456, "eval_loss": 5.481770038604736, "eval_mae_lex": 4.809901714324951, "eval_mae_sem": 3.8779022693634033, "eval_mae_syn": 7.757506370544434, "eval_runtime": 27.0994, "eval_samples_per_second": 270.375, "eval_steps_per_second": 8.45, "step": 4580 }, { "epoch": 5.021834061135372, "grad_norm": 25.387714385986328, "learning_rate": 1.1168122270742358e-05, "loss": 5.0585, "step": 4600 }, { "epoch": 5.131004366812227, "grad_norm": 33.1529655456543, "learning_rate": 1.0758733624454149e-05, "loss": 4.774, "step": 4700 }, { "epoch": 5.240174672489083, "grad_norm": 31.700637817382812, "learning_rate": 1.034934497816594e-05, "loss": 4.762, "step": 4800 }, { "epoch": 5.349344978165939, "grad_norm": 34.32217025756836, "learning_rate": 9.93995633187773e-06, "loss": 4.8645, "step": 4900 }, { "epoch": 5.458515283842795, "grad_norm": 52.338130950927734, "learning_rate": 9.530567685589519e-06, "loss": 4.9913, "step": 5000 }, { "epoch": 5.567685589519651, "grad_norm": 27.761211395263672, "learning_rate": 9.12117903930131e-06, "loss": 4.9047, "step": 5100 }, { "epoch": 5.676855895196507, "grad_norm": 36.54159164428711, "learning_rate": 8.7117903930131e-06, "loss": 4.7824, "step": 5200 }, { "epoch": 5.786026200873362, "grad_norm": 31.954957962036133, "learning_rate": 8.302401746724891e-06, "loss": 4.7555, "step": 5300 }, { "epoch": 5.895196506550218, "grad_norm": 33.35627365112305, "learning_rate": 7.89301310043668e-06, "loss": 4.8389, "step": 5400 }, { "epoch": 6.0, "eval_avg_mae": 5.272278467814128, "eval_loss": 5.272278308868408, "eval_mae_lex": 4.76518440246582, "eval_mae_sem": 3.384120225906372, "eval_mae_syn": 7.6675310134887695, "eval_runtime": 27.0821, "eval_samples_per_second": 270.548, "eval_steps_per_second": 8.456, "step": 5496 }, { "epoch": 6.004366812227074, "grad_norm": 29.006881713867188, "learning_rate": 7.483624454148472e-06, "loss": 4.8767, "step": 5500 }, { "epoch": 6.11353711790393, "grad_norm": 29.40102195739746, "learning_rate": 7.074235807860262e-06, "loss": 4.6832, "step": 5600 }, { "epoch": 6.222707423580786, "grad_norm": 32.63214111328125, "learning_rate": 6.664847161572053e-06, "loss": 4.6676, "step": 5700 }, { "epoch": 6.331877729257642, "grad_norm": 32.83290481567383, "learning_rate": 6.2554585152838425e-06, "loss": 4.6235, "step": 5800 }, { "epoch": 6.441048034934497, "grad_norm": 36.680030822753906, "learning_rate": 5.846069868995633e-06, "loss": 4.4681, "step": 5900 }, { "epoch": 6.550218340611353, "grad_norm": 37.740535736083984, "learning_rate": 5.436681222707424e-06, "loss": 4.518, "step": 6000 }, { "epoch": 6.6593886462882095, "grad_norm": 33.8775749206543, "learning_rate": 5.027292576419214e-06, "loss": 4.6268, "step": 6100 }, { "epoch": 6.7685589519650655, "grad_norm": 50.033729553222656, "learning_rate": 4.617903930131005e-06, "loss": 4.6026, "step": 6200 }, { "epoch": 6.877729257641922, "grad_norm": 34.454341888427734, "learning_rate": 4.208515283842795e-06, "loss": 4.618, "step": 6300 }, { "epoch": 6.986899563318778, "grad_norm": 41.41761779785156, "learning_rate": 3.799126637554585e-06, "loss": 4.6299, "step": 6400 }, { "epoch": 7.0, "eval_avg_mae": 5.449769337972005, "eval_loss": 5.449769496917725, "eval_mae_lex": 4.877564430236816, "eval_mae_sem": 3.8569977283477783, "eval_mae_syn": 7.614744663238525, "eval_runtime": 27.0562, "eval_samples_per_second": 270.806, "eval_steps_per_second": 8.464, "step": 6412 } ], "logging_steps": 100, "max_steps": 7328, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3494363633370368e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }