|
{ |
|
"best_metric": 5.272278467814128, |
|
"best_model_checkpoint": "./results/checkpoint-5496", |
|
"epoch": 7.0, |
|
"eval_steps": 500, |
|
"global_step": 6412, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1091703056768559, |
|
"grad_norm": 25.480337142944336, |
|
"learning_rate": 2.959061135371179e-05, |
|
"loss": 56.923, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2183406113537118, |
|
"grad_norm": 39.87223815917969, |
|
"learning_rate": 2.918122270742358e-05, |
|
"loss": 46.6475, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.32751091703056767, |
|
"grad_norm": 48.05048751831055, |
|
"learning_rate": 2.877183406113537e-05, |
|
"loss": 33.6867, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4366812227074236, |
|
"grad_norm": 31.941883087158203, |
|
"learning_rate": 2.8362445414847164e-05, |
|
"loss": 21.1084, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5458515283842795, |
|
"grad_norm": 55.025856018066406, |
|
"learning_rate": 2.7953056768558954e-05, |
|
"loss": 12.9495, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6550218340611353, |
|
"grad_norm": 34.957523345947266, |
|
"learning_rate": 2.7543668122270742e-05, |
|
"loss": 10.0745, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7641921397379913, |
|
"grad_norm": 24.020906448364258, |
|
"learning_rate": 2.7134279475982533e-05, |
|
"loss": 8.3541, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8733624454148472, |
|
"grad_norm": 32.709571838378906, |
|
"learning_rate": 2.6724890829694323e-05, |
|
"loss": 7.5128, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.982532751091703, |
|
"grad_norm": 38.94672393798828, |
|
"learning_rate": 2.6315502183406114e-05, |
|
"loss": 7.2241, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_avg_mae": 7.529487609863281, |
|
"eval_loss": 7.529487609863281, |
|
"eval_mae_lex": 6.992014408111572, |
|
"eval_mae_sem": 5.432034492492676, |
|
"eval_mae_syn": 10.164413452148438, |
|
"eval_runtime": 27.1764, |
|
"eval_samples_per_second": 269.609, |
|
"eval_steps_per_second": 8.426, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 1.091703056768559, |
|
"grad_norm": 26.391666412353516, |
|
"learning_rate": 2.5906113537117905e-05, |
|
"loss": 6.7699, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.2008733624454149, |
|
"grad_norm": 22.994029998779297, |
|
"learning_rate": 2.5496724890829696e-05, |
|
"loss": 6.5552, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.3100436681222707, |
|
"grad_norm": 20.722883224487305, |
|
"learning_rate": 2.5087336244541486e-05, |
|
"loss": 6.5897, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.4192139737991267, |
|
"grad_norm": 32.02668380737305, |
|
"learning_rate": 2.4677947598253277e-05, |
|
"loss": 6.5073, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.5283842794759825, |
|
"grad_norm": 32.40359115600586, |
|
"learning_rate": 2.4268558951965064e-05, |
|
"loss": 6.4684, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.6375545851528384, |
|
"grad_norm": 47.73025131225586, |
|
"learning_rate": 2.3859170305676855e-05, |
|
"loss": 6.3165, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.7467248908296944, |
|
"grad_norm": 47.35511016845703, |
|
"learning_rate": 2.344978165938865e-05, |
|
"loss": 6.2866, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.8558951965065502, |
|
"grad_norm": 44.51765441894531, |
|
"learning_rate": 2.3040393013100437e-05, |
|
"loss": 6.3404, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.965065502183406, |
|
"grad_norm": 26.496959686279297, |
|
"learning_rate": 2.2631004366812227e-05, |
|
"loss": 6.1681, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_avg_mae": 6.067600250244141, |
|
"eval_loss": 6.067600250244141, |
|
"eval_mae_lex": 5.595421314239502, |
|
"eval_mae_sem": 4.1164045333862305, |
|
"eval_mae_syn": 8.490975379943848, |
|
"eval_runtime": 27.2193, |
|
"eval_samples_per_second": 269.184, |
|
"eval_steps_per_second": 8.413, |
|
"step": 1832 |
|
}, |
|
{ |
|
"epoch": 2.074235807860262, |
|
"grad_norm": 36.847415924072266, |
|
"learning_rate": 2.2221615720524018e-05, |
|
"loss": 5.9928, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.183406113537118, |
|
"grad_norm": 37.08506393432617, |
|
"learning_rate": 2.181222707423581e-05, |
|
"loss": 5.9648, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.2925764192139737, |
|
"grad_norm": 35.595909118652344, |
|
"learning_rate": 2.1402838427947596e-05, |
|
"loss": 5.8648, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.4017467248908297, |
|
"grad_norm": 23.82405662536621, |
|
"learning_rate": 2.099344978165939e-05, |
|
"loss": 5.9043, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.5109170305676853, |
|
"grad_norm": 30.872852325439453, |
|
"learning_rate": 2.058406113537118e-05, |
|
"loss": 5.8428, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.6200873362445414, |
|
"grad_norm": 42.079261779785156, |
|
"learning_rate": 2.0174672489082972e-05, |
|
"loss": 5.8529, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.7292576419213974, |
|
"grad_norm": 23.549190521240234, |
|
"learning_rate": 1.976528384279476e-05, |
|
"loss": 5.8328, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.8384279475982535, |
|
"grad_norm": 32.223079681396484, |
|
"learning_rate": 1.935589519650655e-05, |
|
"loss": 5.8484, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.947598253275109, |
|
"grad_norm": 25.67125129699707, |
|
"learning_rate": 1.894650655021834e-05, |
|
"loss": 5.5861, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_avg_mae": 5.645811716715495, |
|
"eval_loss": 5.645811080932617, |
|
"eval_mae_lex": 4.994715213775635, |
|
"eval_mae_sem": 3.6993861198425293, |
|
"eval_mae_syn": 8.24333381652832, |
|
"eval_runtime": 27.1472, |
|
"eval_samples_per_second": 269.899, |
|
"eval_steps_per_second": 8.435, |
|
"step": 2748 |
|
}, |
|
{ |
|
"epoch": 3.056768558951965, |
|
"grad_norm": 31.40544319152832, |
|
"learning_rate": 1.8537117903930135e-05, |
|
"loss": 5.4937, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.165938864628821, |
|
"grad_norm": 28.67197608947754, |
|
"learning_rate": 1.8127729257641922e-05, |
|
"loss": 5.5573, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 3.2751091703056767, |
|
"grad_norm": 26.671180725097656, |
|
"learning_rate": 1.7718340611353713e-05, |
|
"loss": 5.6147, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.3842794759825328, |
|
"grad_norm": 32.73609924316406, |
|
"learning_rate": 1.7308951965065504e-05, |
|
"loss": 5.2704, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 3.493449781659389, |
|
"grad_norm": 26.268295288085938, |
|
"learning_rate": 1.689956331877729e-05, |
|
"loss": 5.4946, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.6026200873362444, |
|
"grad_norm": 24.3873233795166, |
|
"learning_rate": 1.649017467248908e-05, |
|
"loss": 5.5757, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3.7117903930131004, |
|
"grad_norm": 26.872316360473633, |
|
"learning_rate": 1.6080786026200872e-05, |
|
"loss": 5.3305, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.8209606986899565, |
|
"grad_norm": 31.78321647644043, |
|
"learning_rate": 1.5671397379912666e-05, |
|
"loss": 5.4091, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.930131004366812, |
|
"grad_norm": 37.95060729980469, |
|
"learning_rate": 1.5262008733624454e-05, |
|
"loss": 5.464, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_avg_mae": 5.841625213623047, |
|
"eval_loss": 5.841624736785889, |
|
"eval_mae_lex": 5.572142124176025, |
|
"eval_mae_sem": 3.7872631549835205, |
|
"eval_mae_syn": 8.1654691696167, |
|
"eval_runtime": 27.1382, |
|
"eval_samples_per_second": 269.988, |
|
"eval_steps_per_second": 8.438, |
|
"step": 3664 |
|
}, |
|
{ |
|
"epoch": 4.039301310043668, |
|
"grad_norm": 37.76191329956055, |
|
"learning_rate": 1.4852620087336245e-05, |
|
"loss": 5.4072, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 4.148471615720524, |
|
"grad_norm": 33.29827117919922, |
|
"learning_rate": 1.4443231441048035e-05, |
|
"loss": 5.0985, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 4.25764192139738, |
|
"grad_norm": 47.478206634521484, |
|
"learning_rate": 1.4033842794759826e-05, |
|
"loss": 5.1541, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 4.366812227074236, |
|
"grad_norm": 31.66642189025879, |
|
"learning_rate": 1.3624454148471617e-05, |
|
"loss": 5.106, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 4.475982532751091, |
|
"grad_norm": 27.389015197753906, |
|
"learning_rate": 1.3215065502183406e-05, |
|
"loss": 5.1793, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 4.585152838427947, |
|
"grad_norm": 26.702226638793945, |
|
"learning_rate": 1.2805676855895198e-05, |
|
"loss": 5.0975, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 4.6943231441048034, |
|
"grad_norm": 31.537691116333008, |
|
"learning_rate": 1.2396288209606987e-05, |
|
"loss": 4.9613, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 4.8034934497816595, |
|
"grad_norm": 26.946945190429688, |
|
"learning_rate": 1.1986899563318778e-05, |
|
"loss": 5.1799, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 4.9126637554585155, |
|
"grad_norm": 27.92361068725586, |
|
"learning_rate": 1.1577510917030569e-05, |
|
"loss": 5.0393, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_avg_mae": 5.481770197550456, |
|
"eval_loss": 5.481770038604736, |
|
"eval_mae_lex": 4.809901714324951, |
|
"eval_mae_sem": 3.8779022693634033, |
|
"eval_mae_syn": 7.757506370544434, |
|
"eval_runtime": 27.0994, |
|
"eval_samples_per_second": 270.375, |
|
"eval_steps_per_second": 8.45, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 5.021834061135372, |
|
"grad_norm": 25.387714385986328, |
|
"learning_rate": 1.1168122270742358e-05, |
|
"loss": 5.0585, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 5.131004366812227, |
|
"grad_norm": 33.1529655456543, |
|
"learning_rate": 1.0758733624454149e-05, |
|
"loss": 4.774, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 5.240174672489083, |
|
"grad_norm": 31.700637817382812, |
|
"learning_rate": 1.034934497816594e-05, |
|
"loss": 4.762, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 5.349344978165939, |
|
"grad_norm": 34.32217025756836, |
|
"learning_rate": 9.93995633187773e-06, |
|
"loss": 4.8645, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 5.458515283842795, |
|
"grad_norm": 52.338130950927734, |
|
"learning_rate": 9.530567685589519e-06, |
|
"loss": 4.9913, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 5.567685589519651, |
|
"grad_norm": 27.761211395263672, |
|
"learning_rate": 9.12117903930131e-06, |
|
"loss": 4.9047, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 5.676855895196507, |
|
"grad_norm": 36.54159164428711, |
|
"learning_rate": 8.7117903930131e-06, |
|
"loss": 4.7824, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 5.786026200873362, |
|
"grad_norm": 31.954957962036133, |
|
"learning_rate": 8.302401746724891e-06, |
|
"loss": 4.7555, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 5.895196506550218, |
|
"grad_norm": 33.35627365112305, |
|
"learning_rate": 7.89301310043668e-06, |
|
"loss": 4.8389, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_avg_mae": 5.272278467814128, |
|
"eval_loss": 5.272278308868408, |
|
"eval_mae_lex": 4.76518440246582, |
|
"eval_mae_sem": 3.384120225906372, |
|
"eval_mae_syn": 7.6675310134887695, |
|
"eval_runtime": 27.0821, |
|
"eval_samples_per_second": 270.548, |
|
"eval_steps_per_second": 8.456, |
|
"step": 5496 |
|
}, |
|
{ |
|
"epoch": 6.004366812227074, |
|
"grad_norm": 29.006881713867188, |
|
"learning_rate": 7.483624454148472e-06, |
|
"loss": 4.8767, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 6.11353711790393, |
|
"grad_norm": 29.40102195739746, |
|
"learning_rate": 7.074235807860262e-06, |
|
"loss": 4.6832, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 6.222707423580786, |
|
"grad_norm": 32.63214111328125, |
|
"learning_rate": 6.664847161572053e-06, |
|
"loss": 4.6676, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 6.331877729257642, |
|
"grad_norm": 32.83290481567383, |
|
"learning_rate": 6.2554585152838425e-06, |
|
"loss": 4.6235, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 6.441048034934497, |
|
"grad_norm": 36.680030822753906, |
|
"learning_rate": 5.846069868995633e-06, |
|
"loss": 4.4681, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 6.550218340611353, |
|
"grad_norm": 37.740535736083984, |
|
"learning_rate": 5.436681222707424e-06, |
|
"loss": 4.518, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 6.6593886462882095, |
|
"grad_norm": 33.8775749206543, |
|
"learning_rate": 5.027292576419214e-06, |
|
"loss": 4.6268, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 6.7685589519650655, |
|
"grad_norm": 50.033729553222656, |
|
"learning_rate": 4.617903930131005e-06, |
|
"loss": 4.6026, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 6.877729257641922, |
|
"grad_norm": 34.454341888427734, |
|
"learning_rate": 4.208515283842795e-06, |
|
"loss": 4.618, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 6.986899563318778, |
|
"grad_norm": 41.41761779785156, |
|
"learning_rate": 3.799126637554585e-06, |
|
"loss": 4.6299, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_avg_mae": 5.449769337972005, |
|
"eval_loss": 5.449769496917725, |
|
"eval_mae_lex": 4.877564430236816, |
|
"eval_mae_sem": 3.8569977283477783, |
|
"eval_mae_syn": 7.614744663238525, |
|
"eval_runtime": 27.0562, |
|
"eval_samples_per_second": 270.806, |
|
"eval_steps_per_second": 8.464, |
|
"step": 6412 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 7328, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 8, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3494363633370368e+16, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|