|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 6.0, |
|
"eval_steps": 2492, |
|
"global_step": 74736, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.040141297366730895, |
|
"grad_norm": 1.3413853645324707, |
|
"learning_rate": 0.00019893081086874966, |
|
"loss": 0.8055, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.08028259473346179, |
|
"grad_norm": 1.8721694946289062, |
|
"learning_rate": 0.00019759365453668473, |
|
"loss": 0.5632, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.12042389210019268, |
|
"grad_norm": 1.127410650253296, |
|
"learning_rate": 0.0001962538185326116, |
|
"loss": 0.5087, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.16056518946692358, |
|
"grad_norm": 1.7598766088485718, |
|
"learning_rate": 0.00019491398252853852, |
|
"loss": 0.4601, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.20006422607578678, |
|
"eval_loss": 0.37580204010009766, |
|
"eval_runtime": 9.2262, |
|
"eval_samples_per_second": 10.839, |
|
"eval_steps_per_second": 5.419, |
|
"step": 2492 |
|
}, |
|
{ |
|
"epoch": 0.20070648683365447, |
|
"grad_norm": 1.5294134616851807, |
|
"learning_rate": 0.00019357414652446543, |
|
"loss": 0.4423, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.24084778420038536, |
|
"grad_norm": 2.4775636196136475, |
|
"learning_rate": 0.00019223699019240047, |
|
"loss": 0.4201, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.2809890815671163, |
|
"grad_norm": 1.5586802959442139, |
|
"learning_rate": 0.00019089715418832737, |
|
"loss": 0.3942, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.32113037893384716, |
|
"grad_norm": 2.8762762546539307, |
|
"learning_rate": 0.00018955731818425425, |
|
"loss": 0.3846, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.36127167630057805, |
|
"grad_norm": 1.1510844230651855, |
|
"learning_rate": 0.00018821748218018116, |
|
"loss": 0.369, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.40012845215157355, |
|
"eval_loss": 0.3327239751815796, |
|
"eval_runtime": 9.2684, |
|
"eval_samples_per_second": 10.789, |
|
"eval_steps_per_second": 5.395, |
|
"step": 4984 |
|
}, |
|
{ |
|
"epoch": 0.40141297366730894, |
|
"grad_norm": 2.390737771987915, |
|
"learning_rate": 0.00018687764617610804, |
|
"loss": 0.3662, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.4415542710340398, |
|
"grad_norm": 1.9558954238891602, |
|
"learning_rate": 0.00018553781017203495, |
|
"loss": 0.3516, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.4816955684007707, |
|
"grad_norm": 1.7609800100326538, |
|
"learning_rate": 0.00018419797416796186, |
|
"loss": 0.3399, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.5218368657675017, |
|
"grad_norm": 2.026273488998413, |
|
"learning_rate": 0.00018285813816388874, |
|
"loss": 0.3374, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.5619781631342325, |
|
"grad_norm": 1.3401412963867188, |
|
"learning_rate": 0.0001815209818318238, |
|
"loss": 0.327, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.6001926782273603, |
|
"eval_loss": 0.2893865406513214, |
|
"eval_runtime": 9.2199, |
|
"eval_samples_per_second": 10.846, |
|
"eval_steps_per_second": 5.423, |
|
"step": 7476 |
|
}, |
|
{ |
|
"epoch": 0.6021194605009634, |
|
"grad_norm": 2.0142829418182373, |
|
"learning_rate": 0.0001801811458277507, |
|
"loss": 0.3291, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.6422607578676943, |
|
"grad_norm": 0.9912647604942322, |
|
"learning_rate": 0.0001788413098236776, |
|
"loss": 0.3121, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.6824020552344252, |
|
"grad_norm": 2.552272319793701, |
|
"learning_rate": 0.00017750415349161264, |
|
"loss": 0.3117, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.7225433526011561, |
|
"grad_norm": 2.693617582321167, |
|
"learning_rate": 0.00017616431748753954, |
|
"loss": 0.3069, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.762684649967887, |
|
"grad_norm": 3.3755452632904053, |
|
"learning_rate": 0.00017482448148346645, |
|
"loss": 0.2988, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.8002569043031471, |
|
"eval_loss": 0.28478091955184937, |
|
"eval_runtime": 9.2844, |
|
"eval_samples_per_second": 10.771, |
|
"eval_steps_per_second": 5.385, |
|
"step": 9968 |
|
}, |
|
{ |
|
"epoch": 0.8028259473346179, |
|
"grad_norm": 1.348958134651184, |
|
"learning_rate": 0.00017348464547939333, |
|
"loss": 0.297, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.8429672447013488, |
|
"grad_norm": 1.8039051294326782, |
|
"learning_rate": 0.00017214480947532024, |
|
"loss": 0.3044, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.8831085420680796, |
|
"grad_norm": 1.7400553226470947, |
|
"learning_rate": 0.00017080497347124712, |
|
"loss": 0.2938, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.9232498394348105, |
|
"grad_norm": 1.4909226894378662, |
|
"learning_rate": 0.00016946513746717403, |
|
"loss": 0.2785, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.9633911368015414, |
|
"grad_norm": 2.5195188522338867, |
|
"learning_rate": 0.00016812530146310094, |
|
"loss": 0.2873, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.0003211303789339, |
|
"eval_loss": 0.2697654068470001, |
|
"eval_runtime": 9.4165, |
|
"eval_samples_per_second": 10.62, |
|
"eval_steps_per_second": 5.31, |
|
"step": 12460 |
|
}, |
|
{ |
|
"epoch": 1.0035324341682723, |
|
"grad_norm": 2.530623435974121, |
|
"learning_rate": 0.00016678546545902782, |
|
"loss": 0.2723, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.0436737315350033, |
|
"grad_norm": 1.2821416854858398, |
|
"learning_rate": 0.00016544830912696288, |
|
"loss": 0.2632, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.083815028901734, |
|
"grad_norm": 1.7591134309768677, |
|
"learning_rate": 0.00016410847312288976, |
|
"loss": 0.2633, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.123956326268465, |
|
"grad_norm": 1.4008091688156128, |
|
"learning_rate": 0.00016276863711881667, |
|
"loss": 0.245, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.1640976236351959, |
|
"grad_norm": 1.567612886428833, |
|
"learning_rate": 0.00016142880111474355, |
|
"loss": 0.2487, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.2003853564547207, |
|
"eval_loss": 0.26689666509628296, |
|
"eval_runtime": 9.3095, |
|
"eval_samples_per_second": 10.742, |
|
"eval_steps_per_second": 5.371, |
|
"step": 14952 |
|
}, |
|
{ |
|
"epoch": 1.2042389210019269, |
|
"grad_norm": 1.9705049991607666, |
|
"learning_rate": 0.00016008896511067046, |
|
"loss": 0.247, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.2443802183686576, |
|
"grad_norm": 1.31686270236969, |
|
"learning_rate": 0.00015875180877860553, |
|
"loss": 0.2459, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.2845215157353886, |
|
"grad_norm": 1.3225802183151245, |
|
"learning_rate": 0.0001574119727745324, |
|
"loss": 0.253, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.3246628131021194, |
|
"grad_norm": 1.7328706979751587, |
|
"learning_rate": 0.00015607213677045932, |
|
"loss": 0.2404, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.3648041104688504, |
|
"grad_norm": 1.6794183254241943, |
|
"learning_rate": 0.00015473498043839433, |
|
"loss": 0.2371, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.4004495825305074, |
|
"eval_loss": 0.24297775328159332, |
|
"eval_runtime": 9.4719, |
|
"eval_samples_per_second": 10.558, |
|
"eval_steps_per_second": 5.279, |
|
"step": 17444 |
|
}, |
|
{ |
|
"epoch": 1.4049454078355812, |
|
"grad_norm": 1.2241907119750977, |
|
"learning_rate": 0.00015339514443432124, |
|
"loss": 0.2407, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.4450867052023122, |
|
"grad_norm": 2.555837631225586, |
|
"learning_rate": 0.00015205530843024815, |
|
"loss": 0.2344, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.485228002569043, |
|
"grad_norm": 2.1652560234069824, |
|
"learning_rate": 0.00015071547242617505, |
|
"loss": 0.2404, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.525369299935774, |
|
"grad_norm": 3.3938980102539062, |
|
"learning_rate": 0.00014937563642210196, |
|
"loss": 0.2383, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.565510597302505, |
|
"grad_norm": 2.078359365463257, |
|
"learning_rate": 0.00014803580041802884, |
|
"loss": 0.2324, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.6005138086062942, |
|
"eval_loss": 0.23732736706733704, |
|
"eval_runtime": 9.3828, |
|
"eval_samples_per_second": 10.658, |
|
"eval_steps_per_second": 5.329, |
|
"step": 19936 |
|
}, |
|
{ |
|
"epoch": 1.6056518946692357, |
|
"grad_norm": 2.344526767730713, |
|
"learning_rate": 0.00014669596441395575, |
|
"loss": 0.2337, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.6457931920359665, |
|
"grad_norm": 3.3463504314422607, |
|
"learning_rate": 0.00014535612840988263, |
|
"loss": 0.2269, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.6859344894026975, |
|
"grad_norm": 1.8101106882095337, |
|
"learning_rate": 0.00014401629240580954, |
|
"loss": 0.2281, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.7260757867694285, |
|
"grad_norm": 2.2214066982269287, |
|
"learning_rate": 0.00014267645640173645, |
|
"loss": 0.2299, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.7662170841361593, |
|
"grad_norm": 3.5231621265411377, |
|
"learning_rate": 0.00014133930006967149, |
|
"loss": 0.2278, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.800578034682081, |
|
"eval_loss": 0.23720289766788483, |
|
"eval_runtime": 9.3853, |
|
"eval_samples_per_second": 10.655, |
|
"eval_steps_per_second": 5.327, |
|
"step": 22428 |
|
}, |
|
{ |
|
"epoch": 1.80635838150289, |
|
"grad_norm": 1.503125548362732, |
|
"learning_rate": 0.0001399994640655984, |
|
"loss": 0.2286, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.846499678869621, |
|
"grad_norm": 1.3709886074066162, |
|
"learning_rate": 0.00013865962806152527, |
|
"loss": 0.2204, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.886640976236352, |
|
"grad_norm": 0.8027909398078918, |
|
"learning_rate": 0.00013731979205745218, |
|
"loss": 0.2122, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.9267822736030829, |
|
"grad_norm": 1.0423333644866943, |
|
"learning_rate": 0.0001359826357253872, |
|
"loss": 0.2209, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.9669235709698136, |
|
"grad_norm": 3.140822172164917, |
|
"learning_rate": 0.00013464547939332226, |
|
"loss": 0.2212, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 2.0006422607578678, |
|
"eval_loss": 0.21895286440849304, |
|
"eval_runtime": 9.4416, |
|
"eval_samples_per_second": 10.591, |
|
"eval_steps_per_second": 5.296, |
|
"step": 24920 |
|
}, |
|
{ |
|
"epoch": 2.0070648683365446, |
|
"grad_norm": 2.0828654766082764, |
|
"learning_rate": 0.00013330564338924917, |
|
"loss": 0.2157, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.0472061657032756, |
|
"grad_norm": 1.9575061798095703, |
|
"learning_rate": 0.00013196580738517605, |
|
"loss": 0.1942, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 2.0873474630700066, |
|
"grad_norm": 2.263435125350952, |
|
"learning_rate": 0.00013062597138110296, |
|
"loss": 0.1939, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 2.127488760436737, |
|
"grad_norm": 1.9007372856140137, |
|
"learning_rate": 0.00012928613537702984, |
|
"loss": 0.1964, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 2.167630057803468, |
|
"grad_norm": 1.8549082279205322, |
|
"learning_rate": 0.00012794629937295675, |
|
"loss": 0.1991, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.2007064868336546, |
|
"eval_loss": 0.21308277547359467, |
|
"eval_runtime": 9.4114, |
|
"eval_samples_per_second": 10.625, |
|
"eval_steps_per_second": 5.313, |
|
"step": 27412 |
|
}, |
|
{ |
|
"epoch": 2.207771355170199, |
|
"grad_norm": 1.3462762832641602, |
|
"learning_rate": 0.00012660646336888366, |
|
"loss": 0.1923, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 2.24791265253693, |
|
"grad_norm": 1.3775655031204224, |
|
"learning_rate": 0.00012526662736481056, |
|
"loss": 0.1956, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 2.2880539499036607, |
|
"grad_norm": 3.3470311164855957, |
|
"learning_rate": 0.0001239294710327456, |
|
"loss": 0.188, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 2.3281952472703917, |
|
"grad_norm": 2.1436495780944824, |
|
"learning_rate": 0.00012259231470068064, |
|
"loss": 0.1963, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 2.3683365446371227, |
|
"grad_norm": 1.8383592367172241, |
|
"learning_rate": 0.00012125247869660754, |
|
"loss": 0.1854, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 2.4007707129094413, |
|
"eval_loss": 0.20915324985980988, |
|
"eval_runtime": 9.2953, |
|
"eval_samples_per_second": 10.758, |
|
"eval_steps_per_second": 5.379, |
|
"step": 29904 |
|
}, |
|
{ |
|
"epoch": 2.4084778420038537, |
|
"grad_norm": 1.6092569828033447, |
|
"learning_rate": 0.00011991264269253443, |
|
"loss": 0.1949, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 2.4486191393705843, |
|
"grad_norm": 1.8148771524429321, |
|
"learning_rate": 0.00011857280668846134, |
|
"loss": 0.1902, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 2.4887604367373153, |
|
"grad_norm": 2.0251662731170654, |
|
"learning_rate": 0.00011723297068438823, |
|
"loss": 0.1921, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 2.5289017341040463, |
|
"grad_norm": 4.4525861740112305, |
|
"learning_rate": 0.00011589313468031513, |
|
"loss": 0.1972, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 2.5690430314707773, |
|
"grad_norm": 2.327648878097534, |
|
"learning_rate": 0.00011455329867624202, |
|
"loss": 0.1769, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 2.600834938985228, |
|
"eval_loss": 0.20336776971817017, |
|
"eval_runtime": 9.3362, |
|
"eval_samples_per_second": 10.711, |
|
"eval_steps_per_second": 5.356, |
|
"step": 32396 |
|
}, |
|
{ |
|
"epoch": 2.6091843288375083, |
|
"grad_norm": 1.7865198850631714, |
|
"learning_rate": 0.00011321346267216893, |
|
"loss": 0.1888, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 2.649325626204239, |
|
"grad_norm": 2.9860942363739014, |
|
"learning_rate": 0.00011187362666809582, |
|
"loss": 0.1948, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 2.68946692357097, |
|
"grad_norm": 1.5154398679733276, |
|
"learning_rate": 0.00011053647033603088, |
|
"loss": 0.1936, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 2.729608220937701, |
|
"grad_norm": 1.3628119230270386, |
|
"learning_rate": 0.00010919663433195777, |
|
"loss": 0.1826, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 2.7697495183044314, |
|
"grad_norm": 1.1184334754943848, |
|
"learning_rate": 0.00010785679832788467, |
|
"loss": 0.1818, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 2.800899165061015, |
|
"eval_loss": 0.19036497175693512, |
|
"eval_runtime": 9.2882, |
|
"eval_samples_per_second": 10.766, |
|
"eval_steps_per_second": 5.383, |
|
"step": 34888 |
|
}, |
|
{ |
|
"epoch": 2.8098908156711624, |
|
"grad_norm": 3.3239927291870117, |
|
"learning_rate": 0.00010651696232381156, |
|
"loss": 0.1738, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 2.8500321130378934, |
|
"grad_norm": 1.8970750570297241, |
|
"learning_rate": 0.00010517712631973846, |
|
"loss": 0.1848, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 2.8901734104046244, |
|
"grad_norm": 2.5936954021453857, |
|
"learning_rate": 0.00010383996998767351, |
|
"loss": 0.1834, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 2.9303147077713554, |
|
"grad_norm": 4.450264930725098, |
|
"learning_rate": 0.00010250013398360042, |
|
"loss": 0.1768, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 2.970456005138086, |
|
"grad_norm": 2.1718015670776367, |
|
"learning_rate": 0.00010116029797952731, |
|
"loss": 0.1798, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 3.0009633911368017, |
|
"eval_loss": 0.19798487424850464, |
|
"eval_runtime": 9.4331, |
|
"eval_samples_per_second": 10.601, |
|
"eval_steps_per_second": 5.3, |
|
"step": 37380 |
|
}, |
|
{ |
|
"epoch": 3.010597302504817, |
|
"grad_norm": 1.2895421981811523, |
|
"learning_rate": 9.98204619754542e-05, |
|
"loss": 0.1679, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 3.050738599871548, |
|
"grad_norm": 2.5621845722198486, |
|
"learning_rate": 9.848330564338926e-05, |
|
"loss": 0.1556, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 3.090879897238279, |
|
"grad_norm": 4.237947940826416, |
|
"learning_rate": 9.714346963931615e-05, |
|
"loss": 0.1627, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 3.1310211946050095, |
|
"grad_norm": 0.6110877990722656, |
|
"learning_rate": 9.580363363524305e-05, |
|
"loss": 0.1577, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 3.1711624919717405, |
|
"grad_norm": 1.6049071550369263, |
|
"learning_rate": 9.446379763116994e-05, |
|
"loss": 0.1656, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 3.2010276172125884, |
|
"eval_loss": 0.18195098638534546, |
|
"eval_runtime": 9.445, |
|
"eval_samples_per_second": 10.588, |
|
"eval_steps_per_second": 5.294, |
|
"step": 39872 |
|
}, |
|
{ |
|
"epoch": 3.2113037893384715, |
|
"grad_norm": 1.347450852394104, |
|
"learning_rate": 9.3126641299105e-05, |
|
"loss": 0.1523, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 3.2514450867052025, |
|
"grad_norm": 2.5243570804595947, |
|
"learning_rate": 9.17868052950319e-05, |
|
"loss": 0.1549, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 3.291586384071933, |
|
"grad_norm": 1.4282008409500122, |
|
"learning_rate": 9.044964896296694e-05, |
|
"loss": 0.1624, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 3.331727681438664, |
|
"grad_norm": 1.0618197917938232, |
|
"learning_rate": 8.910981295889384e-05, |
|
"loss": 0.1506, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 3.371868978805395, |
|
"grad_norm": 2.2008793354034424, |
|
"learning_rate": 8.776997695482073e-05, |
|
"loss": 0.1533, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 3.401091843288375, |
|
"eval_loss": 0.1724245399236679, |
|
"eval_runtime": 9.5166, |
|
"eval_samples_per_second": 10.508, |
|
"eval_steps_per_second": 5.254, |
|
"step": 42364 |
|
}, |
|
{ |
|
"epoch": 3.412010276172126, |
|
"grad_norm": 2.9689488410949707, |
|
"learning_rate": 8.643014095074763e-05, |
|
"loss": 0.1587, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 3.4521515735388566, |
|
"grad_norm": 2.200533628463745, |
|
"learning_rate": 8.509030494667452e-05, |
|
"loss": 0.1521, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 3.4922928709055876, |
|
"grad_norm": 2.433441162109375, |
|
"learning_rate": 8.375046894260143e-05, |
|
"loss": 0.1559, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 3.5324341682723186, |
|
"grad_norm": 1.3238328695297241, |
|
"learning_rate": 8.241063293852834e-05, |
|
"loss": 0.1513, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 3.5725754656390496, |
|
"grad_norm": 1.2658567428588867, |
|
"learning_rate": 8.107079693445523e-05, |
|
"loss": 0.1557, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 3.601156069364162, |
|
"eval_loss": 0.1686776727437973, |
|
"eval_runtime": 9.5131, |
|
"eval_samples_per_second": 10.512, |
|
"eval_steps_per_second": 5.256, |
|
"step": 44856 |
|
}, |
|
{ |
|
"epoch": 3.61271676300578, |
|
"grad_norm": 0.9541091918945312, |
|
"learning_rate": 7.973096093038212e-05, |
|
"loss": 0.1538, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 3.652858060372511, |
|
"grad_norm": 2.3008804321289062, |
|
"learning_rate": 7.839112492630902e-05, |
|
"loss": 0.1595, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 3.692999357739242, |
|
"grad_norm": 0.9546318054199219, |
|
"learning_rate": 7.705128892223593e-05, |
|
"loss": 0.1511, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 3.733140655105973, |
|
"grad_norm": 2.0182559490203857, |
|
"learning_rate": 7.571413259017097e-05, |
|
"loss": 0.1524, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 3.773281952472704, |
|
"grad_norm": 3.313910484313965, |
|
"learning_rate": 7.437429658609787e-05, |
|
"loss": 0.1536, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 3.8012202954399488, |
|
"eval_loss": 0.17214125394821167, |
|
"eval_runtime": 9.4988, |
|
"eval_samples_per_second": 10.528, |
|
"eval_steps_per_second": 5.264, |
|
"step": 47348 |
|
}, |
|
{ |
|
"epoch": 3.8134232498394347, |
|
"grad_norm": 1.0791243314743042, |
|
"learning_rate": 7.303446058202477e-05, |
|
"loss": 0.1559, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 3.8535645472061657, |
|
"grad_norm": 1.7102487087249756, |
|
"learning_rate": 7.169462457795166e-05, |
|
"loss": 0.1558, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 3.8937058445728967, |
|
"grad_norm": 1.6259522438049316, |
|
"learning_rate": 7.03574682458867e-05, |
|
"loss": 0.1519, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 3.9338471419396273, |
|
"grad_norm": 1.94523286819458, |
|
"learning_rate": 6.90176322418136e-05, |
|
"loss": 0.1498, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 3.9739884393063583, |
|
"grad_norm": 1.2504680156707764, |
|
"learning_rate": 6.768047590974865e-05, |
|
"loss": 0.1531, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 4.0012845215157355, |
|
"eval_loss": 0.1633564978837967, |
|
"eval_runtime": 9.4116, |
|
"eval_samples_per_second": 10.625, |
|
"eval_steps_per_second": 5.313, |
|
"step": 49840 |
|
}, |
|
{ |
|
"epoch": 4.014129736673089, |
|
"grad_norm": 2.535567283630371, |
|
"learning_rate": 6.634063990567555e-05, |
|
"loss": 0.1421, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 4.05427103403982, |
|
"grad_norm": 2.8016157150268555, |
|
"learning_rate": 6.500348357361059e-05, |
|
"loss": 0.1304, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 4.094412331406551, |
|
"grad_norm": 3.923179864883423, |
|
"learning_rate": 6.366364756953749e-05, |
|
"loss": 0.1222, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 4.134553628773282, |
|
"grad_norm": 4.409369945526123, |
|
"learning_rate": 6.232381156546439e-05, |
|
"loss": 0.1344, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 4.174694926140013, |
|
"grad_norm": 1.9566134214401245, |
|
"learning_rate": 6.0983975561391295e-05, |
|
"loss": 0.1256, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 4.201348747591522, |
|
"eval_loss": 0.15292324125766754, |
|
"eval_runtime": 9.4583, |
|
"eval_samples_per_second": 10.573, |
|
"eval_steps_per_second": 5.286, |
|
"step": 52332 |
|
}, |
|
{ |
|
"epoch": 4.214836223506744, |
|
"grad_norm": 1.6278679370880127, |
|
"learning_rate": 5.964413955731819e-05, |
|
"loss": 0.1274, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 4.254977520873474, |
|
"grad_norm": 0.310170441865921, |
|
"learning_rate": 5.830430355324509e-05, |
|
"loss": 0.1277, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 4.295118818240206, |
|
"grad_norm": 2.6923305988311768, |
|
"learning_rate": 5.6964467549171985e-05, |
|
"loss": 0.1284, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 4.335260115606936, |
|
"grad_norm": 2.4311087131500244, |
|
"learning_rate": 5.562463154509888e-05, |
|
"loss": 0.1274, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 4.375401412973667, |
|
"grad_norm": 0.18654285371303558, |
|
"learning_rate": 5.4287475213033926e-05, |
|
"loss": 0.1274, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 4.401412973667309, |
|
"eval_loss": 0.14706894755363464, |
|
"eval_runtime": 9.5362, |
|
"eval_samples_per_second": 10.486, |
|
"eval_steps_per_second": 5.243, |
|
"step": 54824 |
|
}, |
|
{ |
|
"epoch": 4.415542710340398, |
|
"grad_norm": 4.156493663787842, |
|
"learning_rate": 5.2947639208960834e-05, |
|
"loss": 0.1257, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 4.455684007707129, |
|
"grad_norm": 1.4045820236206055, |
|
"learning_rate": 5.160780320488773e-05, |
|
"loss": 0.1302, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 4.49582530507386, |
|
"grad_norm": 0.5079956650733948, |
|
"learning_rate": 5.026796720081462e-05, |
|
"loss": 0.1302, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 4.535966602440591, |
|
"grad_norm": 2.671663522720337, |
|
"learning_rate": 4.892813119674152e-05, |
|
"loss": 0.1319, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 4.5761078998073215, |
|
"grad_norm": 2.7671585083007812, |
|
"learning_rate": 4.758829519266842e-05, |
|
"loss": 0.1294, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 4.601477199743096, |
|
"eval_loss": 0.1432746946811676, |
|
"eval_runtime": 9.2082, |
|
"eval_samples_per_second": 10.86, |
|
"eval_steps_per_second": 5.43, |
|
"step": 57316 |
|
}, |
|
{ |
|
"epoch": 4.616249197174053, |
|
"grad_norm": 4.464232921600342, |
|
"learning_rate": 4.624845918859532e-05, |
|
"loss": 0.1236, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 4.6563904945407835, |
|
"grad_norm": 0.22100146114826202, |
|
"learning_rate": 4.490862318452222e-05, |
|
"loss": 0.1214, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 4.696531791907514, |
|
"grad_norm": 3.2810275554656982, |
|
"learning_rate": 4.357146685245726e-05, |
|
"loss": 0.1299, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 4.7366730892742455, |
|
"grad_norm": 4.431360721588135, |
|
"learning_rate": 4.2234310520392307e-05, |
|
"loss": 0.129, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 4.776814386640976, |
|
"grad_norm": 2.974076509475708, |
|
"learning_rate": 4.08944745163192e-05, |
|
"loss": 0.1209, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 4.801541425818883, |
|
"eval_loss": 0.13750909268856049, |
|
"eval_runtime": 9.2555, |
|
"eval_samples_per_second": 10.804, |
|
"eval_steps_per_second": 5.402, |
|
"step": 59808 |
|
}, |
|
{ |
|
"epoch": 4.8169556840077075, |
|
"grad_norm": 0.5916352272033691, |
|
"learning_rate": 3.95546385122461e-05, |
|
"loss": 0.1325, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 4.857096981374438, |
|
"grad_norm": 3.2611584663391113, |
|
"learning_rate": 3.8214802508173e-05, |
|
"loss": 0.126, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 4.897238278741169, |
|
"grad_norm": 2.499826192855835, |
|
"learning_rate": 3.68749665040999e-05, |
|
"loss": 0.127, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 4.9373795761079, |
|
"grad_norm": 3.263596773147583, |
|
"learning_rate": 3.55351305000268e-05, |
|
"loss": 0.1263, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 4.977520873474631, |
|
"grad_norm": 3.2504055500030518, |
|
"learning_rate": 3.419529449595369e-05, |
|
"loss": 0.1298, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 5.001605651894669, |
|
"eval_loss": 0.13082526624202728, |
|
"eval_runtime": 9.18, |
|
"eval_samples_per_second": 10.893, |
|
"eval_steps_per_second": 5.447, |
|
"step": 62300 |
|
}, |
|
{ |
|
"epoch": 5.017662170841362, |
|
"grad_norm": 1.3521040678024292, |
|
"learning_rate": 3.28554584918806e-05, |
|
"loss": 0.0973, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 5.057803468208093, |
|
"grad_norm": 1.273701786994934, |
|
"learning_rate": 3.151830215981564e-05, |
|
"loss": 0.1059, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 5.097944765574823, |
|
"grad_norm": 3.1900885105133057, |
|
"learning_rate": 3.017846615574254e-05, |
|
"loss": 0.1095, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 5.138086062941555, |
|
"grad_norm": 1.1862250566482544, |
|
"learning_rate": 2.883863015166944e-05, |
|
"loss": 0.1053, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 5.178227360308285, |
|
"grad_norm": 0.06329891085624695, |
|
"learning_rate": 2.7498794147596337e-05, |
|
"loss": 0.1064, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 5.201669877970456, |
|
"eval_loss": 0.12689721584320068, |
|
"eval_runtime": 9.2645, |
|
"eval_samples_per_second": 10.794, |
|
"eval_steps_per_second": 5.397, |
|
"step": 64792 |
|
}, |
|
{ |
|
"epoch": 5.218368657675016, |
|
"grad_norm": 1.3881759643554688, |
|
"learning_rate": 2.615895814352323e-05, |
|
"loss": 0.0983, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 5.258509955041747, |
|
"grad_norm": 2.040412664413452, |
|
"learning_rate": 2.4819122139450132e-05, |
|
"loss": 0.11, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 5.298651252408478, |
|
"grad_norm": 1.5292569398880005, |
|
"learning_rate": 2.348196580738518e-05, |
|
"loss": 0.1087, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 5.338792549775209, |
|
"grad_norm": 1.2713596820831299, |
|
"learning_rate": 2.2142129803312077e-05, |
|
"loss": 0.1082, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 5.37893384714194, |
|
"grad_norm": 1.5126820802688599, |
|
"learning_rate": 2.080497347124712e-05, |
|
"loss": 0.1063, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 5.401734104046243, |
|
"eval_loss": 0.12622660398483276, |
|
"eval_runtime": 9.2201, |
|
"eval_samples_per_second": 10.846, |
|
"eval_steps_per_second": 5.423, |
|
"step": 67284 |
|
}, |
|
{ |
|
"epoch": 5.41907514450867, |
|
"grad_norm": 1.634347915649414, |
|
"learning_rate": 1.9465137467174018e-05, |
|
"loss": 0.1076, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 5.459216441875402, |
|
"grad_norm": 1.4614862203598022, |
|
"learning_rate": 1.8125301463100915e-05, |
|
"loss": 0.1001, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 5.499357739242132, |
|
"grad_norm": 1.3177045583724976, |
|
"learning_rate": 1.6785465459027816e-05, |
|
"loss": 0.1041, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 5.539499036608863, |
|
"grad_norm": 2.3282182216644287, |
|
"learning_rate": 1.5445629454954714e-05, |
|
"loss": 0.1021, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 5.579640333975594, |
|
"grad_norm": 1.7429168224334717, |
|
"learning_rate": 1.4105793450881613e-05, |
|
"loss": 0.1036, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 5.60179833012203, |
|
"eval_loss": 0.12125935405492783, |
|
"eval_runtime": 9.2288, |
|
"eval_samples_per_second": 10.836, |
|
"eval_steps_per_second": 5.418, |
|
"step": 69776 |
|
}, |
|
{ |
|
"epoch": 5.619781631342325, |
|
"grad_norm": 0.19996996223926544, |
|
"learning_rate": 1.2765957446808511e-05, |
|
"loss": 0.098, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 5.659922928709056, |
|
"grad_norm": 3.154064893722534, |
|
"learning_rate": 1.142612144273541e-05, |
|
"loss": 0.101, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 5.700064226075787, |
|
"grad_norm": 0.9969759583473206, |
|
"learning_rate": 1.0086285438662308e-05, |
|
"loss": 0.1043, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 5.740205523442517, |
|
"grad_norm": 0.29482555389404297, |
|
"learning_rate": 8.749129106597353e-06, |
|
"loss": 0.0978, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 5.780346820809249, |
|
"grad_norm": 7.377176761627197, |
|
"learning_rate": 7.409293102524252e-06, |
|
"loss": 0.1084, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 5.8018625561978165, |
|
"eval_loss": 0.11656199395656586, |
|
"eval_runtime": 9.2458, |
|
"eval_samples_per_second": 10.816, |
|
"eval_steps_per_second": 5.408, |
|
"step": 72268 |
|
}, |
|
{ |
|
"epoch": 5.820488118175979, |
|
"grad_norm": 1.861075520515442, |
|
"learning_rate": 6.06945709845115e-06, |
|
"loss": 0.1045, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 5.860629415542711, |
|
"grad_norm": 3.3500561714172363, |
|
"learning_rate": 4.729621094378049e-06, |
|
"loss": 0.103, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 5.900770712909441, |
|
"grad_norm": 0.29247429966926575, |
|
"learning_rate": 3.3924647623130934e-06, |
|
"loss": 0.1022, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 5.940912010276172, |
|
"grad_norm": 2.5670969486236572, |
|
"learning_rate": 2.0526287582399915e-06, |
|
"loss": 0.1048, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 5.981053307642903, |
|
"grad_norm": 1.4825931787490845, |
|
"learning_rate": 7.1279275416689e-07, |
|
"loss": 0.1009, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"step": 74736, |
|
"total_flos": 1.1033260642249114e+18, |
|
"train_loss": 0.01737067021080218, |
|
"train_runtime": 11655.5312, |
|
"train_samples_per_second": 12.824, |
|
"train_steps_per_second": 6.412 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 74736, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1033260642249114e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|