|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.0, |
|
"eval_steps": 50000, |
|
"global_step": 928, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04310344827586207, |
|
"grad_norm": 5.7193193435668945, |
|
"learning_rate": 1.7857142857142859e-06, |
|
"loss": 0.7126, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08620689655172414, |
|
"grad_norm": 2.6962673664093018, |
|
"learning_rate": 3.5714285714285718e-06, |
|
"loss": 0.6662, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.12931034482758622, |
|
"grad_norm": 1.6143770217895508, |
|
"learning_rate": 4.999939076763487e-06, |
|
"loss": 0.5303, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1724137931034483, |
|
"grad_norm": 1.3705445528030396, |
|
"learning_rate": 4.997807075247147e-06, |
|
"loss": 0.453, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.21551724137931033, |
|
"grad_norm": 1.129184603691101, |
|
"learning_rate": 4.992631880567301e-06, |
|
"loss": 0.4192, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.25862068965517243, |
|
"grad_norm": 1.3925697803497314, |
|
"learning_rate": 4.984419797901491e-06, |
|
"loss": 0.418, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3017241379310345, |
|
"grad_norm": 1.4394880533218384, |
|
"learning_rate": 4.973180832407471e-06, |
|
"loss": 0.4118, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.3448275862068966, |
|
"grad_norm": 1.3788193464279175, |
|
"learning_rate": 4.958928677033465e-06, |
|
"loss": 0.4065, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.3879310344827586, |
|
"grad_norm": 1.1382204294204712, |
|
"learning_rate": 4.9416806958354206e-06, |
|
"loss": 0.3819, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.43103448275862066, |
|
"grad_norm": 1.081235647201538, |
|
"learning_rate": 4.921457902821578e-06, |
|
"loss": 0.399, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.47413793103448276, |
|
"grad_norm": 1.21293044090271, |
|
"learning_rate": 4.898284936350144e-06, |
|
"loss": 0.3962, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.5172413793103449, |
|
"grad_norm": 1.3453164100646973, |
|
"learning_rate": 4.8721900291112415e-06, |
|
"loss": 0.3818, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5603448275862069, |
|
"grad_norm": 1.2487531900405884, |
|
"learning_rate": 4.84320497372973e-06, |
|
"loss": 0.3901, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.603448275862069, |
|
"grad_norm": 1.1610265970230103, |
|
"learning_rate": 4.811365084030784e-06, |
|
"loss": 0.3867, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.646551724137931, |
|
"grad_norm": 1.0656601190567017, |
|
"learning_rate": 4.776709152015443e-06, |
|
"loss": 0.3882, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 1.2680625915527344, |
|
"learning_rate": 4.7392794005985324e-06, |
|
"loss": 0.3739, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.7327586206896551, |
|
"grad_norm": 1.3623387813568115, |
|
"learning_rate": 4.699121432166542e-06, |
|
"loss": 0.3733, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.7758620689655172, |
|
"grad_norm": 1.096368432044983, |
|
"learning_rate": 4.656284173018144e-06, |
|
"loss": 0.3625, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.8189655172413793, |
|
"grad_norm": 0.9958175420761108, |
|
"learning_rate": 4.610819813755038e-06, |
|
"loss": 0.3698, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.8620689655172413, |
|
"grad_norm": 1.023909091949463, |
|
"learning_rate": 4.562783745695738e-06, |
|
"loss": 0.3471, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.9051724137931034, |
|
"grad_norm": 1.061679482460022, |
|
"learning_rate": 4.512234493389785e-06, |
|
"loss": 0.3712, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.9482758620689655, |
|
"grad_norm": 1.3929497003555298, |
|
"learning_rate": 4.4592336433146e-06, |
|
"loss": 0.3651, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.9913793103448276, |
|
"grad_norm": 0.8806857466697693, |
|
"learning_rate": 4.403845768841842e-06, |
|
"loss": 0.3641, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.0344827586206897, |
|
"grad_norm": 0.9649218916893005, |
|
"learning_rate": 4.346138351564711e-06, |
|
"loss": 0.3057, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.0775862068965518, |
|
"grad_norm": 1.1328972578048706, |
|
"learning_rate": 4.286181699082008e-06, |
|
"loss": 0.3045, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.1206896551724137, |
|
"grad_norm": 1.1146079301834106, |
|
"learning_rate": 4.224048859339175e-06, |
|
"loss": 0.301, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.1637931034482758, |
|
"grad_norm": 1.1736148595809937, |
|
"learning_rate": 4.159815531630604e-06, |
|
"loss": 0.3236, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.206896551724138, |
|
"grad_norm": 1.1570653915405273, |
|
"learning_rate": 4.093559974371725e-06, |
|
"loss": 0.3078, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.335072636604309, |
|
"learning_rate": 4.02536290975317e-06, |
|
"loss": 0.3145, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.293103448275862, |
|
"grad_norm": 1.0565122365951538, |
|
"learning_rate": 3.955307425393224e-06, |
|
"loss": 0.3018, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.3362068965517242, |
|
"grad_norm": 1.3002219200134277, |
|
"learning_rate": 3.88347887310836e-06, |
|
"loss": 0.3215, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.3793103448275863, |
|
"grad_norm": 1.0199170112609863, |
|
"learning_rate": 3.8099647649251984e-06, |
|
"loss": 0.2973, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.4224137931034484, |
|
"grad_norm": 1.118922233581543, |
|
"learning_rate": 3.7348546664605777e-06, |
|
"loss": 0.3095, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.4655172413793103, |
|
"grad_norm": 1.080239176750183, |
|
"learning_rate": 3.658240087799655e-06, |
|
"loss": 0.3172, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.5086206896551724, |
|
"grad_norm": 0.9963559508323669, |
|
"learning_rate": 3.5802143720049565e-06, |
|
"loss": 0.2923, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.5517241379310345, |
|
"grad_norm": 1.1155935525894165, |
|
"learning_rate": 3.5008725813922383e-06, |
|
"loss": 0.3032, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.5948275862068966, |
|
"grad_norm": 1.4892568588256836, |
|
"learning_rate": 3.4203113817116955e-06, |
|
"loss": 0.3113, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.6379310344827587, |
|
"grad_norm": 1.1873013973236084, |
|
"learning_rate": 3.338628924375638e-06, |
|
"loss": 0.3122, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.6810344827586206, |
|
"grad_norm": 1.215104103088379, |
|
"learning_rate": 3.2559247268761117e-06, |
|
"loss": 0.2986, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.7241379310344827, |
|
"grad_norm": 1.2356464862823486, |
|
"learning_rate": 3.1722995515381644e-06, |
|
"loss": 0.2969, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.7672413793103448, |
|
"grad_norm": 1.217281699180603, |
|
"learning_rate": 3.087855282756475e-06, |
|
"loss": 0.294, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.8103448275862069, |
|
"grad_norm": 1.3647332191467285, |
|
"learning_rate": 3.002694802864912e-06, |
|
"loss": 0.2987, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.853448275862069, |
|
"grad_norm": 1.295482873916626, |
|
"learning_rate": 2.9169218667902562e-06, |
|
"loss": 0.3087, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.896551724137931, |
|
"grad_norm": 1.1709827184677124, |
|
"learning_rate": 2.8306409756428067e-06, |
|
"loss": 0.2945, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.9396551724137931, |
|
"grad_norm": 1.2571437358856201, |
|
"learning_rate": 2.743957249397874e-06, |
|
"loss": 0.3036, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.9827586206896552, |
|
"grad_norm": 1.7302639484405518, |
|
"learning_rate": 2.6569762988232838e-06, |
|
"loss": 0.3119, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.0258620689655173, |
|
"grad_norm": 0.9986317157745361, |
|
"learning_rate": 2.569804096808923e-06, |
|
"loss": 0.2789, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.0689655172413794, |
|
"grad_norm": 1.130508303642273, |
|
"learning_rate": 2.482546849255096e-06, |
|
"loss": 0.2435, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.1120689655172415, |
|
"grad_norm": 1.446877121925354, |
|
"learning_rate": 2.3953108656770018e-06, |
|
"loss": 0.2475, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.1551724137931036, |
|
"grad_norm": 1.60965096950531, |
|
"learning_rate": 2.3082024296829538e-06, |
|
"loss": 0.2384, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.1982758620689653, |
|
"grad_norm": 1.383519172668457, |
|
"learning_rate": 2.2213276694841866e-06, |
|
"loss": 0.2356, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.2413793103448274, |
|
"grad_norm": 1.3211714029312134, |
|
"learning_rate": 2.134792428593971e-06, |
|
"loss": 0.2466, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.2844827586206895, |
|
"grad_norm": 1.302563190460205, |
|
"learning_rate": 2.0487021368736002e-06, |
|
"loss": 0.2456, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.3275862068965516, |
|
"grad_norm": 1.30355703830719, |
|
"learning_rate": 1.963161682082342e-06, |
|
"loss": 0.2406, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.3706896551724137, |
|
"grad_norm": 1.1259819269180298, |
|
"learning_rate": 1.8782752820878636e-06, |
|
"loss": 0.2225, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.413793103448276, |
|
"grad_norm": 1.2397651672363281, |
|
"learning_rate": 1.7941463578928088e-06, |
|
"loss": 0.2312, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.456896551724138, |
|
"grad_norm": 1.3761606216430664, |
|
"learning_rate": 1.7108774076322443e-06, |
|
"loss": 0.2415, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.3666378259658813, |
|
"learning_rate": 1.6285698816954626e-06, |
|
"loss": 0.2487, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.543103448275862, |
|
"grad_norm": 1.183164119720459, |
|
"learning_rate": 1.547324059124315e-06, |
|
"loss": 0.2378, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.586206896551724, |
|
"grad_norm": 1.4499019384384155, |
|
"learning_rate": 1.467238925438646e-06, |
|
"loss": 0.2765, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.6293103448275863, |
|
"grad_norm": 1.447636604309082, |
|
"learning_rate": 1.388412052037682e-06, |
|
"loss": 0.2584, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.6724137931034484, |
|
"grad_norm": 1.449857473373413, |
|
"learning_rate": 1.3109394773243117e-06, |
|
"loss": 0.2323, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.7155172413793105, |
|
"grad_norm": 1.3404827117919922, |
|
"learning_rate": 1.234915589697091e-06, |
|
"loss": 0.2384, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.7586206896551726, |
|
"grad_norm": 1.3379237651824951, |
|
"learning_rate": 1.160433012552508e-06, |
|
"loss": 0.2382, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.8017241379310347, |
|
"grad_norm": 1.4709217548370361, |
|
"learning_rate": 1.0875824914376555e-06, |
|
"loss": 0.2303, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.844827586206897, |
|
"grad_norm": 1.3744317293167114, |
|
"learning_rate": 1.0164527834907468e-06, |
|
"loss": 0.2423, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.887931034482759, |
|
"grad_norm": 1.7406655550003052, |
|
"learning_rate": 9.471305493042243e-07, |
|
"loss": 0.2426, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.9310344827586206, |
|
"grad_norm": 1.6748679876327515, |
|
"learning_rate": 8.797002473421729e-07, |
|
"loss": 0.2494, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.9741379310344827, |
|
"grad_norm": 1.5766329765319824, |
|
"learning_rate": 8.142440310406923e-07, |
|
"loss": 0.2407, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 3.0172413793103448, |
|
"grad_norm": 1.2436530590057373, |
|
"learning_rate": 7.508416487165862e-07, |
|
"loss": 0.2254, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.060344827586207, |
|
"grad_norm": 1.3559637069702148, |
|
"learning_rate": 6.895703464063319e-07, |
|
"loss": 0.2015, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 3.103448275862069, |
|
"grad_norm": 1.3423281908035278, |
|
"learning_rate": 6.305047737536707e-07, |
|
"loss": 0.1952, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.146551724137931, |
|
"grad_norm": 1.4924376010894775, |
|
"learning_rate": 5.737168930605272e-07, |
|
"loss": 0.1967, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 3.189655172413793, |
|
"grad_norm": 1.4202907085418701, |
|
"learning_rate": 5.192758916120236e-07, |
|
"loss": 0.2128, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.2327586206896552, |
|
"grad_norm": 1.252336025238037, |
|
"learning_rate": 4.672480973824312e-07, |
|
"loss": 0.1907, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.2758620689655173, |
|
"grad_norm": 1.7021604776382446, |
|
"learning_rate": 4.1769689822475147e-07, |
|
"loss": 0.2046, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.3189655172413794, |
|
"grad_norm": 1.637277364730835, |
|
"learning_rate": 3.7068266464238085e-07, |
|
"loss": 0.1973, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.3620689655172415, |
|
"grad_norm": 1.2321523427963257, |
|
"learning_rate": 3.262626762369525e-07, |
|
"loss": 0.2107, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.405172413793103, |
|
"grad_norm": 1.3565343618392944, |
|
"learning_rate": 2.844910519219632e-07, |
|
"loss": 0.2055, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.4482758620689653, |
|
"grad_norm": 1.6926054954528809, |
|
"learning_rate": 2.454186839872158e-07, |
|
"loss": 0.2083, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.4913793103448274, |
|
"grad_norm": 1.5500190258026123, |
|
"learning_rate": 2.0909317609440093e-07, |
|
"loss": 0.1973, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.5344827586206895, |
|
"grad_norm": 1.4574564695358276, |
|
"learning_rate": 1.7555878527937164e-07, |
|
"loss": 0.2336, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.5775862068965516, |
|
"grad_norm": 1.371783971786499, |
|
"learning_rate": 1.4485636803175828e-07, |
|
"loss": 0.2028, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.6206896551724137, |
|
"grad_norm": 1.5117493867874146, |
|
"learning_rate": 1.1702333051763271e-07, |
|
"loss": 0.1897, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.663793103448276, |
|
"grad_norm": 1.4221725463867188, |
|
"learning_rate": 9.209358300585474e-08, |
|
"loss": 0.2053, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.706896551724138, |
|
"grad_norm": 1.3613110780715942, |
|
"learning_rate": 7.009749855363457e-08, |
|
"loss": 0.2019, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 1.6522403955459595, |
|
"learning_rate": 5.106187600163987e-08, |
|
"loss": 0.2012, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.793103448275862, |
|
"grad_norm": 1.1785835027694702, |
|
"learning_rate": 3.5009907323737826e-08, |
|
"loss": 0.2041, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.836206896551724, |
|
"grad_norm": 1.3110618591308594, |
|
"learning_rate": 2.1961149371145795e-08, |
|
"loss": 0.2157, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.8793103448275863, |
|
"grad_norm": 1.1076109409332275, |
|
"learning_rate": 1.193150004542204e-08, |
|
"loss": 0.2124, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.9224137931034484, |
|
"grad_norm": 1.5730844736099243, |
|
"learning_rate": 4.933178929321103e-09, |
|
"loss": 0.2164, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.9655172413793105, |
|
"grad_norm": 1.6986498832702637, |
|
"learning_rate": 9.747123991141193e-10, |
|
"loss": 0.209, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 928, |
|
"total_flos": 1.573815416315904e+17, |
|
"train_loss": 0.29322911118125095, |
|
"train_runtime": 2500.7039, |
|
"train_samples_per_second": 5.936, |
|
"train_steps_per_second": 0.371 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 928, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.573815416315904e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|