{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.995306228883699, "eval_steps": 500, "global_step": 1032, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.029024285163601733, "grad_norm": 88.48429870605469, "learning_rate": 9.677419354838708e-05, "loss": 0.7958, "step": 10 }, { "epoch": 0.058048570327203466, "grad_norm": 78.84452056884766, "learning_rate": 0.00019354838709677416, "loss": 0.7504, "step": 20 }, { "epoch": 0.08707285549080519, "grad_norm": 213.83749389648438, "learning_rate": 0.00029032258064516127, "loss": 0.7391, "step": 30 }, { "epoch": 0.11609714065440693, "grad_norm": 64.08855438232422, "learning_rate": 0.00029994016586766087, "loss": 0.7567, "step": 40 }, { "epoch": 0.14512142581800866, "grad_norm": 71.85431671142578, "learning_rate": 0.00029973339311370587, "loss": 0.7117, "step": 50 }, { "epoch": 0.17414571098161039, "grad_norm": 66.32382202148438, "learning_rate": 0.00029937914664890375, "loss": 0.6959, "step": 60 }, { "epoch": 0.20316999614521214, "grad_norm": 52.99678039550781, "learning_rate": 0.00029887777537365414, "loss": 0.6835, "step": 70 }, { "epoch": 0.23219428130881387, "grad_norm": 53.15193557739258, "learning_rate": 0.0002982297730928522, "loss": 0.6855, "step": 80 }, { "epoch": 0.2612185664724156, "grad_norm": 62.969337463378906, "learning_rate": 0.00029743577802953563, "loss": 0.6758, "step": 90 }, { "epoch": 0.2902428516360173, "grad_norm": 47.597293853759766, "learning_rate": 0.00029649657219629316, "loss": 0.665, "step": 100 }, { "epoch": 0.31926713679961904, "grad_norm": 50.93095397949219, "learning_rate": 0.00029541308062505385, "loss": 0.6689, "step": 110 }, { "epoch": 0.34829142196322077, "grad_norm": 44.195335388183594, "learning_rate": 0.00029418637045601514, "loss": 0.6553, "step": 120 }, { "epoch": 0.3773157071268225, "grad_norm": 46.52369689941406, "learning_rate": 0.00029281764988660705, "loss": 0.6584, "step": 130 }, { "epoch": 0.4063399922904243, "grad_norm": 51.798343658447266, "learning_rate": 0.0002913082669815285, "loss": 0.6514, "step": 140 }, { "epoch": 0.435364277454026, "grad_norm": 53.8443489074707, "learning_rate": 0.0002896597083450262, "loss": 0.6276, "step": 150 }, { "epoch": 0.46438856261762773, "grad_norm": 44.94770812988281, "learning_rate": 0.0002878735976567259, "loss": 0.6428, "step": 160 }, { "epoch": 0.49341284778122946, "grad_norm": 38.52789306640625, "learning_rate": 0.0002859516940724558, "loss": 0.6415, "step": 170 }, { "epoch": 0.5224371329448312, "grad_norm": 52.5710563659668, "learning_rate": 0.0002838958904916392, "loss": 0.6302, "step": 180 }, { "epoch": 0.551461418108433, "grad_norm": 46.27107238769531, "learning_rate": 0.00028170821169296126, "loss": 0.6246, "step": 190 }, { "epoch": 0.5804857032720346, "grad_norm": 42.310123443603516, "learning_rate": 0.00027939081234014705, "loss": 0.627, "step": 200 }, { "epoch": 0.6095099884356364, "grad_norm": 48.09523391723633, "learning_rate": 0.0002769459748598149, "loss": 0.623, "step": 210 }, { "epoch": 0.6385342735992381, "grad_norm": 62.250152587890625, "learning_rate": 0.0002743761071934942, "loss": 0.6312, "step": 220 }, { "epoch": 0.6675585587628399, "grad_norm": 42.713130950927734, "learning_rate": 0.00027168374042602366, "loss": 0.6101, "step": 230 }, { "epoch": 0.6965828439264415, "grad_norm": 49.83562469482422, "learning_rate": 0.00026887152629266354, "loss": 0.6, "step": 240 }, { "epoch": 0.7256071290900433, "grad_norm": 39.01671600341797, "learning_rate": 0.0002659422345673789, "loss": 0.6038, "step": 250 }, { "epoch": 0.754631414253645, "grad_norm": 35.13432693481445, "learning_rate": 0.0002628987503348651, "loss": 0.5956, "step": 260 }, { "epoch": 0.7836556994172468, "grad_norm": 41.503684997558594, "learning_rate": 0.00025974407114900353, "loss": 0.6134, "step": 270 }, { "epoch": 0.8126799845808486, "grad_norm": 39.328548431396484, "learning_rate": 0.0002564813040805443, "loss": 0.59, "step": 280 }, { "epoch": 0.8417042697444502, "grad_norm": 34.63987731933594, "learning_rate": 0.0002531136626569259, "loss": 0.5834, "step": 290 }, { "epoch": 0.870728554908052, "grad_norm": 37.82402801513672, "learning_rate": 0.0002496444636972439, "loss": 0.6023, "step": 300 }, { "epoch": 0.8997528400716537, "grad_norm": 38.01532745361328, "learning_rate": 0.0002460771240454877, "loss": 0.5866, "step": 310 }, { "epoch": 0.9287771252352555, "grad_norm": 37.758487701416016, "learning_rate": 0.00024241515720526083, "loss": 0.6001, "step": 320 }, { "epoch": 0.9578014103988571, "grad_norm": 34.032989501953125, "learning_rate": 0.0002386621698793015, "loss": 0.5833, "step": 330 }, { "epoch": 0.9868256955624589, "grad_norm": 41.784881591796875, "learning_rate": 0.0002348218584172095, "loss": 0.5876, "step": 340 }, { "epoch": 1.0158499807260606, "grad_norm": 35.09678268432617, "learning_rate": 0.00023089800517487986, "loss": 0.5319, "step": 350 }, { "epoch": 1.0448742658896624, "grad_norm": 32.305877685546875, "learning_rate": 0.00022689447478922784, "loss": 0.4666, "step": 360 }, { "epoch": 1.0738985510532641, "grad_norm": 35.80933380126953, "learning_rate": 0.0002228152103718745, "loss": 0.4619, "step": 370 }, { "epoch": 1.102922836216866, "grad_norm": 32.89548873901367, "learning_rate": 0.00021866422962554238, "loss": 0.4739, "step": 380 }, { "epoch": 1.1319471213804675, "grad_norm": 36.34146499633789, "learning_rate": 0.0002144456208869851, "loss": 0.4676, "step": 390 }, { "epoch": 1.1609714065440693, "grad_norm": 42.522438049316406, "learning_rate": 0.00021016353910034938, "loss": 0.4765, "step": 400 }, { "epoch": 1.189995691707671, "grad_norm": 34.677650451660156, "learning_rate": 0.00020582220172493467, "loss": 0.4715, "step": 410 }, { "epoch": 1.2190199768712728, "grad_norm": 33.74694061279297, "learning_rate": 0.0002014258845813811, "loss": 0.4655, "step": 420 }, { "epoch": 1.2480442620348744, "grad_norm": 30.60100555419922, "learning_rate": 0.00019697891764037685, "loss": 0.461, "step": 430 }, { "epoch": 1.2770685471984762, "grad_norm": 38.6037483215332, "learning_rate": 0.00019248568075803257, "loss": 0.4719, "step": 440 }, { "epoch": 1.306092832362078, "grad_norm": 32.19020080566406, "learning_rate": 0.00018795059936212348, "loss": 0.4586, "step": 450 }, { "epoch": 1.3351171175256797, "grad_norm": 32.962276458740234, "learning_rate": 0.00018337814009344714, "loss": 0.4697, "step": 460 }, { "epoch": 1.3641414026892815, "grad_norm": 29.69386863708496, "learning_rate": 0.00017877280640659068, "loss": 0.4639, "step": 470 }, { "epoch": 1.393165687852883, "grad_norm": 31.52634620666504, "learning_rate": 0.00017413913413443915, "loss": 0.4579, "step": 480 }, { "epoch": 1.4221899730164849, "grad_norm": 35.30017852783203, "learning_rate": 0.0001694816870207949, "loss": 0.4684, "step": 490 }, { "epoch": 1.4512142581800866, "grad_norm": 33.88492202758789, "learning_rate": 0.00016480505222550682, "loss": 0.4534, "step": 500 }, { "epoch": 1.4802385433436884, "grad_norm": 30.00653076171875, "learning_rate": 0.00016011383580653697, "loss": 0.464, "step": 510 }, { "epoch": 1.50926282850729, "grad_norm": 33.75349807739258, "learning_rate": 0.00015541265818341433, "loss": 0.4497, "step": 520 }, { "epoch": 1.5382871136708918, "grad_norm": 31.689538955688477, "learning_rate": 0.00015070614958654393, "loss": 0.4412, "step": 530 }, { "epoch": 1.5673113988344936, "grad_norm": 28.848291397094727, "learning_rate": 0.00014599894549685273, "loss": 0.4467, "step": 540 }, { "epoch": 1.5963356839980953, "grad_norm": 27.079084396362305, "learning_rate": 0.0001412956820802647, "loss": 0.4428, "step": 550 }, { "epoch": 1.6253599691616971, "grad_norm": 29.99922752380371, "learning_rate": 0.0001366009916215007, "loss": 0.4374, "step": 560 }, { "epoch": 1.654384254325299, "grad_norm": 28.763559341430664, "learning_rate": 0.00013191949796170156, "loss": 0.4419, "step": 570 }, { "epoch": 1.6834085394889005, "grad_norm": 30.430801391601562, "learning_rate": 0.00012725581194436694, "loss": 0.445, "step": 580 }, { "epoch": 1.7124328246525022, "grad_norm": 28.43861198425293, "learning_rate": 0.00012261452687409576, "loss": 0.4452, "step": 590 }, { "epoch": 1.7414571098161038, "grad_norm": 33.317378997802734, "learning_rate": 0.00011800021399260094, "loss": 0.4378, "step": 600 }, { "epoch": 1.7704813949797056, "grad_norm": 27.84680938720703, "learning_rate": 0.00011341741797645384, "loss": 0.4375, "step": 610 }, { "epoch": 1.7995056801433074, "grad_norm": 32.20744705200195, "learning_rate": 0.0001088706524609933, "loss": 0.4281, "step": 620 }, { "epoch": 1.8285299653069091, "grad_norm": 29.68756675720215, "learning_rate": 0.00010436439559480705, "loss": 0.4338, "step": 630 }, { "epoch": 1.857554250470511, "grad_norm": 31.973575592041016, "learning_rate": 9.990308562916479e-05, "loss": 0.4265, "step": 640 }, { "epoch": 1.8865785356341127, "grad_norm": 26.948545455932617, "learning_rate": 9.549111654674586e-05, "loss": 0.4165, "step": 650 }, { "epoch": 1.9156028207977145, "grad_norm": 27.91978645324707, "learning_rate": 9.11328337339681e-05, "loss": 0.416, "step": 660 }, { "epoch": 1.944627105961316, "grad_norm": 34.58734130859375, "learning_rate": 8.68325297011791e-05, "loss": 0.4196, "step": 670 }, { "epoch": 1.9736513911249178, "grad_norm": 24.959909439086914, "learning_rate": 8.259443985492576e-05, "loss": 0.4305, "step": 680 }, { "epoch": 2.0026756762885194, "grad_norm": 39.029258728027344, "learning_rate": 7.842273832646591e-05, "loss": 0.4122, "step": 690 }, { "epoch": 2.031699961452121, "grad_norm": 27.386505126953125, "learning_rate": 7.432153386063034e-05, "loss": 0.2751, "step": 700 }, { "epoch": 2.060724246615723, "grad_norm": 30.209821701049805, "learning_rate": 7.029486576908444e-05, "loss": 0.2654, "step": 710 }, { "epoch": 2.0897485317793247, "grad_norm": 31.79279327392578, "learning_rate": 6.63466999519756e-05, "loss": 0.2648, "step": 720 }, { "epoch": 2.1187728169429265, "grad_norm": 31.363250732421875, "learning_rate": 6.248092499188372e-05, "loss": 0.2587, "step": 730 }, { "epoch": 2.1477971021065283, "grad_norm": 33.62345886230469, "learning_rate": 5.870134832392269e-05, "loss": 0.2564, "step": 740 }, { "epoch": 2.17682138727013, "grad_norm": 31.332040786743164, "learning_rate": 5.5011692485764734e-05, "loss": 0.253, "step": 750 }, { "epoch": 2.205845672433732, "grad_norm": 30.034757614135742, "learning_rate": 5.141559145128093e-05, "loss": 0.26, "step": 760 }, { "epoch": 2.234869957597333, "grad_norm": 30.40983772277832, "learning_rate": 4.791658705140897e-05, "loss": 0.2507, "step": 770 }, { "epoch": 2.263894242760935, "grad_norm": 27.134634017944336, "learning_rate": 4.451812548577333e-05, "loss": 0.2518, "step": 780 }, { "epoch": 2.2929185279245368, "grad_norm": 27.9604434967041, "learning_rate": 4.1223553928493564e-05, "loss": 0.2494, "step": 790 }, { "epoch": 2.3219428130881385, "grad_norm": 33.73405838012695, "learning_rate": 3.803611723152345e-05, "loss": 0.2441, "step": 800 }, { "epoch": 2.3509670982517403, "grad_norm": 31.413331985473633, "learning_rate": 3.495895472876854e-05, "loss": 0.2479, "step": 810 }, { "epoch": 2.379991383415342, "grad_norm": 28.82455062866211, "learning_rate": 3.199509714412901e-05, "loss": 0.2529, "step": 820 }, { "epoch": 2.409015668578944, "grad_norm": 31.402931213378906, "learning_rate": 2.9147463606513528e-05, "loss": 0.2499, "step": 830 }, { "epoch": 2.4380399537425457, "grad_norm": 25.637739181518555, "learning_rate": 2.6418858774763992e-05, "loss": 0.236, "step": 840 }, { "epoch": 2.467064238906147, "grad_norm": 27.47572898864746, "learning_rate": 2.38119700753228e-05, "loss": 0.2432, "step": 850 }, { "epoch": 2.496088524069749, "grad_norm": 28.527973175048828, "learning_rate": 2.1329365055363595e-05, "loss": 0.2428, "step": 860 }, { "epoch": 2.5251128092333506, "grad_norm": 28.3017578125, "learning_rate": 1.89734888539916e-05, "loss": 0.2457, "step": 870 }, { "epoch": 2.5541370943969524, "grad_norm": 27.692001342773438, "learning_rate": 1.674666179400504e-05, "loss": 0.2409, "step": 880 }, { "epoch": 2.583161379560554, "grad_norm": 30.592241287231445, "learning_rate": 1.4651077096589486e-05, "loss": 0.2371, "step": 890 }, { "epoch": 2.612185664724156, "grad_norm": 26.051584243774414, "learning_rate": 1.2688798721195053e-05, "loss": 0.2389, "step": 900 }, { "epoch": 2.6412099498877577, "grad_norm": 28.38836097717285, "learning_rate": 1.086175933272514e-05, "loss": 0.2407, "step": 910 }, { "epoch": 2.6702342350513595, "grad_norm": 27.81374740600586, "learning_rate": 9.171758398038015e-06, "loss": 0.2389, "step": 920 }, { "epoch": 2.6992585202149613, "grad_norm": 27.540956497192383, "learning_rate": 7.620460413636342e-06, "loss": 0.2453, "step": 930 }, { "epoch": 2.728282805378563, "grad_norm": 27.374300003051758, "learning_rate": 6.209393266290291e-06, "loss": 0.234, "step": 940 }, { "epoch": 2.757307090542165, "grad_norm": 29.071474075317383, "learning_rate": 4.939946728208627e-06, "loss": 0.2406, "step": 950 }, { "epoch": 2.786331375705766, "grad_norm": 25.93909454345703, "learning_rate": 3.813371088240086e-06, "loss": 0.231, "step": 960 }, { "epoch": 2.815355660869368, "grad_norm": 28.83918571472168, "learning_rate": 2.830775920453093e-06, "loss": 0.2303, "step": 970 }, { "epoch": 2.8443799460329697, "grad_norm": 28.06920623779297, "learning_rate": 1.9931289913066694e-06, "loss": 0.2339, "step": 980 }, { "epoch": 2.8734042311965715, "grad_norm": 28.357439041137695, "learning_rate": 1.3012553064889631e-06, "loss": 0.2325, "step": 990 }, { "epoch": 2.9024285163601733, "grad_norm": 25.29115104675293, "learning_rate": 7.558362983619448e-07, "loss": 0.2374, "step": 1000 }, { "epoch": 2.931452801523775, "grad_norm": 27.02465057373047, "learning_rate": 3.57409154812871e-07, "loss": 0.2307, "step": 1010 }, { "epoch": 2.960477086687377, "grad_norm": 26.2918701171875, "learning_rate": 1.0636629017320431e-07, "loss": 0.232, "step": 1020 }, { "epoch": 2.989501371850978, "grad_norm": 28.43804359436035, "learning_rate": 2.9549587264754428e-09, "loss": 0.2287, "step": 1030 }, { "epoch": 2.995306228883699, "step": 1032, "total_flos": 1.0711204212442399e+18, "train_loss": 0.44727156865735385, "train_runtime": 21178.1386, "train_samples_per_second": 6.247, "train_steps_per_second": 0.049 } ], "logging_steps": 10, "max_steps": 1032, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0711204212442399e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }