{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992587101556709, "eval_steps": 100, "global_step": 337, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014825796886582653, "grad_norm": 3.8051848965817707, "learning_rate": 2.9411764705882355e-06, "loss": 0.956, "mean_token_accuracy": 0.7389732335227731, "step": 5 }, { "epoch": 0.029651593773165306, "grad_norm": 2.3193458934516125, "learning_rate": 5.882352941176471e-06, "loss": 0.9131, "mean_token_accuracy": 0.745571190628745, "step": 10 }, { "epoch": 0.04447739065974796, "grad_norm": 1.465130510110654, "learning_rate": 8.823529411764707e-06, "loss": 0.802, "mean_token_accuracy": 0.7669543650793653, "step": 15 }, { "epoch": 0.05930318754633061, "grad_norm": 1.1649028225710198, "learning_rate": 1.1764705882352942e-05, "loss": 0.7568, "mean_token_accuracy": 0.7719902319902323, "step": 20 }, { "epoch": 0.07412898443291327, "grad_norm": 0.8613760026054987, "learning_rate": 1.4705882352941179e-05, "loss": 0.7119, "mean_token_accuracy": 0.7831028693528694, "step": 25 }, { "epoch": 0.08895478131949593, "grad_norm": 0.783478820047515, "learning_rate": 1.7647058823529414e-05, "loss": 0.6953, "mean_token_accuracy": 0.7864308608058606, "step": 30 }, { "epoch": 0.10378057820607858, "grad_norm": 0.845141787895257, "learning_rate": 1.9999462497359468e-05, "loss": 0.6841, "mean_token_accuracy": 0.7876358363858365, "step": 35 }, { "epoch": 0.11860637509266123, "grad_norm": 0.7223548858937856, "learning_rate": 1.9980655971335944e-05, "loss": 0.6674, "mean_token_accuracy": 0.7906836800335666, "step": 40 }, { "epoch": 0.1334321719792439, "grad_norm": 0.8020026503983825, "learning_rate": 1.993503206718859e-05, "loss": 0.6618, "mean_token_accuracy": 0.7926633089133089, "step": 45 }, { "epoch": 0.14825796886582654, "grad_norm": 0.7215498275444733, "learning_rate": 1.986271337340182e-05, "loss": 0.6421, "mean_token_accuracy": 0.797378663003663, "step": 50 }, { "epoch": 0.16308376575240918, "grad_norm": 0.7852660604263051, "learning_rate": 1.976389420563607e-05, "loss": 0.646, "mean_token_accuracy": 0.796246372287669, "step": 55 }, { "epoch": 0.17790956263899185, "grad_norm": 0.7068328201097145, "learning_rate": 1.9638840084614182e-05, "loss": 0.6349, "mean_token_accuracy": 0.7993047924297925, "step": 60 }, { "epoch": 0.1927353595255745, "grad_norm": 0.6758929995809626, "learning_rate": 1.9487887022684336e-05, "loss": 0.6163, "mean_token_accuracy": 0.8042696886446887, "step": 65 }, { "epoch": 0.20756115641215717, "grad_norm": 0.7716624972063456, "learning_rate": 1.9311440620976597e-05, "loss": 0.6319, "mean_token_accuracy": 0.7993212087850436, "step": 70 }, { "epoch": 0.2223869532987398, "grad_norm": 0.6609819990400844, "learning_rate": 1.9109974979578852e-05, "loss": 0.6356, "mean_token_accuracy": 0.798153998778999, "step": 75 }, { "epoch": 0.23721275018532245, "grad_norm": 0.7028168784009874, "learning_rate": 1.8884031423660492e-05, "loss": 0.6337, "mean_token_accuracy": 0.7985944050496857, "step": 80 }, { "epoch": 0.2520385470719051, "grad_norm": 0.6683430701808383, "learning_rate": 1.8634217048966638e-05, "loss": 0.6353, "mean_token_accuracy": 0.7978753602810393, "step": 85 }, { "epoch": 0.2668643439584878, "grad_norm": 0.7314922030102382, "learning_rate": 1.836120309059107e-05, "loss": 0.6115, "mean_token_accuracy": 0.8045245726495727, "step": 90 }, { "epoch": 0.28169014084507044, "grad_norm": 0.6365309024889941, "learning_rate": 1.8065723119410885e-05, "loss": 0.6232, "mean_token_accuracy": 0.8015613553113555, "step": 95 }, { "epoch": 0.2965159377316531, "grad_norm": 0.6672244460635505, "learning_rate": 1.77485710710289e-05, "loss": 0.6255, "mean_token_accuracy": 0.8006891025641025, "step": 100 }, { "epoch": 0.2965159377316531, "eval_loss": 0.6413724422454834, "eval_mean_token_accuracy": 0.795018613493309, "eval_runtime": 5.4348, "eval_samples_per_second": 23.552, "eval_steps_per_second": 1.472, "step": 100 }, { "epoch": 0.3113417346182357, "grad_norm": 0.75128092844917, "learning_rate": 1.741059911251997e-05, "loss": 0.6223, "mean_token_accuracy": 0.8012797619047619, "step": 105 }, { "epoch": 0.32616753150481836, "grad_norm": 0.6526755539743339, "learning_rate": 1.7052715352713076e-05, "loss": 0.6125, "mean_token_accuracy": 0.8042971611721612, "step": 110 }, { "epoch": 0.34099332839140106, "grad_norm": 0.5958813027219968, "learning_rate": 1.667588140216154e-05, "loss": 0.6241, "mean_token_accuracy": 0.8009714590964588, "step": 115 }, { "epoch": 0.3558191252779837, "grad_norm": 0.6459938604110328, "learning_rate": 1.628110978935756e-05, "loss": 0.6214, "mean_token_accuracy": 0.8017719780219782, "step": 120 }, { "epoch": 0.37064492216456635, "grad_norm": 0.7336206637425139, "learning_rate": 1.586946124013354e-05, "loss": 0.6228, "mean_token_accuracy": 0.8008722527472527, "step": 125 }, { "epoch": 0.385470719051149, "grad_norm": 0.6924708455089922, "learning_rate": 1.5442041827560274e-05, "loss": 0.6093, "mean_token_accuracy": 0.8049916056166057, "step": 130 }, { "epoch": 0.40029651593773163, "grad_norm": 0.6252828305378912, "learning_rate": 1.5000000000000002e-05, "loss": 0.6119, "mean_token_accuracy": 0.8043566849816848, "step": 135 }, { "epoch": 0.41512231282431433, "grad_norm": 0.5543639102481284, "learning_rate": 1.4544523495299843e-05, "loss": 0.6184, "mean_token_accuracy": 0.8021657509157512, "step": 140 }, { "epoch": 0.429948109710897, "grad_norm": 0.6721392797389169, "learning_rate": 1.4076836149416889e-05, "loss": 0.6025, "mean_token_accuracy": 0.8070421245421245, "step": 145 }, { "epoch": 0.4447739065974796, "grad_norm": 0.7061921312824664, "learning_rate": 1.3598194608050011e-05, "loss": 0.5859, "mean_token_accuracy": 0.8116018009768011, "step": 150 }, { "epoch": 0.45959970348406226, "grad_norm": 0.5295415190053663, "learning_rate": 1.3109884950114007e-05, "loss": 0.6101, "mean_token_accuracy": 0.8041674297924299, "step": 155 }, { "epoch": 0.4744255003706449, "grad_norm": 0.5162022426859912, "learning_rate": 1.2613219232128608e-05, "loss": 0.5968, "mean_token_accuracy": 0.8079342185592185, "step": 160 }, { "epoch": 0.4892512972572276, "grad_norm": 0.5740077528394586, "learning_rate": 1.2109531962807333e-05, "loss": 0.6052, "mean_token_accuracy": 0.8052155664205085, "step": 165 }, { "epoch": 0.5040770941438102, "grad_norm": 0.5784618789845658, "learning_rate": 1.1600176517318742e-05, "loss": 0.598, "mean_token_accuracy": 0.8077052808302808, "step": 170 }, { "epoch": 0.5189028910303929, "grad_norm": 0.5622095615626541, "learning_rate": 1.1086521500854746e-05, "loss": 0.5928, "mean_token_accuracy": 0.808961385836386, "step": 175 }, { "epoch": 0.5337286879169756, "grad_norm": 0.555421977804962, "learning_rate": 1.0569947071276847e-05, "loss": 0.6079, "mean_token_accuracy": 0.8038957570207568, "step": 180 }, { "epoch": 0.5485544848035582, "grad_norm": 0.5591254267329707, "learning_rate": 1.0051841230721065e-05, "loss": 0.5996, "mean_token_accuracy": 0.8062667887667889, "step": 185 }, { "epoch": 0.5633802816901409, "grad_norm": 0.5480628578430322, "learning_rate": 9.533596096125826e-06, "loss": 0.584, "mean_token_accuracy": 0.812239774114774, "step": 190 }, { "epoch": 0.5782060785767235, "grad_norm": 0.5514593108031277, "learning_rate": 9.016604158703654e-06, "loss": 0.6084, "mean_token_accuracy": 0.8041910866910866, "step": 195 }, { "epoch": 0.5930318754633062, "grad_norm": 0.5363039052592674, "learning_rate": 8.502254542407186e-06, "loss": 0.6039, "mean_token_accuracy": 0.8054059829059831, "step": 200 }, { "epoch": 0.5930318754633062, "eval_loss": 0.6165258288383484, "eval_mean_token_accuracy": 0.8018075907910054, "eval_runtime": 5.2218, "eval_samples_per_second": 24.513, "eval_steps_per_second": 1.532, "step": 200 }, { "epoch": 0.6078576723498889, "grad_norm": 0.5869561564258384, "learning_rate": 7.991929271442817e-06, "loss": 0.5845, "mean_token_accuracy": 0.810370879120879, "step": 205 }, { "epoch": 0.6226834692364714, "grad_norm": 0.5126220770273285, "learning_rate": 7.48699955686089e-06, "loss": 0.6053, "mean_token_accuracy": 0.8049973686472104, "step": 210 }, { "epoch": 0.6375092661230541, "grad_norm": 0.5283766104356306, "learning_rate": 6.988822112200157e-06, "loss": 0.5809, "mean_token_accuracy": 0.8124908424908428, "step": 215 }, { "epoch": 0.6523350630096367, "grad_norm": 0.5538487666398877, "learning_rate": 6.498735508086094e-06, "loss": 0.6041, "mean_token_accuracy": 0.8046634615384616, "step": 220 }, { "epoch": 0.6671608598962194, "grad_norm": 0.502142536336529, "learning_rate": 6.018056575578075e-06, "loss": 0.5831, "mean_token_accuracy": 0.8119062881562883, "step": 225 }, { "epoch": 0.6819866567828021, "grad_norm": 0.5468779254684961, "learning_rate": 5.548076867929331e-06, "loss": 0.5938, "mean_token_accuracy": 0.8081074481074481, "step": 230 }, { "epoch": 0.6968124536693847, "grad_norm": 0.4985379501273117, "learning_rate": 5.090059190266779e-06, "loss": 0.5963, "mean_token_accuracy": 0.8077899877899876, "step": 235 }, { "epoch": 0.7116382505559674, "grad_norm": 0.5151452293487124, "learning_rate": 4.645234206515171e-06, "loss": 0.5899, "mean_token_accuracy": 0.8092720264856231, "step": 240 }, { "epoch": 0.72646404744255, "grad_norm": 0.4949458523296326, "learning_rate": 4.214797132682597e-06, "loss": 0.5921, "mean_token_accuracy": 0.8082945650477681, "step": 245 }, { "epoch": 0.7412898443291327, "grad_norm": 0.47351353980603134, "learning_rate": 3.799904525392251e-06, "loss": 0.5906, "mean_token_accuracy": 0.8085080891330888, "step": 250 }, { "epoch": 0.7561156412157154, "grad_norm": 0.48563113738070895, "learning_rate": 3.401671174289469e-06, "loss": 0.5822, "mean_token_accuracy": 0.8119319261486757, "step": 255 }, { "epoch": 0.770941438102298, "grad_norm": 0.4624658793689265, "learning_rate": 3.021167106673928e-06, "loss": 0.5895, "mean_token_accuracy": 0.8094238400488403, "step": 260 }, { "epoch": 0.7857672349888807, "grad_norm": 0.4887471221196979, "learning_rate": 2.6594147124053983e-06, "loss": 0.5933, "mean_token_accuracy": 0.8082280219780221, "step": 265 }, { "epoch": 0.8005930318754633, "grad_norm": 0.4755865942641489, "learning_rate": 2.317385996808195e-06, "loss": 0.5796, "mean_token_accuracy": 0.8127861721611721, "step": 270 }, { "epoch": 0.815418828762046, "grad_norm": 0.47035132329171114, "learning_rate": 1.9959999689556407e-06, "loss": 0.5967, "mean_token_accuracy": 0.8072924297924298, "step": 275 }, { "epoch": 0.8302446256486287, "grad_norm": 0.472860512590234, "learning_rate": 1.6961201723520248e-06, "loss": 0.578, "mean_token_accuracy": 0.8127960927960929, "step": 280 }, { "epoch": 0.8450704225352113, "grad_norm": 0.4614371078147505, "learning_rate": 1.4185523646469822e-06, "loss": 0.5901, "mean_token_accuracy": 0.8086108107426412, "step": 285 }, { "epoch": 0.859896219421794, "grad_norm": 0.43931896763540745, "learning_rate": 1.1640423526166987e-06, "loss": 0.5834, "mean_token_accuracy": 0.8111981074481076, "step": 290 }, { "epoch": 0.8747220163083765, "grad_norm": 0.4456895234302146, "learning_rate": 9.332739882292752e-07, "loss": 0.5842, "mean_token_accuracy": 0.8116254578754578, "step": 295 }, { "epoch": 0.8895478131949592, "grad_norm": 0.43426061796264454, "learning_rate": 7.268673311786378e-07, "loss": 0.572, "mean_token_accuracy": 0.8145253357753358, "step": 300 }, { "epoch": 0.8895478131949592, "eval_loss": 0.6066814064979553, "eval_mean_token_accuracy": 0.8039550018326881, "eval_runtime": 5.5504, "eval_samples_per_second": 23.061, "eval_steps_per_second": 1.441, "step": 300 }, { "epoch": 0.9043736100815419, "grad_norm": 0.45927392759405916, "learning_rate": 5.453769828241872e-07, "loss": 0.5848, "mean_token_accuracy": 0.8108882783882784, "step": 305 }, { "epoch": 0.9191994069681245, "grad_norm": 0.44015278909822164, "learning_rate": 3.8929059601275463e-07, "loss": 0.5883, "mean_token_accuracy": 0.8098282967032967, "step": 310 }, { "epoch": 0.9340252038547072, "grad_norm": 0.46373639838618946, "learning_rate": 2.5902756478688674e-07, "loss": 0.5893, "mean_token_accuracy": 0.8085653235653236, "step": 315 }, { "epoch": 0.9488510007412898, "grad_norm": 0.4453169356064979, "learning_rate": 1.5493789750014032e-07, "loss": 0.5934, "mean_token_accuracy": 0.8079464285714286, "step": 320 }, { "epoch": 0.9636767976278725, "grad_norm": 0.44080298077757607, "learning_rate": 7.730127636723539e-08, "loss": 0.5835, "mean_token_accuracy": 0.8108723706229716, "step": 325 }, { "epoch": 0.9785025945144552, "grad_norm": 0.42873433987601206, "learning_rate": 2.6326305976001054e-08, "loss": 0.5817, "mean_token_accuracy": 0.8121665140415143, "step": 330 }, { "epoch": 0.9933283914010378, "grad_norm": 0.44474948977305667, "learning_rate": 2.149952780321485e-09, "loss": 0.5933, "mean_token_accuracy": 0.8080050057957257, "step": 335 }, { "epoch": 0.9992587101556709, "mean_token_accuracy": 0.8132173382173382, "step": 337, "total_flos": 176608216350720.0, "train_loss": 0.6238384791580789, "train_runtime": 3258.6411, "train_samples_per_second": 6.624, "train_steps_per_second": 0.103 } ], "logging_steps": 5, "max_steps": 337, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 176608216350720.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }