{ "best_metric": 0.5564497113227844, "best_model_checkpoint": "/kaggle/working/results/checkpoint-11500", "epoch": 0.9687836383207751, "eval_steps": 500, "global_step": 13500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03588087549336204, "grad_norm": 5.043849468231201, "learning_rate": 3.856476498026552e-05, "loss": 0.3432, "step": 500 }, { "epoch": 0.03588087549336204, "eval_loss": 0.7613404989242554, "eval_runtime": 151.9935, "eval_samples_per_second": 68.424, "eval_steps_per_second": 0.539, "step": 500 }, { "epoch": 0.07176175098672408, "grad_norm": 5.727941036224365, "learning_rate": 3.7129529960531036e-05, "loss": 0.3295, "step": 1000 }, { "epoch": 0.07176175098672408, "eval_loss": 0.7736061215400696, "eval_runtime": 152.2697, "eval_samples_per_second": 68.3, "eval_steps_per_second": 0.539, "step": 1000 }, { "epoch": 0.10764262648008611, "grad_norm": 10.28232192993164, "learning_rate": 3.569429494079656e-05, "loss": 0.3222, "step": 1500 }, { "epoch": 0.10764262648008611, "eval_loss": 0.7609947323799133, "eval_runtime": 151.8769, "eval_samples_per_second": 68.477, "eval_steps_per_second": 0.54, "step": 1500 }, { "epoch": 0.14352350197344815, "grad_norm": 9.004725456237793, "learning_rate": 3.425905992106208e-05, "loss": 0.3284, "step": 2000 }, { "epoch": 0.14352350197344815, "eval_loss": 0.7286639213562012, "eval_runtime": 151.8324, "eval_samples_per_second": 68.497, "eval_steps_per_second": 0.54, "step": 2000 }, { "epoch": 0.17940437746681018, "grad_norm": 5.145745277404785, "learning_rate": 3.28238249013276e-05, "loss": 0.3381, "step": 2500 }, { "epoch": 0.17940437746681018, "eval_loss": 0.7416331768035889, "eval_runtime": 151.8311, "eval_samples_per_second": 68.497, "eval_steps_per_second": 0.54, "step": 2500 }, { "epoch": 0.21528525296017223, "grad_norm": 4.784695625305176, "learning_rate": 3.1388589881593114e-05, "loss": 0.3311, "step": 3000 }, { "epoch": 0.21528525296017223, "eval_loss": 0.7392188310623169, "eval_runtime": 152.2068, "eval_samples_per_second": 68.328, "eval_steps_per_second": 0.539, "step": 3000 }, { "epoch": 0.25116612845353425, "grad_norm": 7.851515293121338, "learning_rate": 2.9953354861858634e-05, "loss": 0.3325, "step": 3500 }, { "epoch": 0.25116612845353425, "eval_loss": 0.7302030324935913, "eval_runtime": 152.1733, "eval_samples_per_second": 68.343, "eval_steps_per_second": 0.539, "step": 3500 }, { "epoch": 0.2870470039468963, "grad_norm": 2.829453468322754, "learning_rate": 2.851811984212415e-05, "loss": 0.3252, "step": 4000 }, { "epoch": 0.2870470039468963, "eval_loss": 0.7374987602233887, "eval_runtime": 152.0702, "eval_samples_per_second": 68.389, "eval_steps_per_second": 0.539, "step": 4000 }, { "epoch": 0.32292787944025836, "grad_norm": 16.365131378173828, "learning_rate": 2.7082884822389666e-05, "loss": 0.3579, "step": 4500 }, { "epoch": 0.32292787944025836, "eval_loss": 0.7489859461784363, "eval_runtime": 152.1526, "eval_samples_per_second": 68.352, "eval_steps_per_second": 0.539, "step": 4500 }, { "epoch": 0.35880875493362036, "grad_norm": 4.619975566864014, "learning_rate": 2.5647649802655186e-05, "loss": 0.517, "step": 5000 }, { "epoch": 0.35880875493362036, "eval_loss": 0.6779808402061462, "eval_runtime": 152.588, "eval_samples_per_second": 68.157, "eval_steps_per_second": 0.537, "step": 5000 }, { "epoch": 0.3946896304269824, "grad_norm": 5.207348823547363, "learning_rate": 2.4212414782920702e-05, "loss": 0.5202, "step": 5500 }, { "epoch": 0.3946896304269824, "eval_loss": 0.6824296116828918, "eval_runtime": 152.1419, "eval_samples_per_second": 68.357, "eval_steps_per_second": 0.539, "step": 5500 }, { "epoch": 0.43057050592034446, "grad_norm": 9.262358665466309, "learning_rate": 2.2777179763186225e-05, "loss": 0.5078, "step": 6000 }, { "epoch": 0.43057050592034446, "eval_loss": 0.6495384573936462, "eval_runtime": 151.992, "eval_samples_per_second": 68.425, "eval_steps_per_second": 0.54, "step": 6000 }, { "epoch": 0.4664513814137065, "grad_norm": 8.04774284362793, "learning_rate": 2.1341944743451745e-05, "loss": 0.512, "step": 6500 }, { "epoch": 0.4664513814137065, "eval_loss": 0.6483868956565857, "eval_runtime": 151.9988, "eval_samples_per_second": 68.422, "eval_steps_per_second": 0.539, "step": 6500 }, { "epoch": 0.5023322569070685, "grad_norm": 10.386420249938965, "learning_rate": 1.990670972371726e-05, "loss": 0.5096, "step": 7000 }, { "epoch": 0.5023322569070685, "eval_loss": 0.6350330710411072, "eval_runtime": 152.1248, "eval_samples_per_second": 68.365, "eval_steps_per_second": 0.539, "step": 7000 }, { "epoch": 0.5382131324004306, "grad_norm": 6.1806535720825195, "learning_rate": 1.847147470398278e-05, "loss": 0.5067, "step": 7500 }, { "epoch": 0.5382131324004306, "eval_loss": 0.6437995433807373, "eval_runtime": 152.5204, "eval_samples_per_second": 68.188, "eval_steps_per_second": 0.538, "step": 7500 }, { "epoch": 0.5740940078937926, "grad_norm": 9.446738243103027, "learning_rate": 1.7036239684248297e-05, "loss": 0.4932, "step": 8000 }, { "epoch": 0.5740940078937926, "eval_loss": 0.6440528035163879, "eval_runtime": 151.9078, "eval_samples_per_second": 68.463, "eval_steps_per_second": 0.54, "step": 8000 }, { "epoch": 0.6099748833871547, "grad_norm": 7.391712188720703, "learning_rate": 1.5601004664513816e-05, "loss": 0.4855, "step": 8500 }, { "epoch": 0.6099748833871547, "eval_loss": 0.6418237090110779, "eval_runtime": 152.0351, "eval_samples_per_second": 68.405, "eval_steps_per_second": 0.539, "step": 8500 }, { "epoch": 0.6458557588805167, "grad_norm": 8.007753372192383, "learning_rate": 1.4165769644779334e-05, "loss": 0.484, "step": 9000 }, { "epoch": 0.6458557588805167, "eval_loss": 0.6276165843009949, "eval_runtime": 152.0065, "eval_samples_per_second": 68.418, "eval_steps_per_second": 0.539, "step": 9000 }, { "epoch": 0.6817366343738788, "grad_norm": 5.451321601867676, "learning_rate": 1.2730534625044852e-05, "loss": 0.4956, "step": 9500 }, { "epoch": 0.6817366343738788, "eval_loss": 0.6465044617652893, "eval_runtime": 152.1889, "eval_samples_per_second": 68.336, "eval_steps_per_second": 0.539, "step": 9500 }, { "epoch": 0.7176175098672407, "grad_norm": 6.791475772857666, "learning_rate": 1.129529960531037e-05, "loss": 0.4769, "step": 10000 }, { "epoch": 0.7176175098672407, "eval_loss": 0.6356320381164551, "eval_runtime": 152.2461, "eval_samples_per_second": 68.31, "eval_steps_per_second": 0.539, "step": 10000 }, { "epoch": 0.7534983853606028, "grad_norm": 19.865468978881836, "learning_rate": 9.86006458557589e-06, "loss": 0.472, "step": 10500 }, { "epoch": 0.7534983853606028, "eval_loss": 0.6133418083190918, "eval_runtime": 152.0045, "eval_samples_per_second": 68.419, "eval_steps_per_second": 0.539, "step": 10500 }, { "epoch": 0.7893792608539648, "grad_norm": 4.383126258850098, "learning_rate": 8.424829565841408e-06, "loss": 0.4756, "step": 11000 }, { "epoch": 0.7893792608539648, "eval_loss": 0.6084704399108887, "eval_runtime": 151.9103, "eval_samples_per_second": 68.461, "eval_steps_per_second": 0.54, "step": 11000 }, { "epoch": 0.8252601363473269, "grad_norm": 4.4955596923828125, "learning_rate": 6.9895945461069255e-06, "loss": 0.474, "step": 11500 }, { "epoch": 0.8252601363473269, "eval_loss": 0.5564497113227844, "eval_runtime": 152.1244, "eval_samples_per_second": 68.365, "eval_steps_per_second": 0.539, "step": 11500 }, { "epoch": 0.8611410118406889, "grad_norm": 5.504430294036865, "learning_rate": 5.554359526372443e-06, "loss": 0.4743, "step": 12000 }, { "epoch": 0.8611410118406889, "eval_loss": 0.5606986284255981, "eval_runtime": 152.3774, "eval_samples_per_second": 68.252, "eval_steps_per_second": 0.538, "step": 12000 }, { "epoch": 0.897021887334051, "grad_norm": 4.161441326141357, "learning_rate": 4.119124506637962e-06, "loss": 0.4842, "step": 12500 }, { "epoch": 0.897021887334051, "eval_loss": 0.5657362937927246, "eval_runtime": 152.0558, "eval_samples_per_second": 68.396, "eval_steps_per_second": 0.539, "step": 12500 }, { "epoch": 0.932902762827413, "grad_norm": 15.681989669799805, "learning_rate": 2.6838894869034805e-06, "loss": 0.4583, "step": 13000 }, { "epoch": 0.932902762827413, "eval_loss": 0.5640388131141663, "eval_runtime": 152.0718, "eval_samples_per_second": 68.389, "eval_steps_per_second": 0.539, "step": 13000 }, { "epoch": 0.9687836383207751, "grad_norm": 6.109396934509277, "learning_rate": 1.248654467168999e-06, "loss": 0.4662, "step": 13500 }, { "epoch": 0.9687836383207751, "eval_loss": 0.5629301071166992, "eval_runtime": 152.1551, "eval_samples_per_second": 68.351, "eval_steps_per_second": 0.539, "step": 13500 } ], "logging_steps": 500, "max_steps": 13935, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.13688468946944e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }