{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.01054546413224012, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001054546413224012, "eval_loss": 3.0851244926452637, "eval_runtime": 1296.9727, "eval_samples_per_second": 12.314, "eval_steps_per_second": 1.54, "step": 1 }, { "epoch": 0.0003163639239672036, "grad_norm": 3.364920139312744, "learning_rate": 3e-05, "loss": 3.1129, "step": 3 }, { "epoch": 0.0006327278479344072, "grad_norm": 3.629132032394409, "learning_rate": 6e-05, "loss": 2.9367, "step": 6 }, { "epoch": 0.0009490917719016108, "grad_norm": 2.8804471492767334, "learning_rate": 9e-05, "loss": 2.1674, "step": 9 }, { "epoch": 0.0009490917719016108, "eval_loss": 1.5537173748016357, "eval_runtime": 1301.5706, "eval_samples_per_second": 12.271, "eval_steps_per_second": 1.534, "step": 9 }, { "epoch": 0.0012654556958688144, "grad_norm": 3.045879602432251, "learning_rate": 9.987820251299122e-05, "loss": 1.2596, "step": 12 }, { "epoch": 0.001581819619836018, "grad_norm": 2.454657793045044, "learning_rate": 9.924038765061042e-05, "loss": 0.9167, "step": 15 }, { "epoch": 0.0018981835438032215, "grad_norm": 2.024900197982788, "learning_rate": 9.806308479691595e-05, "loss": 0.5806, "step": 18 }, { "epoch": 0.0018981835438032215, "eval_loss": 0.6141843199729919, "eval_runtime": 1299.0748, "eval_samples_per_second": 12.294, "eval_steps_per_second": 1.537, "step": 18 }, { "epoch": 0.002214547467770425, "grad_norm": 1.8922075033187866, "learning_rate": 9.635919272833938e-05, "loss": 0.487, "step": 21 }, { "epoch": 0.0025309113917376287, "grad_norm": 1.6137950420379639, "learning_rate": 9.414737964294636e-05, "loss": 0.473, "step": 24 }, { "epoch": 0.0028472753157048323, "grad_norm": 1.769119143486023, "learning_rate": 9.145187862775209e-05, "loss": 0.5101, "step": 27 }, { "epoch": 0.0028472753157048323, "eval_loss": 0.42539533972740173, "eval_runtime": 1301.7236, "eval_samples_per_second": 12.269, "eval_steps_per_second": 1.534, "step": 27 }, { "epoch": 0.003163639239672036, "grad_norm": 1.2414625883102417, "learning_rate": 8.83022221559489e-05, "loss": 0.3457, "step": 30 }, { "epoch": 0.0034800031636392395, "grad_norm": 1.1694670915603638, "learning_rate": 8.473291852294987e-05, "loss": 0.3041, "step": 33 }, { "epoch": 0.003796367087606443, "grad_norm": 1.5880285501480103, "learning_rate": 8.07830737662829e-05, "loss": 0.3631, "step": 36 }, { "epoch": 0.003796367087606443, "eval_loss": 0.3508250117301941, "eval_runtime": 1301.98, "eval_samples_per_second": 12.267, "eval_steps_per_second": 1.534, "step": 36 }, { "epoch": 0.004112731011573647, "grad_norm": 1.1332464218139648, "learning_rate": 7.649596321166024e-05, "loss": 0.3533, "step": 39 }, { "epoch": 0.00442909493554085, "grad_norm": 1.8899226188659668, "learning_rate": 7.191855733945387e-05, "loss": 0.4094, "step": 42 }, { "epoch": 0.004745458859508054, "grad_norm": 1.209465503692627, "learning_rate": 6.710100716628344e-05, "loss": 0.2916, "step": 45 }, { "epoch": 0.004745458859508054, "eval_loss": 0.3128226399421692, "eval_runtime": 1301.9125, "eval_samples_per_second": 12.267, "eval_steps_per_second": 1.534, "step": 45 }, { "epoch": 0.0050618227834752574, "grad_norm": 1.3826940059661865, "learning_rate": 6.209609477998338e-05, "loss": 0.5031, "step": 48 }, { "epoch": 0.0053781867074424615, "grad_norm": 1.3125900030136108, "learning_rate": 5.695865504800327e-05, "loss": 0.2013, "step": 51 }, { "epoch": 
0.005694550631409665, "grad_norm": 1.3176207542419434, "learning_rate": 5.174497483512506e-05, "loss": 0.3433, "step": 54 }, { "epoch": 0.005694550631409665, "eval_loss": 0.2877451777458191, "eval_runtime": 1302.1449, "eval_samples_per_second": 12.265, "eval_steps_per_second": 1.534, "step": 54 }, { "epoch": 0.006010914555376869, "grad_norm": 1.3576031923294067, "learning_rate": 4.6512176312793736e-05, "loss": 0.2684, "step": 57 }, { "epoch": 0.006327278479344072, "grad_norm": 1.230389952659607, "learning_rate": 4.131759111665349e-05, "loss": 0.2711, "step": 60 }, { "epoch": 0.006643642403311276, "grad_norm": 1.3148422241210938, "learning_rate": 3.6218132209150045e-05, "loss": 0.382, "step": 63 }, { "epoch": 0.006643642403311276, "eval_loss": 0.2793634235858917, "eval_runtime": 1301.6079, "eval_samples_per_second": 12.27, "eval_steps_per_second": 1.534, "step": 63 }, { "epoch": 0.006960006327278479, "grad_norm": 1.0743589401245117, "learning_rate": 3.12696703292044e-05, "loss": 0.2639, "step": 66 }, { "epoch": 0.007276370251245683, "grad_norm": 0.9537119269371033, "learning_rate": 2.6526421860705473e-05, "loss": 0.2925, "step": 69 }, { "epoch": 0.007592734175212886, "grad_norm": 1.0721756219863892, "learning_rate": 2.2040354826462668e-05, "loss": 0.2189, "step": 72 }, { "epoch": 0.007592734175212886, "eval_loss": 0.2621087431907654, "eval_runtime": 1299.3498, "eval_samples_per_second": 12.292, "eval_steps_per_second": 1.537, "step": 72 }, { "epoch": 0.00790909809918009, "grad_norm": 1.1351699829101562, "learning_rate": 1.7860619515673033e-05, "loss": 0.2786, "step": 75 }, { "epoch": 0.008225462023147294, "grad_norm": 1.0633392333984375, "learning_rate": 1.4033009983067452e-05, "loss": 0.2632, "step": 78 }, { "epoch": 0.008541825947114497, "grad_norm": 1.0229682922363281, "learning_rate": 1.0599462319663905e-05, "loss": 0.2519, "step": 81 }, { "epoch": 0.008541825947114497, "eval_loss": 0.2528908848762512, "eval_runtime": 1301.3962, "eval_samples_per_second": 12.272, "eval_steps_per_second": 1.535, "step": 81 }, { "epoch": 0.0088581898710817, "grad_norm": 0.9887948632240295, "learning_rate": 7.597595192178702e-06, "loss": 0.1914, "step": 84 }, { "epoch": 0.009174553795048905, "grad_norm": 0.971612811088562, "learning_rate": 5.060297685041659e-06, "loss": 0.2243, "step": 87 }, { "epoch": 0.009490917719016109, "grad_norm": 1.1088848114013672, "learning_rate": 3.0153689607045845e-06, "loss": 0.3923, "step": 90 }, { "epoch": 0.009490917719016109, "eval_loss": 0.2498694211244583, "eval_runtime": 1301.4191, "eval_samples_per_second": 12.272, "eval_steps_per_second": 1.534, "step": 90 }, { "epoch": 0.009807281642983312, "grad_norm": 1.1288279294967651, "learning_rate": 1.4852136862001764e-06, "loss": 0.1999, "step": 93 }, { "epoch": 0.010123645566950515, "grad_norm": 1.1936079263687134, "learning_rate": 4.865965629214819e-07, "loss": 0.1988, "step": 96 }, { "epoch": 0.01044000949091772, "grad_norm": 0.9933436512947083, "learning_rate": 3.04586490452119e-08, "loss": 0.4169, "step": 99 }, { "epoch": 0.01044000949091772, "eval_loss": 0.2488851249217987, "eval_runtime": 1301.5736, "eval_samples_per_second": 12.271, "eval_steps_per_second": 1.534, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 
7.811167717372723e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }