{ "best_metric": 1.7828339338302612, "best_model_checkpoint": "miner_id_24/checkpoint-250", "epoch": 0.007337618502538816, "eval_steps": 50, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.9350474010155263e-05, "eval_loss": 2.557575225830078, "eval_runtime": 211.0125, "eval_samples_per_second": 67.986, "eval_steps_per_second": 16.999, "step": 1 }, { "epoch": 0.0002935047401015526, "grad_norm": 0.6864449977874756, "learning_rate": 4.36e-05, "loss": 2.2522, "step": 10 }, { "epoch": 0.0005870094802031052, "grad_norm": 0.6921091675758362, "learning_rate": 8.72e-05, "loss": 2.2997, "step": 20 }, { "epoch": 0.000880514220304658, "grad_norm": 0.6702334880828857, "learning_rate": 0.0001308, "loss": 2.2192, "step": 30 }, { "epoch": 0.0011740189604062105, "grad_norm": 0.7354345321655273, "learning_rate": 0.0001744, "loss": 2.165, "step": 40 }, { "epoch": 0.0014675237005077632, "grad_norm": 0.91349196434021, "learning_rate": 0.000218, "loss": 1.7941, "step": 50 }, { "epoch": 0.0014675237005077632, "eval_loss": 2.014312267303467, "eval_runtime": 210.6307, "eval_samples_per_second": 68.11, "eval_steps_per_second": 17.03, "step": 50 }, { "epoch": 0.001761028440609316, "grad_norm": 0.6642831563949585, "learning_rate": 0.00021773448147832086, "loss": 1.8997, "step": 60 }, { "epoch": 0.0020545331807108685, "grad_norm": 0.5897936224937439, "learning_rate": 0.0002169392194928312, "loss": 2.0005, "step": 70 }, { "epoch": 0.002348037920812421, "grad_norm": 0.5885149240493774, "learning_rate": 0.00021561808847998484, "loss": 2.0533, "step": 80 }, { "epoch": 0.002641542660913974, "grad_norm": 0.9499412775039673, "learning_rate": 0.00021377752485727676, "loss": 2.1402, "step": 90 }, { "epoch": 0.0029350474010155264, "grad_norm": 0.7471547722816467, "learning_rate": 0.00021142649566566402, "loss": 1.5838, "step": 100 }, { "epoch": 0.0029350474010155264, "eval_loss": 1.9189231395721436, "eval_runtime": 210.9482, "eval_samples_per_second": 68.007, "eval_steps_per_second": 17.004, "step": 100 }, { "epoch": 0.003228552141117079, "grad_norm": 0.5224080085754395, "learning_rate": 0.0002085764548830435, "loss": 1.8536, "step": 110 }, { "epoch": 0.003522056881218632, "grad_norm": 0.6087786555290222, "learning_rate": 0.00020524128762162305, "loss": 1.9387, "step": 120 }, { "epoch": 0.0038155616213201844, "grad_norm": 0.649080216884613, "learning_rate": 0.00020143724248105043, "loss": 1.9686, "step": 130 }, { "epoch": 0.004109066361421737, "grad_norm": 0.6359239220619202, "learning_rate": 0.0001971828523868693, "loss": 1.9197, "step": 140 }, { "epoch": 0.00440257110152329, "grad_norm": 0.5560302138328552, "learning_rate": 0.0001924988442999686, "loss": 1.6309, "step": 150 }, { "epoch": 0.00440257110152329, "eval_loss": 1.8461153507232666, "eval_runtime": 211.9482, "eval_samples_per_second": 67.686, "eval_steps_per_second": 16.924, "step": 150 }, { "epoch": 0.004696075841624842, "grad_norm": 0.48622700572013855, "learning_rate": 0.00018740803823691298, "loss": 1.7917, "step": 160 }, { "epoch": 0.004989580581726395, "grad_norm": 0.49764740467071533, "learning_rate": 0.00018193523609311556, "loss": 1.8452, "step": 170 }, { "epoch": 0.005283085321827948, "grad_norm": 0.5556671619415283, "learning_rate": 0.00017610710081049675, "loss": 1.8311, "step": 180 }, { "epoch": 0.0055765900619295, "grad_norm": 0.5455553531646729, "learning_rate": 0.00016995202647831142, "loss": 1.8376, "step": 190 }, { "epoch": 0.005870094802031053, "grad_norm": 0.49670788645744324, "learning_rate": 0.00016350000000000002, "loss": 1.5889, "step": 200 }, { "epoch": 0.005870094802031053, "eval_loss": 1.8115742206573486, "eval_runtime": 210.7647, "eval_samples_per_second": 68.066, "eval_steps_per_second": 17.019, "step": 200 }, { "epoch": 0.006163599542132606, "grad_norm": 0.4287732243537903, "learning_rate": 0.00015678245500000943, "loss": 1.8633, "step": 210 }, { "epoch": 0.006457104282234158, "grad_norm": 0.4952814280986786, "learning_rate": 0.00014983211868233444, "loss": 1.7891, "step": 220 }, { "epoch": 0.006750609022335711, "grad_norm": 0.6276448965072632, "learning_rate": 0.00014268285238686927, "loss": 1.9548, "step": 230 }, { "epoch": 0.007044113762437264, "grad_norm": 0.7252087593078613, "learning_rate": 0.00013536948662036378, "loss": 1.8363, "step": 240 }, { "epoch": 0.007337618502538816, "grad_norm": 0.5657986998558044, "learning_rate": 0.00012792765136569544, "loss": 1.464, "step": 250 }, { "epoch": 0.007337618502538816, "eval_loss": 1.7828339338302612, "eval_runtime": 211.3794, "eval_samples_per_second": 67.868, "eval_steps_per_second": 16.969, "step": 250 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4009708431605760.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }