{ "best_metric": 9.273290634155273, "best_model_checkpoint": "miner_id_24/checkpoint-300", "epoch": 0.353045013239188, "eval_steps": 50, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011768167107972932, "eval_loss": 10.379207611083984, "eval_runtime": 3.1746, "eval_samples_per_second": 451.087, "eval_steps_per_second": 112.772, "step": 1 }, { "epoch": 0.011768167107972934, "grad_norm": 0.5737844705581665, "learning_rate": 0.0002, "loss": 10.3536, "step": 10 }, { "epoch": 0.023536334215945868, "grad_norm": 0.5810375809669495, "learning_rate": 0.0001998582695676762, "loss": 10.3108, "step": 20 }, { "epoch": 0.0353045013239188, "grad_norm": 0.5553200840950012, "learning_rate": 0.00019943348002101371, "loss": 10.2622, "step": 30 }, { "epoch": 0.047072668431891736, "grad_norm": 0.559855043888092, "learning_rate": 0.00019872683547213446, "loss": 10.2096, "step": 40 }, { "epoch": 0.05884083553986467, "grad_norm": 0.5475898385047913, "learning_rate": 0.00019774033898178667, "loss": 10.1577, "step": 50 }, { "epoch": 0.05884083553986467, "eval_loss": 10.128434181213379, "eval_runtime": 3.1442, "eval_samples_per_second": 455.443, "eval_steps_per_second": 113.861, "step": 50 }, { "epoch": 0.0706090026478376, "grad_norm": 3.1386730670928955, "learning_rate": 0.0001964767868814516, "loss": 10.1068, "step": 60 }, { "epoch": 0.08237716975581054, "grad_norm": 0.5707929134368896, "learning_rate": 0.00019493976084683813, "loss": 10.0549, "step": 70 }, { "epoch": 0.09414533686378347, "grad_norm": 0.5704991221427917, "learning_rate": 0.00019313361774523385, "loss": 10.006, "step": 80 }, { "epoch": 0.1059135039717564, "grad_norm": 0.5538395643234253, "learning_rate": 0.00019106347728549135, "loss": 9.9593, "step": 90 }, { "epoch": 0.11768167107972934, "grad_norm": 0.6376510858535767, "learning_rate": 0.00018873520750565718, "loss": 9.912, "step": 100 }, { "epoch": 0.11768167107972934, "eval_loss": 9.88634204864502, "eval_runtime": 3.1338, "eval_samples_per_second": 456.959, "eval_steps_per_second": 114.24, "step": 100 }, { "epoch": 0.12944983818770225, "grad_norm": 1.2404887676239014, "learning_rate": 0.0001861554081393806, "loss": 9.8677, "step": 110 }, { "epoch": 0.1412180052956752, "grad_norm": 0.5645771622657776, "learning_rate": 0.0001833313919082515, "loss": 9.8202, "step": 120 }, { "epoch": 0.15298617240364812, "grad_norm": 0.6336705088615417, "learning_rate": 0.00018027116379309638, "loss": 9.7735, "step": 130 }, { "epoch": 0.16475433951162108, "grad_norm": 22.544200897216797, "learning_rate": 0.00017698339834299061, "loss": 9.7257, "step": 140 }, { "epoch": 0.176522506619594, "grad_norm": 1.2222896814346313, "learning_rate": 0.00017347741508630672, "loss": 9.6954, "step": 150 }, { "epoch": 0.176522506619594, "eval_loss": 9.663827896118164, "eval_runtime": 3.1098, "eval_samples_per_second": 460.483, "eval_steps_per_second": 115.121, "step": 150 }, { "epoch": 0.18829067372756694, "grad_norm": 0.5674024224281311, "learning_rate": 0.0001697631521134985, "loss": 9.6526, "step": 160 }, { "epoch": 0.20005884083553988, "grad_norm": 0.5667808651924133, "learning_rate": 0.00016585113790650388, "loss": 9.613, "step": 170 }, { "epoch": 0.2118270079435128, "grad_norm": 0.5394534468650818, "learning_rate": 0.0001617524614946192, "loss": 9.5741, "step": 180 }, { "epoch": 0.22359517505148574, "grad_norm": 0.5556952357292175, "learning_rate": 0.0001574787410214407, "loss": 9.5348, "step": 190 }, { "epoch": 0.23536334215945867, "grad_norm": 0.5671817064285278, "learning_rate": 0.00015304209081197425, "loss": 9.4987, "step": 200 }, { "epoch": 0.23536334215945867, "eval_loss": 9.482871055603027, "eval_runtime": 3.1933, "eval_samples_per_second": 448.444, "eval_steps_per_second": 112.111, "step": 200 }, { "epoch": 0.2471315092674316, "grad_norm": 0.7693024277687073, "learning_rate": 0.00014845508703326504, "loss": 9.4781, "step": 210 }, { "epoch": 0.2588996763754045, "grad_norm": 3.6308693885803223, "learning_rate": 0.00014373073204588556, "loss": 9.4655, "step": 220 }, { "epoch": 0.27066784348337747, "grad_norm": 0.5690402388572693, "learning_rate": 0.00013888241754733208, "loss": 9.4397, "step": 230 }, { "epoch": 0.2824360105913504, "grad_norm": 0.5628238916397095, "learning_rate": 0.00013392388661180303, "loss": 9.4196, "step": 240 }, { "epoch": 0.29420417769932333, "grad_norm": 0.5627617835998535, "learning_rate": 0.0001288691947339621, "loss": 9.3916, "step": 250 }, { "epoch": 0.29420417769932333, "eval_loss": 9.378963470458984, "eval_runtime": 3.1537, "eval_samples_per_second": 454.067, "eval_steps_per_second": 113.517, "step": 250 }, { "epoch": 0.30597234480729624, "grad_norm": 0.5669119954109192, "learning_rate": 0.0001237326699871115, "loss": 9.3743, "step": 260 }, { "epoch": 0.3177405119152692, "grad_norm": 0.5632140040397644, "learning_rate": 0.00011852887240871145, "loss": 9.3444, "step": 270 }, { "epoch": 0.32950867902324216, "grad_norm": 0.5589627623558044, "learning_rate": 0.00011327255272837221, "loss": 9.3176, "step": 280 }, { "epoch": 0.34127684613121506, "grad_norm": 0.5277190208435059, "learning_rate": 0.00010797861055530831, "loss": 9.2899, "step": 290 }, { "epoch": 0.353045013239188, "grad_norm": 0.5792800784111023, "learning_rate": 0.00010266205214377748, "loss": 9.2735, "step": 300 }, { "epoch": 0.353045013239188, "eval_loss": 9.273290634155273, "eval_runtime": 3.1419, "eval_samples_per_second": 455.778, "eval_steps_per_second": 113.945, "step": 300 } ], "logging_steps": 10, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 64247775363072.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }