{ "best_metric": 2.6884331703186035, "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy_no-num-adj-earlystop-bpe_seed-211_1e-3/checkpoint-44480", "epoch": 19.99134539732494, "eval_steps": 500, "global_step": 44480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.44958974935371476, "grad_norm": 0.4794260859489441, "learning_rate": 3.125e-05, "loss": 5.5578, "step": 1000 }, { "epoch": 0.8991794987074295, "grad_norm": 0.6688742637634277, "learning_rate": 6.25e-05, "loss": 4.0839, "step": 2000 }, { "epoch": 1.0, "eval_accuracy": 0.3593648454767537, "eval_loss": 3.831916093826294, "eval_runtime": 74.178, "eval_samples_per_second": 818.033, "eval_steps_per_second": 12.794, "step": 2225 }, { "epoch": 1.348432055749129, "grad_norm": 0.6358478665351868, "learning_rate": 9.375e-05, "loss": 3.6915, "step": 3000 }, { "epoch": 1.7980218051028438, "grad_norm": 0.5631846189498901, "learning_rate": 0.000125, "loss": 3.4456, "step": 4000 }, { "epoch": 2.0, "eval_accuracy": 0.4073226310959453, "eval_loss": 3.3209149837493896, "eval_runtime": 74.0924, "eval_samples_per_second": 818.978, "eval_steps_per_second": 12.808, "step": 4450 }, { "epoch": 2.2472743621445432, "grad_norm": 0.4991568326950073, "learning_rate": 0.00015625, "loss": 3.2447, "step": 5000 }, { "epoch": 2.696864111498258, "grad_norm": 0.4847903549671173, "learning_rate": 0.0001875, "loss": 3.1244, "step": 6000 }, { "epoch": 3.0, "eval_accuracy": 0.4285145475461114, "eval_loss": 3.104708194732666, "eval_runtime": 73.8, "eval_samples_per_second": 822.222, "eval_steps_per_second": 12.859, "step": 6675 }, { "epoch": 3.1461166685399573, "grad_norm": 0.4304514229297638, "learning_rate": 0.00021875, "loss": 3.0269, "step": 7000 }, { "epoch": 3.595706417893672, "grad_norm": 0.40505334734916687, "learning_rate": 0.00025, "loss": 2.9574, "step": 8000 }, { "epoch": 4.0, "eval_accuracy": 0.43970219861181126, "eval_loss": 3.0002548694610596, "eval_runtime": 73.4836, "eval_samples_per_second": 825.763, "eval_steps_per_second": 12.914, "step": 8900 }, { "epoch": 4.044958974935372, "grad_norm": 0.37327685952186584, "learning_rate": 0.00028125000000000003, "loss": 2.9085, "step": 9000 }, { "epoch": 4.4945487242890865, "grad_norm": 0.3851179778575897, "learning_rate": 0.0003125, "loss": 2.8553, "step": 10000 }, { "epoch": 4.944138473642801, "grad_norm": 0.351309210062027, "learning_rate": 0.00034375, "loss": 2.8363, "step": 11000 }, { "epoch": 5.0, "eval_accuracy": 0.44639368206082697, "eval_loss": 2.9325950145721436, "eval_runtime": 73.3051, "eval_samples_per_second": 827.773, "eval_steps_per_second": 12.946, "step": 11125 }, { "epoch": 5.3933910306845005, "grad_norm": 0.30397000908851624, "learning_rate": 0.000375, "loss": 2.7841, "step": 12000 }, { "epoch": 5.842980780038215, "grad_norm": 0.3001512587070465, "learning_rate": 0.00040625000000000004, "loss": 2.7798, "step": 13000 }, { "epoch": 6.0, "eval_accuracy": 0.4501730065790324, "eval_loss": 2.893338203430176, "eval_runtime": 72.6459, "eval_samples_per_second": 835.284, "eval_steps_per_second": 13.063, "step": 13350 }, { "epoch": 6.292233337079915, "grad_norm": 0.31525805592536926, "learning_rate": 0.0004375, "loss": 2.7452, "step": 14000 }, { "epoch": 6.741823086433629, "grad_norm": 0.2669561803340912, "learning_rate": 0.00046871875, "loss": 2.7383, "step": 15000 }, { "epoch": 7.0, "eval_accuracy": 0.4534228417800871, "eval_loss": 2.866095542907715, "eval_runtime": 73.0212, "eval_samples_per_second": 830.991, "eval_steps_per_second": 12.996, "step": 15575 }, { "epoch": 7.191075643475329, "grad_norm": 0.26346272230148315, "learning_rate": 0.00049996875, "loss": 2.7114, "step": 16000 }, { "epoch": 7.640665392829043, "grad_norm": 0.2594705820083618, "learning_rate": 0.00053121875, "loss": 2.706, "step": 17000 }, { "epoch": 8.0, "eval_accuracy": 0.45629784016441116, "eval_loss": 2.8461852073669434, "eval_runtime": 73.5549, "eval_samples_per_second": 824.962, "eval_steps_per_second": 12.902, "step": 17800 }, { "epoch": 8.089917949870744, "grad_norm": 0.2530349791049957, "learning_rate": 0.0005624375, "loss": 2.6931, "step": 18000 }, { "epoch": 8.539507699224458, "grad_norm": 0.2322220653295517, "learning_rate": 0.0005936875, "loss": 2.673, "step": 19000 }, { "epoch": 8.989097448578173, "grad_norm": 0.229995995759964, "learning_rate": 0.0006249375000000001, "loss": 2.6843, "step": 20000 }, { "epoch": 9.0, "eval_accuracy": 0.4581340881771298, "eval_loss": 2.828643321990967, "eval_runtime": 72.9062, "eval_samples_per_second": 832.303, "eval_steps_per_second": 13.017, "step": 20025 }, { "epoch": 9.438350005619872, "grad_norm": 0.2317390888929367, "learning_rate": 0.0006561562500000001, "loss": 2.6473, "step": 21000 }, { "epoch": 9.887939754973587, "grad_norm": 0.21522429585456848, "learning_rate": 0.00068740625, "loss": 2.6625, "step": 22000 }, { "epoch": 10.0, "eval_accuracy": 0.4588648907156798, "eval_loss": 2.8217990398406982, "eval_runtime": 73.1813, "eval_samples_per_second": 829.173, "eval_steps_per_second": 12.968, "step": 22250 }, { "epoch": 10.337192312015286, "grad_norm": 0.22456763684749603, "learning_rate": 0.00071865625, "loss": 2.6346, "step": 23000 }, { "epoch": 10.786782061369001, "grad_norm": 0.2095971554517746, "learning_rate": 0.000749875, "loss": 2.6482, "step": 24000 }, { "epoch": 11.0, "eval_accuracy": 0.4596921167939819, "eval_loss": 2.8130385875701904, "eval_runtime": 72.6333, "eval_samples_per_second": 835.429, "eval_steps_per_second": 13.066, "step": 24475 }, { "epoch": 11.2360346184107, "grad_norm": 0.20169340074062347, "learning_rate": 0.000781125, "loss": 2.6289, "step": 25000 }, { "epoch": 11.685624367764415, "grad_norm": 0.19152085483074188, "learning_rate": 0.00081234375, "loss": 2.6327, "step": 26000 }, { "epoch": 12.0, "eval_accuracy": 0.46038278594232684, "eval_loss": 2.807528495788574, "eval_runtime": 72.4528, "eval_samples_per_second": 837.51, "eval_steps_per_second": 13.098, "step": 26700 }, { "epoch": 12.134876924806115, "grad_norm": 0.1980164498090744, "learning_rate": 0.00084359375, "loss": 2.6247, "step": 27000 }, { "epoch": 12.58446667415983, "grad_norm": 0.18324294686317444, "learning_rate": 0.0008748437500000001, "loss": 2.6196, "step": 28000 }, { "epoch": 13.0, "eval_accuracy": 0.4610464409890522, "eval_loss": 2.7995188236236572, "eval_runtime": 72.6154, "eval_samples_per_second": 835.635, "eval_steps_per_second": 13.069, "step": 28925 }, { "epoch": 13.033719231201529, "grad_norm": 0.19393277168273926, "learning_rate": 0.0009060625, "loss": 2.6278, "step": 29000 }, { "epoch": 13.483308980555243, "grad_norm": 0.18423600494861603, "learning_rate": 0.0009373125, "loss": 2.6053, "step": 30000 }, { "epoch": 13.932898729908958, "grad_norm": 0.17649294435977936, "learning_rate": 0.00096853125, "loss": 2.6254, "step": 31000 }, { "epoch": 14.0, "eval_accuracy": 0.46197015523414375, "eval_loss": 2.7950985431671143, "eval_runtime": 72.3838, "eval_samples_per_second": 838.309, "eval_steps_per_second": 13.111, "step": 31150 }, { "epoch": 14.382151286950657, "grad_norm": 0.18526628613471985, "learning_rate": 0.00099978125, "loss": 2.596, "step": 32000 }, { "epoch": 14.831741036304372, "grad_norm": 0.18303723633289337, "learning_rate": 0.0009205128205128205, "loss": 2.6119, "step": 33000 }, { "epoch": 15.0, "eval_accuracy": 0.46398089624775424, "eval_loss": 2.7756266593933105, "eval_runtime": 72.4994, "eval_samples_per_second": 836.973, "eval_steps_per_second": 13.09, "step": 33375 }, { "epoch": 15.280993593346071, "grad_norm": 0.17772276699543, "learning_rate": 0.0008403846153846155, "loss": 2.5714, "step": 34000 }, { "epoch": 15.730583342699786, "grad_norm": 0.17440925538539886, "learning_rate": 0.0007603365384615385, "loss": 2.5659, "step": 35000 }, { "epoch": 16.0, "eval_accuracy": 0.467761513306707, "eval_loss": 2.750791072845459, "eval_runtime": 72.6994, "eval_samples_per_second": 834.669, "eval_steps_per_second": 13.054, "step": 35600 }, { "epoch": 16.179835899741487, "grad_norm": 0.18570850789546967, "learning_rate": 0.0006802083333333333, "loss": 2.5372, "step": 36000 }, { "epoch": 16.6294256490952, "grad_norm": 0.17522676289081573, "learning_rate": 0.0006000801282051283, "loss": 2.5146, "step": 37000 }, { "epoch": 17.0, "eval_accuracy": 0.47085624361808004, "eval_loss": 2.7269184589385986, "eval_runtime": 72.7436, "eval_samples_per_second": 834.162, "eval_steps_per_second": 13.046, "step": 37825 }, { "epoch": 17.0786782061369, "grad_norm": 0.18600726127624512, "learning_rate": 0.0005201121794871794, "loss": 2.4995, "step": 38000 }, { "epoch": 17.528267955490616, "grad_norm": 0.19313445687294006, "learning_rate": 0.0004399839743589744, "loss": 2.4549, "step": 39000 }, { "epoch": 17.97785770484433, "grad_norm": 0.1866413950920105, "learning_rate": 0.0003598557692307692, "loss": 2.4598, "step": 40000 }, { "epoch": 18.0, "eval_accuracy": 0.4740384143110112, "eval_loss": 2.706336736679077, "eval_runtime": 72.6636, "eval_samples_per_second": 835.082, "eval_steps_per_second": 13.06, "step": 40050 }, { "epoch": 18.42711026188603, "grad_norm": 0.19298428297042847, "learning_rate": 0.0002797275641025641, "loss": 2.3904, "step": 41000 }, { "epoch": 18.876700011239745, "grad_norm": 0.19698494672775269, "learning_rate": 0.0001996794871794872, "loss": 2.394, "step": 42000 }, { "epoch": 19.0, "eval_accuracy": 0.47701274445176883, "eval_loss": 2.690155506134033, "eval_runtime": 72.5061, "eval_samples_per_second": 836.895, "eval_steps_per_second": 13.089, "step": 42275 }, { "epoch": 19.325952568281444, "grad_norm": 0.20177572965621948, "learning_rate": 0.00011955128205128206, "loss": 2.3376, "step": 43000 }, { "epoch": 19.775542317635157, "grad_norm": 0.19659394025802612, "learning_rate": 3.950320512820513e-05, "loss": 2.3215, "step": 44000 }, { "epoch": 19.99134539732494, "eval_accuracy": 0.4782639238952008, "eval_loss": 2.6884331703186035, "eval_runtime": 72.6335, "eval_samples_per_second": 835.427, "eval_steps_per_second": 13.066, "step": 44480 }, { "epoch": 19.99134539732494, "step": 44480, "total_flos": 1.487139158163456e+18, "train_loss": 2.8006602801864955, "train_runtime": 30100.0214, "train_samples_per_second": 378.335, "train_steps_per_second": 1.478 } ], "logging_steps": 1000, "max_steps": 44480, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.487139158163456e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }