{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2, "eval_steps": 500, "global_step": 80, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "Batch Mean": 3.64324951171875, "accuracy": 0.6015625, "epoch": 0, "step": 0 }, { "epoch": 0.0025, "grad_norm": 6.657606601715088, "learning_rate": 2.5000000000000004e-07, "loss": 0.6742, "step": 1 }, { "Batch Mean": 3.58660888671875, "accuracy": 0.5, "epoch": 0.0025, "step": 1 }, { "epoch": 0.005, "grad_norm": 5.2591233253479, "learning_rate": 5.000000000000001e-07, "loss": 0.6808, "step": 2 }, { "Batch Mean": 3.56298828125, "accuracy": 0.546875, "epoch": 0.005, "step": 2 }, { "epoch": 0.0075, "grad_norm": 5.481376647949219, "learning_rate": 7.5e-07, "loss": 0.6846, "step": 3 }, { "Batch Mean": 3.60186767578125, "accuracy": 0.5625, "epoch": 0.0075, "step": 3 }, { "epoch": 0.01, "grad_norm": 7.51620626449585, "learning_rate": 1.0000000000000002e-06, "loss": 0.6892, "step": 4 }, { "Batch Mean": 3.57855224609375, "accuracy": 0.453125, "epoch": 0.01, "step": 4 }, { "epoch": 0.0125, "grad_norm": 8.899469375610352, "learning_rate": 1.25e-06, "loss": 0.6956, "step": 5 }, { "Batch Mean": 3.595703125, "accuracy": 0.5546875, "epoch": 0.0125, "step": 5 }, { "epoch": 0.015, "grad_norm": 6.726147174835205, "learning_rate": 1.5e-06, "loss": 0.6839, "step": 6 }, { "Batch Mean": 3.64105224609375, "accuracy": 0.453125, "epoch": 0.015, "step": 6 }, { "epoch": 0.0175, "grad_norm": 7.116271018981934, "learning_rate": 1.75e-06, "loss": 0.7154, "step": 7 }, { "Batch Mean": 3.6541748046875, "accuracy": 0.5546875, "epoch": 0.0175, "step": 7 }, { "epoch": 0.02, "grad_norm": 6.190150260925293, "learning_rate": 2.0000000000000003e-06, "loss": 0.6844, "step": 8 }, { "Batch Mean": 3.6507568359375, "accuracy": 0.6015625, "epoch": 0.02, "step": 8 }, { "epoch": 0.0225, "grad_norm": 7.176676273345947, "learning_rate": 2.25e-06, "loss": 0.665, "step": 9 }, { "Batch Mean": 3.6649169921875, "accuracy": 0.59375, "epoch": 0.0225, "step": 9 }, { "epoch": 0.025, "grad_norm": 5.139154434204102, "learning_rate": 2.5e-06, "loss": 0.6626, "step": 10 }, { "Batch Mean": 3.70733642578125, "accuracy": 0.671875, "epoch": 0.025, "step": 10 }, { "epoch": 0.0275, "grad_norm": 5.621807098388672, "learning_rate": 2.7500000000000004e-06, "loss": 0.6299, "step": 11 }, { "Batch Mean": 3.74017333984375, "accuracy": 0.65625, "epoch": 0.0275, "step": 11 }, { "epoch": 0.03, "grad_norm": 5.459871768951416, "learning_rate": 3e-06, "loss": 0.6321, "step": 12 }, { "Batch Mean": 3.8514404296875, "accuracy": 0.625, "epoch": 0.03, "step": 12 }, { "epoch": 0.0325, "grad_norm": 7.757749557495117, "learning_rate": 3.2500000000000002e-06, "loss": 0.6453, "step": 13 }, { "Batch Mean": 3.80059814453125, "accuracy": 0.703125, "epoch": 0.0325, "step": 13 }, { "epoch": 0.035, "grad_norm": 6.681384086608887, "learning_rate": 3.5e-06, "loss": 0.5769, "step": 14 }, { "Batch Mean": 3.8978271484375, "accuracy": 0.6640625, "epoch": 0.035, "step": 14 }, { "epoch": 0.0375, "grad_norm": 10.110990524291992, "learning_rate": 3.7500000000000005e-06, "loss": 0.6354, "step": 15 }, { "Batch Mean": 3.6146774291992188, "accuracy": 0.71875, "epoch": 0.0375, "step": 15 }, { "epoch": 0.04, "grad_norm": 10.233904838562012, "learning_rate": 4.000000000000001e-06, "loss": 0.6182, "step": 16 }, { "Batch Mean": 3.5331335067749023, "accuracy": 0.6328125, "epoch": 0.04, "step": 16 }, { "epoch": 0.0425, "grad_norm": 10.035528182983398, "learning_rate": 4.25e-06, "loss": 0.6252, "step": 17 }, { "Batch Mean": 3.0357871055603027, "accuracy": 0.625, "epoch": 0.0425, "step": 17 }, { "epoch": 0.045, "grad_norm": 9.972918510437012, "learning_rate": 4.5e-06, "loss": 0.6225, "step": 18 }, { "Batch Mean": 2.6747031211853027, "accuracy": 0.6796875, "epoch": 0.045, "step": 18 }, { "epoch": 0.0475, "grad_norm": 21.3759765625, "learning_rate": 4.75e-06, "loss": 0.6328, "step": 19 }, { "Batch Mean": 2.418215751647949, "accuracy": 0.6875, "epoch": 0.0475, "step": 19 }, { "epoch": 0.05, "grad_norm": 10.712898254394531, "learning_rate": 5e-06, "loss": 0.6352, "step": 20 }, { "Batch Mean": 1.9824256896972656, "accuracy": 0.640625, "epoch": 0.05, "step": 20 }, { "epoch": 0.0525, "grad_norm": 10.833212852478027, "learning_rate": 4.986842105263158e-06, "loss": 0.6244, "step": 21 }, { "Batch Mean": 1.5567502975463867, "accuracy": 0.6484375, "epoch": 0.0525, "step": 21 }, { "epoch": 0.055, "grad_norm": 8.710196495056152, "learning_rate": 4.973684210526316e-06, "loss": 0.6002, "step": 22 }, { "Batch Mean": 1.262591004371643, "accuracy": 0.6484375, "epoch": 0.055, "step": 22 }, { "epoch": 0.0575, "grad_norm": 9.207337379455566, "learning_rate": 4.960526315789474e-06, "loss": 0.5754, "step": 23 }, { "Batch Mean": 0.9924072027206421, "accuracy": 0.7109375, "epoch": 0.0575, "step": 23 }, { "epoch": 0.06, "grad_norm": 7.179128170013428, "learning_rate": 4.947368421052632e-06, "loss": 0.5446, "step": 24 }, { "Batch Mean": 0.7883305549621582, "accuracy": 0.6484375, "epoch": 0.06, "step": 24 }, { "epoch": 0.0625, "grad_norm": 7.777095794677734, "learning_rate": 4.9342105263157895e-06, "loss": 0.6383, "step": 25 }, { "Batch Mean": 0.7204087972640991, "accuracy": 0.7109375, "epoch": 0.0625, "step": 25 }, { "epoch": 0.065, "grad_norm": 6.316252708435059, "learning_rate": 4.921052631578948e-06, "loss": 0.5416, "step": 26 }, { "Batch Mean": 0.7001075744628906, "accuracy": 0.75, "epoch": 0.065, "step": 26 }, { "epoch": 0.0675, "grad_norm": 6.2172017097473145, "learning_rate": 4.907894736842106e-06, "loss": 0.5603, "step": 27 }, { "Batch Mean": 0.4948960542678833, "accuracy": 0.6640625, "epoch": 0.0675, "step": 27 }, { "epoch": 0.07, "grad_norm": 6.071000576019287, "learning_rate": 4.894736842105264e-06, "loss": 0.5919, "step": 28 }, { "Batch Mean": 0.6193783283233643, "accuracy": 0.609375, "epoch": 0.07, "step": 28 }, { "epoch": 0.0725, "grad_norm": 7.123428821563721, "learning_rate": 4.881578947368422e-06, "loss": 0.6265, "step": 29 }, { "Batch Mean": 0.5463962554931641, "accuracy": 0.6640625, "epoch": 0.0725, "step": 29 }, { "epoch": 0.075, "grad_norm": 8.785526275634766, "learning_rate": 4.8684210526315795e-06, "loss": 0.6251, "step": 30 }, { "Batch Mean": 0.6883676052093506, "accuracy": 0.7265625, "epoch": 0.075, "step": 30 }, { "epoch": 0.0775, "grad_norm": 6.22755241394043, "learning_rate": 4.855263157894737e-06, "loss": 0.5505, "step": 31 }, { "Batch Mean": 0.8646153211593628, "accuracy": 0.71875, "epoch": 0.0775, "step": 31 }, { "epoch": 0.08, "grad_norm": 5.980678558349609, "learning_rate": 4.842105263157895e-06, "loss": 0.5817, "step": 32 }, { "Batch Mean": 0.9955297708511353, "accuracy": 0.7109375, "epoch": 0.08, "step": 32 }, { "epoch": 0.0825, "grad_norm": 5.80774450302124, "learning_rate": 4.828947368421053e-06, "loss": 0.5697, "step": 33 }, { "Batch Mean": 1.163506031036377, "accuracy": 0.7421875, "epoch": 0.0825, "step": 33 }, { "epoch": 0.085, "grad_norm": 7.277930736541748, "learning_rate": 4.815789473684211e-06, "loss": 0.5299, "step": 34 }, { "Batch Mean": 1.314648151397705, "accuracy": 0.7109375, "epoch": 0.085, "step": 34 }, { "epoch": 0.0875, "grad_norm": 6.664306163787842, "learning_rate": 4.802631578947369e-06, "loss": 0.5548, "step": 35 }, { "Batch Mean": 1.4656352996826172, "accuracy": 0.7421875, "epoch": 0.0875, "step": 35 }, { "epoch": 0.09, "grad_norm": 6.886595249176025, "learning_rate": 4.789473684210527e-06, "loss": 0.5432, "step": 36 }, { "Batch Mean": 1.5227254629135132, "accuracy": 0.7109375, "epoch": 0.09, "step": 36 }, { "epoch": 0.0925, "grad_norm": 6.418576717376709, "learning_rate": 4.7763157894736844e-06, "loss": 0.5536, "step": 37 }, { "Batch Mean": 1.631667137145996, "accuracy": 0.71875, "epoch": 0.0925, "step": 37 }, { "epoch": 0.095, "grad_norm": 8.560834884643555, "learning_rate": 4.763157894736842e-06, "loss": 0.569, "step": 38 }, { "Batch Mean": 1.6476902961730957, "accuracy": 0.734375, "epoch": 0.095, "step": 38 }, { "epoch": 0.0975, "grad_norm": 8.121230125427246, "learning_rate": 4.75e-06, "loss": 0.5418, "step": 39 }, { "Batch Mean": 1.492204189300537, "accuracy": 0.71875, "epoch": 0.0975, "step": 39 }, { "epoch": 0.1, "grad_norm": 7.698504447937012, "learning_rate": 4.736842105263158e-06, "loss": 0.5079, "step": 40 }, { "Batch Mean": 1.3397908210754395, "accuracy": 0.75, "epoch": 0.1, "step": 40 }, { "epoch": 0.1025, "grad_norm": 8.164704322814941, "learning_rate": 4.723684210526316e-06, "loss": 0.5291, "step": 41 }, { "Batch Mean": 1.4944896697998047, "accuracy": 0.75, "epoch": 0.1025, "step": 41 }, { "epoch": 0.105, "grad_norm": 7.740182399749756, "learning_rate": 4.710526315789474e-06, "loss": 0.5375, "step": 42 }, { "Batch Mean": 1.322850227355957, "accuracy": 0.765625, "epoch": 0.105, "step": 42 }, { "epoch": 0.1075, "grad_norm": 7.922670841217041, "learning_rate": 4.697368421052632e-06, "loss": 0.4777, "step": 43 }, { "Batch Mean": 1.4683116674423218, "accuracy": 0.7578125, "epoch": 0.1075, "step": 43 }, { "epoch": 0.11, "grad_norm": 8.393726348876953, "learning_rate": 4.68421052631579e-06, "loss": 0.4728, "step": 44 }, { "Batch Mean": 1.6116371154785156, "accuracy": 0.7109375, "epoch": 0.11, "step": 44 }, { "epoch": 0.1125, "grad_norm": 9.989266395568848, "learning_rate": 4.671052631578948e-06, "loss": 0.5428, "step": 45 }, { "Batch Mean": 1.52450692653656, "accuracy": 0.7890625, "epoch": 0.1125, "step": 45 }, { "epoch": 0.115, "grad_norm": 9.131291389465332, "learning_rate": 4.657894736842106e-06, "loss": 0.4236, "step": 46 }, { "Batch Mean": 1.776949405670166, "accuracy": 0.71875, "epoch": 0.115, "step": 46 }, { "epoch": 0.1175, "grad_norm": 11.912744522094727, "learning_rate": 4.6447368421052635e-06, "loss": 0.6295, "step": 47 }, { "Batch Mean": 2.3817920684814453, "accuracy": 0.6875, "epoch": 0.1175, "step": 47 }, { "epoch": 0.12, "grad_norm": 10.917041778564453, "learning_rate": 4.631578947368421e-06, "loss": 0.5316, "step": 48 }, { "Batch Mean": 2.175340175628662, "accuracy": 0.7734375, "epoch": 0.12, "step": 48 }, { "epoch": 0.1225, "grad_norm": 8.514415740966797, "learning_rate": 4.618421052631579e-06, "loss": 0.4724, "step": 49 }, { "Batch Mean": 2.521066665649414, "accuracy": 0.65625, "epoch": 0.1225, "step": 49 }, { "epoch": 0.125, "grad_norm": 10.435226440429688, "learning_rate": 4.605263157894737e-06, "loss": 0.6361, "step": 50 }, { "Batch Mean": 2.40045166015625, "accuracy": 0.7578125, "epoch": 0.125, "step": 50 }, { "epoch": 0.1275, "grad_norm": 7.130827903747559, "learning_rate": 4.592105263157895e-06, "loss": 0.4949, "step": 51 }, { "Batch Mean": 2.7120556831359863, "accuracy": 0.6953125, "epoch": 0.1275, "step": 51 }, { "epoch": 0.13, "grad_norm": 6.832071304321289, "learning_rate": 4.578947368421053e-06, "loss": 0.5509, "step": 52 }, { "Batch Mean": 2.6350326538085938, "accuracy": 0.7109375, "epoch": 0.13, "step": 52 }, { "epoch": 0.1325, "grad_norm": 6.527810096740723, "learning_rate": 4.565789473684211e-06, "loss": 0.5334, "step": 53 }, { "Batch Mean": 2.8320467472076416, "accuracy": 0.75, "epoch": 0.1325, "step": 53 }, { "epoch": 0.135, "grad_norm": 6.686796188354492, "learning_rate": 4.552631578947369e-06, "loss": 0.505, "step": 54 }, { "Batch Mean": 2.5793724060058594, "accuracy": 0.71875, "epoch": 0.135, "step": 54 }, { "epoch": 0.1375, "grad_norm": 6.586344242095947, "learning_rate": 4.539473684210527e-06, "loss": 0.5145, "step": 55 }, { "Batch Mean": 2.652639389038086, "accuracy": 0.7890625, "epoch": 0.1375, "step": 55 }, { "epoch": 0.14, "grad_norm": 7.232706069946289, "learning_rate": 4.526315789473685e-06, "loss": 0.4847, "step": 56 }, { "Batch Mean": 2.7974865436553955, "accuracy": 0.671875, "epoch": 0.14, "step": 56 }, { "epoch": 0.1425, "grad_norm": 9.864455223083496, "learning_rate": 4.513157894736843e-06, "loss": 0.5848, "step": 57 }, { "Batch Mean": 2.7397356033325195, "accuracy": 0.7734375, "epoch": 0.1425, "step": 57 }, { "epoch": 0.145, "grad_norm": 6.654571056365967, "learning_rate": 4.5e-06, "loss": 0.5149, "step": 58 }, { "Batch Mean": 3.020786762237549, "accuracy": 0.7109375, "epoch": 0.145, "step": 58 }, { "epoch": 0.1475, "grad_norm": 7.521582126617432, "learning_rate": 4.4868421052631584e-06, "loss": 0.532, "step": 59 }, { "Batch Mean": 3.0784976482391357, "accuracy": 0.7421875, "epoch": 0.1475, "step": 59 }, { "epoch": 0.15, "grad_norm": 7.8287272453308105, "learning_rate": 4.473684210526316e-06, "loss": 0.5201, "step": 60 }, { "Batch Mean": 3.2929041385650635, "accuracy": 0.6875, "epoch": 0.15, "step": 60 }, { "epoch": 0.1525, "grad_norm": 7.036741733551025, "learning_rate": 4.460526315789474e-06, "loss": 0.5192, "step": 61 }, { "Batch Mean": 3.2737629413604736, "accuracy": 0.7734375, "epoch": 0.1525, "step": 61 }, { "epoch": 0.155, "grad_norm": 8.064910888671875, "learning_rate": 4.447368421052632e-06, "loss": 0.5094, "step": 62 }, { "Batch Mean": 3.0026721954345703, "accuracy": 0.75, "epoch": 0.155, "step": 62 }, { "epoch": 0.1575, "grad_norm": 8.189529418945312, "learning_rate": 4.43421052631579e-06, "loss": 0.4838, "step": 63 }, { "Batch Mean": 3.0598621368408203, "accuracy": 0.7109375, "epoch": 0.1575, "step": 63 }, { "epoch": 0.16, "grad_norm": 7.452142238616943, "learning_rate": 4.4210526315789476e-06, "loss": 0.5297, "step": 64 }, { "Batch Mean": 2.890104293823242, "accuracy": 0.734375, "epoch": 0.16, "step": 64 }, { "epoch": 0.1625, "grad_norm": 7.773412227630615, "learning_rate": 4.407894736842105e-06, "loss": 0.4938, "step": 65 }, { "Batch Mean": 2.8558566570281982, "accuracy": 0.7890625, "epoch": 0.1625, "step": 65 }, { "epoch": 0.165, "grad_norm": 7.888664722442627, "learning_rate": 4.394736842105263e-06, "loss": 0.5426, "step": 66 }, { "Batch Mean": 2.8415894508361816, "accuracy": 0.7265625, "epoch": 0.165, "step": 66 }, { "epoch": 0.1675, "grad_norm": 7.992714881896973, "learning_rate": 4.381578947368421e-06, "loss": 0.5278, "step": 67 }, { "Batch Mean": 2.5067033767700195, "accuracy": 0.71875, "epoch": 0.1675, "step": 67 }, { "epoch": 0.17, "grad_norm": 7.569831848144531, "learning_rate": 4.368421052631579e-06, "loss": 0.497, "step": 68 }, { "Batch Mean": 2.690906047821045, "accuracy": 0.7890625, "epoch": 0.17, "step": 68 }, { "epoch": 0.1725, "grad_norm": 8.406615257263184, "learning_rate": 4.3552631578947375e-06, "loss": 0.5383, "step": 69 }, { "Batch Mean": 2.54428768157959, "accuracy": 0.7890625, "epoch": 0.1725, "step": 69 }, { "epoch": 0.175, "grad_norm": 6.022793292999268, "learning_rate": 4.342105263157895e-06, "loss": 0.4619, "step": 70 }, { "Batch Mean": 2.539039134979248, "accuracy": 0.75, "epoch": 0.175, "step": 70 }, { "epoch": 0.1775, "grad_norm": 5.931001663208008, "learning_rate": 4.328947368421053e-06, "loss": 0.4643, "step": 71 }, { "Batch Mean": 2.3624706268310547, "accuracy": 0.75, "epoch": 0.1775, "step": 71 }, { "epoch": 0.18, "grad_norm": 6.277031898498535, "learning_rate": 4.315789473684211e-06, "loss": 0.5027, "step": 72 }, { "Batch Mean": 2.5795602798461914, "accuracy": 0.7734375, "epoch": 0.18, "step": 72 }, { "epoch": 0.1825, "grad_norm": 7.181740760803223, "learning_rate": 4.302631578947369e-06, "loss": 0.4652, "step": 73 }, { "Batch Mean": 2.328335762023926, "accuracy": 0.7265625, "epoch": 0.1825, "step": 73 }, { "epoch": 0.185, "grad_norm": 6.926029205322266, "learning_rate": 4.289473684210527e-06, "loss": 0.4554, "step": 74 }, { "Batch Mean": 2.195453643798828, "accuracy": 0.7109375, "epoch": 0.185, "step": 74 }, { "epoch": 0.1875, "grad_norm": 7.934150695800781, "learning_rate": 4.276315789473684e-06, "loss": 0.5458, "step": 75 }, { "Batch Mean": 2.7026095390319824, "accuracy": 0.7421875, "epoch": 0.1875, "step": 75 }, { "epoch": 0.19, "grad_norm": 7.034529209136963, "learning_rate": 4.2631578947368425e-06, "loss": 0.5041, "step": 76 }, { "Batch Mean": 2.611628532409668, "accuracy": 0.734375, "epoch": 0.19, "step": 76 }, { "epoch": 0.1925, "grad_norm": 7.741730213165283, "learning_rate": 4.25e-06, "loss": 0.4804, "step": 77 }, { "Batch Mean": 2.5273165702819824, "accuracy": 0.765625, "epoch": 0.1925, "step": 77 }, { "epoch": 0.195, "grad_norm": 7.251089572906494, "learning_rate": 4.236842105263158e-06, "loss": 0.4959, "step": 78 }, { "Batch Mean": 2.5558762550354004, "accuracy": 0.7109375, "epoch": 0.195, "step": 78 }, { "epoch": 0.1975, "grad_norm": 7.240261077880859, "learning_rate": 4.223684210526316e-06, "loss": 0.5173, "step": 79 }, { "Batch Mean": 2.2066729068756104, "accuracy": 0.78125, "epoch": 0.1975, "step": 79 }, { "epoch": 0.2, "grad_norm": 7.0953474044799805, "learning_rate": 4.210526315789474e-06, "loss": 0.5174, "step": 80 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 80, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }