{
  "best_metric": 2.979677438735962,
  "best_model_checkpoint": "miner_id_24/checkpoint-200",
  "epoch": 0.5161290322580645,
  "eval_steps": 50,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0025806451612903226,
      "grad_norm": 2.02890944480896,
      "learning_rate": 1.007e-05,
      "loss": 3.2543,
      "step": 1
    },
    {
      "epoch": 0.0025806451612903226,
      "eval_loss": 3.4940507411956787,
      "eval_runtime": 39.1123,
      "eval_samples_per_second": 4.167,
      "eval_steps_per_second": 1.048,
      "step": 1
    },
    {
      "epoch": 0.005161290322580645,
      "grad_norm": 2.1640987396240234,
      "learning_rate": 2.014e-05,
      "loss": 3.1859,
      "step": 2
    },
    {
      "epoch": 0.007741935483870968,
      "grad_norm": 2.289116859436035,
      "learning_rate": 3.0209999999999997e-05,
      "loss": 3.1102,
      "step": 3
    },
    {
      "epoch": 0.01032258064516129,
      "grad_norm": 1.9029440879821777,
      "learning_rate": 4.028e-05,
      "loss": 3.1912,
      "step": 4
    },
    {
      "epoch": 0.012903225806451613,
      "grad_norm": 1.5423721075057983,
      "learning_rate": 5.035e-05,
      "loss": 3.3264,
      "step": 5
    },
    {
      "epoch": 0.015483870967741935,
      "grad_norm": 1.1242539882659912,
      "learning_rate": 6.0419999999999994e-05,
      "loss": 2.9698,
      "step": 6
    },
    {
      "epoch": 0.01806451612903226,
      "grad_norm": 1.6809178590774536,
      "learning_rate": 7.049e-05,
      "loss": 2.8629,
      "step": 7
    },
    {
      "epoch": 0.02064516129032258,
      "grad_norm": 1.3089873790740967,
      "learning_rate": 8.056e-05,
      "loss": 2.9564,
      "step": 8
    },
    {
      "epoch": 0.023225806451612905,
      "grad_norm": 0.9860974550247192,
      "learning_rate": 9.062999999999999e-05,
      "loss": 3.0032,
      "step": 9
    },
    {
      "epoch": 0.025806451612903226,
      "grad_norm": 0.9577404856681824,
      "learning_rate": 0.0001007,
      "loss": 2.7733,
      "step": 10
    },
    {
      "epoch": 0.02838709677419355,
      "grad_norm": 1.073362946510315,
      "learning_rate": 0.00010017,
      "loss": 3.0293,
      "step": 11
    },
    {
      "epoch": 0.03096774193548387,
      "grad_norm": 0.9577970504760742,
      "learning_rate": 9.963999999999999e-05,
      "loss": 3.0549,
      "step": 12
    },
    {
      "epoch": 0.03354838709677419,
      "grad_norm": 0.8062331676483154,
      "learning_rate": 9.910999999999999e-05,
      "loss": 2.9548,
      "step": 13
    },
    {
      "epoch": 0.03612903225806452,
      "grad_norm": 0.8570857644081116,
      "learning_rate": 9.858e-05,
      "loss": 3.1505,
      "step": 14
    },
    {
      "epoch": 0.03870967741935484,
      "grad_norm": 0.9095686674118042,
      "learning_rate": 9.805e-05,
      "loss": 2.8758,
      "step": 15
    },
    {
      "epoch": 0.04129032258064516,
      "grad_norm": 0.8106794953346252,
      "learning_rate": 9.752e-05,
      "loss": 2.9163,
      "step": 16
    },
    {
      "epoch": 0.04387096774193548,
      "grad_norm": 0.8135427832603455,
      "learning_rate": 9.698999999999999e-05,
      "loss": 3.1537,
      "step": 17
    },
    {
      "epoch": 0.04645161290322581,
      "grad_norm": 0.773794412612915,
      "learning_rate": 9.646e-05,
      "loss": 2.9923,
      "step": 18
    },
    {
      "epoch": 0.04903225806451613,
      "grad_norm": 0.776435911655426,
      "learning_rate": 9.593e-05,
      "loss": 2.994,
      "step": 19
    },
    {
      "epoch": 0.05161290322580645,
      "grad_norm": 0.702139139175415,
      "learning_rate": 9.539999999999999e-05,
      "loss": 2.8807,
      "step": 20
    },
    {
      "epoch": 0.05419354838709677,
      "grad_norm": 0.6850553750991821,
      "learning_rate": 9.487e-05,
      "loss": 3.0033,
      "step": 21
    },
    {
      "epoch": 0.0567741935483871,
      "grad_norm": 0.6869837045669556,
      "learning_rate": 9.434e-05,
      "loss": 2.7906,
      "step": 22
    },
    {
      "epoch": 0.05935483870967742,
      "grad_norm": 0.7767460942268372,
      "learning_rate": 9.381e-05,
      "loss": 2.8578,
      "step": 23
    },
    {
      "epoch": 0.06193548387096774,
      "grad_norm": 0.747832179069519,
      "learning_rate": 9.327999999999999e-05,
      "loss": 2.9695,
      "step": 24
    },
    {
      "epoch": 0.06451612903225806,
      "grad_norm": 0.7731716632843018,
      "learning_rate": 9.274999999999999e-05,
      "loss": 2.9062,
      "step": 25
    },
    {
      "epoch": 0.06709677419354838,
      "grad_norm": 0.8283132910728455,
      "learning_rate": 9.222e-05,
      "loss": 3.1388,
      "step": 26
    },
    {
      "epoch": 0.0696774193548387,
      "grad_norm": 0.8168233036994934,
      "learning_rate": 9.169e-05,
      "loss": 3.0032,
      "step": 27
    },
    {
      "epoch": 0.07225806451612904,
      "grad_norm": 0.7814300060272217,
      "learning_rate": 9.116e-05,
      "loss": 3.0617,
      "step": 28
    },
    {
      "epoch": 0.07483870967741936,
      "grad_norm": 0.8907764554023743,
      "learning_rate": 9.062999999999999e-05,
      "loss": 2.8873,
      "step": 29
    },
    {
      "epoch": 0.07741935483870968,
      "grad_norm": 0.897400975227356,
      "learning_rate": 9.01e-05,
      "loss": 3.0865,
      "step": 30
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.916833758354187,
      "learning_rate": 8.957e-05,
      "loss": 2.9955,
      "step": 31
    },
    {
      "epoch": 0.08258064516129032,
      "grad_norm": 0.9079581499099731,
      "learning_rate": 8.903999999999999e-05,
      "loss": 2.9475,
      "step": 32
    },
    {
      "epoch": 0.08516129032258064,
      "grad_norm": 1.2848162651062012,
      "learning_rate": 8.850999999999999e-05,
      "loss": 3.0854,
      "step": 33
    },
    {
      "epoch": 0.08774193548387096,
      "grad_norm": 1.0301451683044434,
      "learning_rate": 8.798e-05,
      "loss": 3.2001,
      "step": 34
    },
    {
      "epoch": 0.09032258064516129,
      "grad_norm": 0.9421987533569336,
      "learning_rate": 8.745e-05,
      "loss": 2.7962,
      "step": 35
    },
    {
      "epoch": 0.09290322580645162,
      "grad_norm": 1.2306110858917236,
      "learning_rate": 8.692e-05,
      "loss": 3.1929,
      "step": 36
    },
    {
      "epoch": 0.09548387096774194,
      "grad_norm": 1.1693624258041382,
      "learning_rate": 8.638999999999999e-05,
      "loss": 3.1043,
      "step": 37
    },
    {
      "epoch": 0.09806451612903226,
      "grad_norm": 1.169491171836853,
      "learning_rate": 8.586e-05,
      "loss": 2.7704,
      "step": 38
    },
    {
      "epoch": 0.10064516129032258,
      "grad_norm": 1.1204756498336792,
      "learning_rate": 8.533e-05,
      "loss": 3.2268,
      "step": 39
    },
    {
      "epoch": 0.1032258064516129,
      "grad_norm": 1.1709730625152588,
      "learning_rate": 8.479999999999999e-05,
      "loss": 2.9685,
      "step": 40
    },
    {
      "epoch": 0.10580645161290322,
      "grad_norm": 1.2603025436401367,
      "learning_rate": 8.427e-05,
      "loss": 2.9147,
      "step": 41
    },
    {
      "epoch": 0.10838709677419354,
      "grad_norm": 1.5371952056884766,
      "learning_rate": 8.374e-05,
      "loss": 2.9618,
      "step": 42
    },
    {
      "epoch": 0.11096774193548387,
      "grad_norm": 1.4978915452957153,
      "learning_rate": 8.321e-05,
      "loss": 3.0561,
      "step": 43
    },
    {
      "epoch": 0.1135483870967742,
      "grad_norm": 1.8759700059890747,
      "learning_rate": 8.268e-05,
      "loss": 3.5141,
      "step": 44
    },
    {
      "epoch": 0.11612903225806452,
      "grad_norm": 1.6922487020492554,
      "learning_rate": 8.214999999999999e-05,
      "loss": 3.229,
      "step": 45
    },
    {
      "epoch": 0.11870967741935484,
      "grad_norm": 1.9749841690063477,
      "learning_rate": 8.162e-05,
      "loss": 3.2539,
      "step": 46
    },
    {
      "epoch": 0.12129032258064516,
      "grad_norm": 2.2926204204559326,
      "learning_rate": 8.108999999999998e-05,
      "loss": 3.1085,
      "step": 47
    },
    {
      "epoch": 0.12387096774193548,
      "grad_norm": 4.02115535736084,
      "learning_rate": 8.056e-05,
      "loss": 3.1481,
      "step": 48
    },
    {
      "epoch": 0.12645161290322582,
      "grad_norm": 4.62841272354126,
      "learning_rate": 8.003e-05,
      "loss": 3.7727,
      "step": 49
    },
    {
      "epoch": 0.12903225806451613,
      "grad_norm": 6.652851581573486,
      "learning_rate": 7.95e-05,
      "loss": 3.9126,
      "step": 50
    },
    {
      "epoch": 0.12903225806451613,
      "eval_loss": 3.116821765899658,
      "eval_runtime": 38.3432,
      "eval_samples_per_second": 4.251,
      "eval_steps_per_second": 1.069,
      "step": 50
    },
    {
      "epoch": 0.13161290322580646,
      "grad_norm": 1.2032880783081055,
      "learning_rate": 7.897e-05,
      "loss": 2.8811,
      "step": 51
    },
    {
      "epoch": 0.13419354838709677,
      "grad_norm": 0.9579372406005859,
      "learning_rate": 7.843999999999999e-05,
      "loss": 2.9946,
      "step": 52
    },
    {
      "epoch": 0.1367741935483871,
      "grad_norm": 0.7830987572669983,
      "learning_rate": 7.790999999999999e-05,
      "loss": 2.8823,
      "step": 53
    },
    {
      "epoch": 0.1393548387096774,
      "grad_norm": 0.6758972406387329,
      "learning_rate": 7.738e-05,
      "loss": 2.9346,
      "step": 54
    },
    {
      "epoch": 0.14193548387096774,
      "grad_norm": 0.6744924187660217,
      "learning_rate": 7.685e-05,
      "loss": 2.8026,
      "step": 55
    },
    {
      "epoch": 0.14451612903225808,
      "grad_norm": 0.6925913691520691,
      "learning_rate": 7.632e-05,
      "loss": 2.9121,
      "step": 56
    },
    {
      "epoch": 0.14709677419354839,
      "grad_norm": 0.6952354311943054,
      "learning_rate": 7.578999999999999e-05,
      "loss": 2.9217,
      "step": 57
    },
    {
      "epoch": 0.14967741935483872,
      "grad_norm": 0.6015385985374451,
      "learning_rate": 7.526e-05,
      "loss": 2.911,
      "step": 58
    },
    {
      "epoch": 0.15225806451612903,
      "grad_norm": 0.6113649606704712,
      "learning_rate": 7.473e-05,
      "loss": 2.8453,
      "step": 59
    },
    {
      "epoch": 0.15483870967741936,
      "grad_norm": 0.6024471521377563,
      "learning_rate": 7.419999999999999e-05,
      "loss": 2.7338,
      "step": 60
    },
    {
      "epoch": 0.15741935483870967,
      "grad_norm": 0.6171099543571472,
      "learning_rate": 7.367e-05,
      "loss": 2.7157,
      "step": 61
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.6734641790390015,
      "learning_rate": 7.314e-05,
      "loss": 3.0369,
      "step": 62
    },
    {
      "epoch": 0.1625806451612903,
      "grad_norm": 0.6411603689193726,
      "learning_rate": 7.261e-05,
      "loss": 2.9175,
      "step": 63
    },
    {
      "epoch": 0.16516129032258065,
      "grad_norm": 0.6401498913764954,
      "learning_rate": 7.208e-05,
      "loss": 2.9601,
      "step": 64
    },
    {
      "epoch": 0.16774193548387098,
      "grad_norm": 0.6685578227043152,
      "learning_rate": 7.154999999999999e-05,
      "loss": 2.9055,
      "step": 65
    },
    {
      "epoch": 0.1703225806451613,
      "grad_norm": 0.6739301681518555,
      "learning_rate": 7.102e-05,
      "loss": 2.8973,
      "step": 66
    },
    {
      "epoch": 0.17290322580645162,
      "grad_norm": 0.6716254353523254,
      "learning_rate": 7.049e-05,
      "loss": 2.8507,
      "step": 67
    },
    {
      "epoch": 0.17548387096774193,
      "grad_norm": 0.6408494710922241,
      "learning_rate": 6.996e-05,
      "loss": 2.8232,
      "step": 68
    },
    {
      "epoch": 0.17806451612903226,
      "grad_norm": 0.6635752320289612,
      "learning_rate": 6.943e-05,
      "loss": 3.088,
      "step": 69
    },
    {
      "epoch": 0.18064516129032257,
      "grad_norm": 0.710978090763092,
      "learning_rate": 6.89e-05,
      "loss": 2.8905,
      "step": 70
    },
    {
      "epoch": 0.1832258064516129,
      "grad_norm": 0.7735083103179932,
      "learning_rate": 6.837e-05,
      "loss": 2.9583,
      "step": 71
    },
    {
      "epoch": 0.18580645161290324,
      "grad_norm": 0.7552114725112915,
      "learning_rate": 6.784e-05,
      "loss": 2.8666,
      "step": 72
    },
    {
      "epoch": 0.18838709677419355,
      "grad_norm": 0.8119356036186218,
      "learning_rate": 6.730999999999999e-05,
      "loss": 3.0893,
      "step": 73
    },
    {
      "epoch": 0.19096774193548388,
      "grad_norm": 0.7227278351783752,
      "learning_rate": 6.678e-05,
      "loss": 2.9174,
      "step": 74
    },
    {
      "epoch": 0.1935483870967742,
      "grad_norm": 0.7297806143760681,
      "learning_rate": 6.625e-05,
      "loss": 2.9618,
      "step": 75
    },
    {
      "epoch": 0.19612903225806452,
      "grad_norm": 0.7950009107589722,
      "learning_rate": 6.572e-05,
      "loss": 2.8738,
      "step": 76
    },
    {
      "epoch": 0.19870967741935483,
      "grad_norm": 0.7869434952735901,
      "learning_rate": 6.519e-05,
      "loss": 2.7843,
      "step": 77
    },
    {
      "epoch": 0.20129032258064516,
      "grad_norm": 0.8707318902015686,
      "learning_rate": 6.466e-05,
      "loss": 3.1622,
      "step": 78
    },
    {
      "epoch": 0.20387096774193547,
      "grad_norm": 0.8520801663398743,
      "learning_rate": 6.413e-05,
      "loss": 3.0377,
      "step": 79
    },
    {
      "epoch": 0.2064516129032258,
      "grad_norm": 0.9687163233757019,
      "learning_rate": 6.359999999999999e-05,
      "loss": 3.0816,
      "step": 80
    },
    {
      "epoch": 0.20903225806451614,
      "grad_norm": 0.8639389872550964,
      "learning_rate": 6.306999999999999e-05,
      "loss": 2.8825,
      "step": 81
    },
    {
      "epoch": 0.21161290322580645,
      "grad_norm": 1.042325735092163,
      "learning_rate": 6.254000000000001e-05,
      "loss": 2.9446,
      "step": 82
    },
    {
      "epoch": 0.21419354838709678,
      "grad_norm": 0.9391971230506897,
      "learning_rate": 6.201e-05,
      "loss": 3.0544,
      "step": 83
    },
    {
      "epoch": 0.2167741935483871,
      "grad_norm": 1.0379990339279175,
      "learning_rate": 6.148e-05,
      "loss": 3.1278,
      "step": 84
    },
    {
      "epoch": 0.21935483870967742,
      "grad_norm": 1.0052063465118408,
      "learning_rate": 6.095e-05,
      "loss": 3.0154,
      "step": 85
    },
    {
      "epoch": 0.22193548387096773,
      "grad_norm": 1.1294814348220825,
      "learning_rate": 6.0419999999999994e-05,
      "loss": 2.9711,
      "step": 86
    },
    {
      "epoch": 0.22451612903225807,
      "grad_norm": 1.1187207698822021,
      "learning_rate": 5.988999999999999e-05,
      "loss": 2.7353,
      "step": 87
    },
    {
      "epoch": 0.2270967741935484,
      "grad_norm": 1.1556931734085083,
      "learning_rate": 5.9359999999999994e-05,
      "loss": 3.0027,
      "step": 88
    },
    {
      "epoch": 0.2296774193548387,
      "grad_norm": 1.4021755456924438,
      "learning_rate": 5.8830000000000004e-05,
      "loss": 2.9716,
      "step": 89
    },
    {
      "epoch": 0.23225806451612904,
      "grad_norm": 1.32869291305542,
      "learning_rate": 5.83e-05,
      "loss": 3.2768,
      "step": 90
    },
    {
      "epoch": 0.23483870967741935,
      "grad_norm": 1.6008881330490112,
      "learning_rate": 5.777e-05,
      "loss": 3.2203,
      "step": 91
    },
    {
      "epoch": 0.23741935483870968,
      "grad_norm": 1.739267349243164,
      "learning_rate": 5.7239999999999994e-05,
      "loss": 3.0869,
      "step": 92
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.6709328889846802,
      "learning_rate": 5.671e-05,
      "loss": 3.1174,
      "step": 93
    },
    {
      "epoch": 0.24258064516129033,
      "grad_norm": 2.5117313861846924,
      "learning_rate": 5.6179999999999994e-05,
      "loss": 3.2951,
      "step": 94
    },
    {
      "epoch": 0.24516129032258063,
      "grad_norm": 2.1630053520202637,
      "learning_rate": 5.5650000000000004e-05,
      "loss": 3.3658,
      "step": 95
    },
    {
      "epoch": 0.24774193548387097,
      "grad_norm": 2.027144193649292,
      "learning_rate": 5.512e-05,
      "loss": 3.3735,
      "step": 96
    },
    {
      "epoch": 0.2503225806451613,
      "grad_norm": 2.5083370208740234,
      "learning_rate": 5.459e-05,
      "loss": 2.9347,
      "step": 97
    },
    {
      "epoch": 0.25290322580645164,
      "grad_norm": 2.995940685272217,
      "learning_rate": 5.406e-05,
      "loss": 3.5815,
      "step": 98
    },
    {
      "epoch": 0.25548387096774194,
      "grad_norm": 3.9194164276123047,
      "learning_rate": 5.353e-05,
      "loss": 3.3064,
      "step": 99
    },
    {
      "epoch": 0.25806451612903225,
      "grad_norm": 7.003715991973877,
      "learning_rate": 5.2999999999999994e-05,
      "loss": 3.744,
      "step": 100
    },
    {
      "epoch": 0.25806451612903225,
      "eval_loss": 3.0592901706695557,
      "eval_runtime": 38.3187,
      "eval_samples_per_second": 4.254,
      "eval_steps_per_second": 1.07,
      "step": 100
    },
    {
      "epoch": 0.26064516129032256,
      "grad_norm": 0.8486892580986023,
      "learning_rate": 5.246999999999999e-05,
      "loss": 2.855,
      "step": 101
    },
    {
      "epoch": 0.2632258064516129,
      "grad_norm": 0.8181604146957397,
      "learning_rate": 5.194e-05,
      "loss": 3.0667,
      "step": 102
    },
    {
      "epoch": 0.2658064516129032,
      "grad_norm": 0.7249521017074585,
      "learning_rate": 5.141e-05,
      "loss": 2.6855,
      "step": 103
    },
    {
      "epoch": 0.26838709677419353,
      "grad_norm": 0.7008607387542725,
      "learning_rate": 5.088e-05,
      "loss": 2.9785,
      "step": 104
    },
    {
      "epoch": 0.2709677419354839,
      "grad_norm": 0.6476490497589111,
      "learning_rate": 5.035e-05,
      "loss": 3.0082,
      "step": 105
    },
    {
      "epoch": 0.2735483870967742,
      "grad_norm": 0.618168294429779,
      "learning_rate": 4.9819999999999994e-05,
      "loss": 2.8595,
      "step": 106
    },
    {
      "epoch": 0.2761290322580645,
      "grad_norm": 0.6012650728225708,
      "learning_rate": 4.929e-05,
      "loss": 2.7147,
      "step": 107
    },
    {
      "epoch": 0.2787096774193548,
      "grad_norm": 0.6011011600494385,
      "learning_rate": 4.876e-05,
      "loss": 2.7046,
      "step": 108
    },
    {
      "epoch": 0.2812903225806452,
      "grad_norm": 0.9905216693878174,
      "learning_rate": 4.823e-05,
      "loss": 2.8049,
      "step": 109
    },
    {
      "epoch": 0.2838709677419355,
      "grad_norm": 0.5935449600219727,
      "learning_rate": 4.7699999999999994e-05,
      "loss": 2.8319,
      "step": 110
    },
    {
      "epoch": 0.2864516129032258,
      "grad_norm": 0.5935178399085999,
      "learning_rate": 4.717e-05,
      "loss": 2.9189,
      "step": 111
    },
    {
      "epoch": 0.28903225806451616,
      "grad_norm": 0.6392220854759216,
      "learning_rate": 4.6639999999999994e-05,
      "loss": 2.6776,
      "step": 112
    },
    {
      "epoch": 0.29161290322580646,
      "grad_norm": 0.6458805799484253,
      "learning_rate": 4.611e-05,
      "loss": 2.8857,
      "step": 113
    },
    {
      "epoch": 0.29419354838709677,
      "grad_norm": 0.6118318438529968,
      "learning_rate": 4.558e-05,
      "loss": 2.7875,
      "step": 114
    },
    {
      "epoch": 0.2967741935483871,
      "grad_norm": 0.6118870377540588,
      "learning_rate": 4.505e-05,
      "loss": 2.7211,
      "step": 115
    },
    {
      "epoch": 0.29935483870967744,
      "grad_norm": 0.6446405053138733,
      "learning_rate": 4.4519999999999994e-05,
      "loss": 2.9338,
      "step": 116
    },
    {
      "epoch": 0.30193548387096775,
      "grad_norm": 0.641838014125824,
      "learning_rate": 4.399e-05,
      "loss": 2.6843,
      "step": 117
    },
    {
      "epoch": 0.30451612903225805,
      "grad_norm": 0.609104335308075,
      "learning_rate": 4.346e-05,
      "loss": 2.848,
      "step": 118
    },
    {
      "epoch": 0.30709677419354836,
      "grad_norm": 0.6547131538391113,
      "learning_rate": 4.293e-05,
      "loss": 2.9458,
      "step": 119
    },
    {
      "epoch": 0.3096774193548387,
      "grad_norm": 0.7462600469589233,
      "learning_rate": 4.2399999999999994e-05,
      "loss": 3.1013,
      "step": 120
    },
    {
      "epoch": 0.31225806451612903,
      "grad_norm": 0.6465981602668762,
      "learning_rate": 4.187e-05,
      "loss": 2.7645,
      "step": 121
    },
    {
      "epoch": 0.31483870967741934,
      "grad_norm": 0.7226136922836304,
      "learning_rate": 4.134e-05,
      "loss": 3.041,
      "step": 122
    },
    {
      "epoch": 0.3174193548387097,
      "grad_norm": 0.8358485698699951,
      "learning_rate": 4.081e-05,
      "loss": 2.996,
      "step": 123
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.823491632938385,
      "learning_rate": 4.028e-05,
      "loss": 3.0795,
      "step": 124
    },
    {
      "epoch": 0.3225806451612903,
      "grad_norm": 0.7459017634391785,
      "learning_rate": 3.975e-05,
      "loss": 2.9079,
      "step": 125
    },
    {
      "epoch": 0.3251612903225806,
      "grad_norm": 0.7506483197212219,
      "learning_rate": 3.9219999999999994e-05,
      "loss": 2.7378,
      "step": 126
    },
    {
      "epoch": 0.327741935483871,
      "grad_norm": 0.8168128728866577,
      "learning_rate": 3.869e-05,
      "loss": 3.1751,
      "step": 127
    },
    {
      "epoch": 0.3303225806451613,
      "grad_norm": 0.8122110962867737,
      "learning_rate": 3.816e-05,
      "loss": 3.0125,
      "step": 128
    },
    {
      "epoch": 0.3329032258064516,
      "grad_norm": 0.8495927453041077,
      "learning_rate": 3.763e-05,
      "loss": 2.903,
      "step": 129
    },
    {
      "epoch": 0.33548387096774196,
      "grad_norm": 0.9135481119155884,
      "learning_rate": 3.7099999999999994e-05,
      "loss": 3.0412,
      "step": 130
    },
    {
      "epoch": 0.33806451612903227,
      "grad_norm": 0.8771790862083435,
      "learning_rate": 3.657e-05,
      "loss": 2.9565,
      "step": 131
    },
    {
      "epoch": 0.3406451612903226,
      "grad_norm": 0.899596631526947,
      "learning_rate": 3.604e-05,
      "loss": 3.1069,
      "step": 132
    },
    {
      "epoch": 0.3432258064516129,
      "grad_norm": 1.055289626121521,
      "learning_rate": 3.551e-05,
      "loss": 2.9057,
      "step": 133
    },
    {
      "epoch": 0.34580645161290324,
      "grad_norm": 1.0215229988098145,
      "learning_rate": 3.498e-05,
      "loss": 3.0173,
      "step": 134
    },
    {
      "epoch": 0.34838709677419355,
      "grad_norm": 1.0191702842712402,
      "learning_rate": 3.445e-05,
      "loss": 3.1466,
      "step": 135
    },
    {
      "epoch": 0.35096774193548386,
      "grad_norm": 1.1208670139312744,
      "learning_rate": 3.392e-05,
      "loss": 3.2075,
      "step": 136
    },
    {
      "epoch": 0.3535483870967742,
      "grad_norm": 1.1400412321090698,
      "learning_rate": 3.339e-05,
      "loss": 3.159,
      "step": 137
    },
    {
      "epoch": 0.3561290322580645,
      "grad_norm": 1.1518625020980835,
      "learning_rate": 3.286e-05,
      "loss": 3.1083,
      "step": 138
    },
    {
      "epoch": 0.35870967741935483,
      "grad_norm": 1.382748007774353,
      "learning_rate": 3.233e-05,
      "loss": 2.9339,
      "step": 139
    },
    {
      "epoch": 0.36129032258064514,
      "grad_norm": 1.279389500617981,
      "learning_rate": 3.1799999999999994e-05,
      "loss": 2.9156,
      "step": 140
    },
    {
      "epoch": 0.3638709677419355,
      "grad_norm": 1.5063074827194214,
      "learning_rate": 3.1270000000000004e-05,
      "loss": 3.4519,
      "step": 141
    },
    {
      "epoch": 0.3664516129032258,
      "grad_norm": 1.417640209197998,
      "learning_rate": 3.074e-05,
      "loss": 3.0448,
      "step": 142
    },
    {
      "epoch": 0.3690322580645161,
      "grad_norm": 1.8814888000488281,
      "learning_rate": 3.0209999999999997e-05,
      "loss": 2.7309,
      "step": 143
    },
    {
      "epoch": 0.3716129032258065,
      "grad_norm": 2.1574666500091553,
      "learning_rate": 2.9679999999999997e-05,
      "loss": 3.2975,
      "step": 144
    },
    {
      "epoch": 0.3741935483870968,
      "grad_norm": 2.018376350402832,
      "learning_rate": 2.915e-05,
      "loss": 2.935,
      "step": 145
    },
    {
      "epoch": 0.3767741935483871,
      "grad_norm": 2.580366373062134,
      "learning_rate": 2.8619999999999997e-05,
      "loss": 3.0637,
      "step": 146
    },
    {
      "epoch": 0.3793548387096774,
      "grad_norm": 2.8553295135498047,
      "learning_rate": 2.8089999999999997e-05,
      "loss": 3.2642,
      "step": 147
    },
    {
      "epoch": 0.38193548387096776,
      "grad_norm": 4.084458351135254,
      "learning_rate": 2.756e-05,
      "loss": 3.5309,
      "step": 148
    },
    {
      "epoch": 0.38451612903225807,
      "grad_norm": 7.080618858337402,
      "learning_rate": 2.703e-05,
      "loss": 3.7543,
      "step": 149
    },
    {
      "epoch": 0.3870967741935484,
      "grad_norm": 9.050718307495117,
      "learning_rate": 2.6499999999999997e-05,
      "loss": 4.0362,
      "step": 150
    },
    {
      "epoch": 0.3870967741935484,
      "eval_loss": 3.009519338607788,
      "eval_runtime": 37.8132,
      "eval_samples_per_second": 4.311,
      "eval_steps_per_second": 1.084,
      "step": 150
    },
    {
      "epoch": 0.3896774193548387,
      "grad_norm": 0.6236698031425476,
      "learning_rate": 2.597e-05,
      "loss": 2.7676,
      "step": 151
    },
    {
      "epoch": 0.39225806451612905,
      "grad_norm": 0.6113293170928955,
      "learning_rate": 2.544e-05,
      "loss": 2.7639,
      "step": 152
    },
    {
      "epoch": 0.39483870967741935,
      "grad_norm": 0.610192060470581,
      "learning_rate": 2.4909999999999997e-05,
      "loss": 2.5103,
      "step": 153
    },
    {
      "epoch": 0.39741935483870966,
      "grad_norm": 0.6351131200790405,
      "learning_rate": 2.438e-05,
      "loss": 2.8813,
      "step": 154
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.6906495690345764,
      "learning_rate": 2.3849999999999997e-05,
      "loss": 2.8643,
      "step": 155
    },
    {
      "epoch": 0.40258064516129033,
      "grad_norm": 0.7632659673690796,
      "learning_rate": 2.3319999999999997e-05,
      "loss": 2.8429,
      "step": 156
    },
    {
      "epoch": 0.40516129032258064,
      "grad_norm": 0.6306165456771851,
      "learning_rate": 2.279e-05,
      "loss": 2.7581,
      "step": 157
    },
    {
      "epoch": 0.40774193548387094,
      "grad_norm": 0.5983100533485413,
      "learning_rate": 2.2259999999999997e-05,
      "loss": 2.8474,
      "step": 158
    },
    {
      "epoch": 0.4103225806451613,
      "grad_norm": 0.6067503094673157,
      "learning_rate": 2.173e-05,
      "loss": 2.8127,
      "step": 159
    },
    {
      "epoch": 0.4129032258064516,
      "grad_norm": 0.5717953443527222,
      "learning_rate": 2.1199999999999997e-05,
      "loss": 2.7154,
      "step": 160
    },
    {
      "epoch": 0.4154838709677419,
      "grad_norm": 0.6313890218734741,
      "learning_rate": 2.067e-05,
      "loss": 2.9207,
      "step": 161
    },
    {
      "epoch": 0.4180645161290323,
      "grad_norm": 0.6323216557502747,
      "learning_rate": 2.014e-05,
      "loss": 3.0632,
      "step": 162
    },
    {
      "epoch": 0.4206451612903226,
      "grad_norm": 0.5878114104270935,
      "learning_rate": 1.9609999999999997e-05,
      "loss": 2.6803,
      "step": 163
    },
    {
      "epoch": 0.4232258064516129,
      "grad_norm": 0.6011391282081604,
      "learning_rate": 1.908e-05,
      "loss": 2.761,
      "step": 164
    },
    {
      "epoch": 0.4258064516129032,
      "grad_norm": 0.6402645111083984,
      "learning_rate": 1.8549999999999997e-05,
      "loss": 2.8095,
      "step": 165
    },
    {
      "epoch": 0.42838709677419357,
      "grad_norm": 0.6337774991989136,
      "learning_rate": 1.802e-05,
      "loss": 2.8637,
      "step": 166
    },
    {
      "epoch": 0.4309677419354839,
      "grad_norm": 0.6133434772491455,
      "learning_rate": 1.749e-05,
      "loss": 2.7507,
      "step": 167
    },
    {
      "epoch": 0.4335483870967742,
      "grad_norm": 0.6644814610481262,
      "learning_rate": 1.696e-05,
      "loss": 2.8863,
      "step": 168
    },
    {
      "epoch": 0.43612903225806454,
      "grad_norm": 0.6810536980628967,
      "learning_rate": 1.643e-05,
      "loss": 3.0598,
      "step": 169
    },
    {
      "epoch": 0.43870967741935485,
      "grad_norm": 0.644666850566864,
      "learning_rate": 1.5899999999999997e-05,
      "loss": 2.6853,
      "step": 170
    },
    {
      "epoch": 0.44129032258064516,
      "grad_norm": 0.8024281859397888,
      "learning_rate": 1.537e-05,
      "loss": 3.0717,
      "step": 171
    },
    {
      "epoch": 0.44387096774193546,
      "grad_norm": 0.7148249745368958,
      "learning_rate": 1.4839999999999999e-05,
      "loss": 2.8716,
      "step": 172
    },
    {
      "epoch": 0.4464516129032258,
      "grad_norm": 0.6904494762420654,
      "learning_rate": 1.4309999999999999e-05,
      "loss": 2.8221,
      "step": 173
    },
    {
      "epoch": 0.44903225806451613,
      "grad_norm": 0.7332553863525391,
      "learning_rate": 1.378e-05,
      "loss": 3.0164,
      "step": 174
    },
    {
      "epoch": 0.45161290322580644,
      "grad_norm": 0.820591151714325,
      "learning_rate": 1.3249999999999999e-05,
      "loss": 3.0185,
      "step": 175
    },
    {
      "epoch": 0.4541935483870968,
      "grad_norm": 0.7899701595306396,
      "learning_rate": 1.272e-05,
      "loss": 2.7191,
      "step": 176
    },
    {
      "epoch": 0.4567741935483871,
      "grad_norm": 0.8323290944099426,
      "learning_rate": 1.219e-05,
      "loss": 3.1204,
      "step": 177
    },
    {
      "epoch": 0.4593548387096774,
      "grad_norm": 0.8330437541007996,
      "learning_rate": 1.1659999999999998e-05,
      "loss": 3.191,
      "step": 178
    },
    {
      "epoch": 0.4619354838709677,
      "grad_norm": 0.8948123455047607,
      "learning_rate": 1.1129999999999998e-05,
      "loss": 3.1088,
      "step": 179
    },
    {
      "epoch": 0.4645161290322581,
      "grad_norm": 0.8483486175537109,
      "learning_rate": 1.0599999999999998e-05,
      "loss": 2.7368,
      "step": 180
    },
    {
      "epoch": 0.4670967741935484,
      "grad_norm": 0.8846768140792847,
      "learning_rate": 1.007e-05,
      "loss": 2.8681,
      "step": 181
    },
    {
      "epoch": 0.4696774193548387,
      "grad_norm": 0.9632049798965454,
      "learning_rate": 9.54e-06,
      "loss": 2.7557,
      "step": 182
    },
    {
      "epoch": 0.472258064516129,
      "grad_norm": 0.9614686965942383,
      "learning_rate": 9.01e-06,
      "loss": 2.8339,
      "step": 183
    },
    {
      "epoch": 0.47483870967741937,
      "grad_norm": 0.9410024285316467,
      "learning_rate": 8.48e-06,
      "loss": 2.8687,
      "step": 184
    },
    {
      "epoch": 0.4774193548387097,
      "grad_norm": 1.2544703483581543,
      "learning_rate": 7.949999999999998e-06,
      "loss": 2.8427,
      "step": 185
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.054551124572754,
      "learning_rate": 7.419999999999999e-06,
      "loss": 3.0275,
      "step": 186
    },
    {
      "epoch": 0.48258064516129034,
      "grad_norm": 0.9934067130088806,
      "learning_rate": 6.89e-06,
      "loss": 2.8259,
      "step": 187
    },
    {
      "epoch": 0.48516129032258065,
      "grad_norm": 1.0902060270309448,
      "learning_rate": 6.36e-06,
      "loss": 2.925,
      "step": 188
    },
    {
      "epoch": 0.48774193548387096,
      "grad_norm": 1.249941110610962,
      "learning_rate": 5.829999999999999e-06,
      "loss": 3.2027,
      "step": 189
    },
    {
      "epoch": 0.49032258064516127,
      "grad_norm": 1.44179105758667,
      "learning_rate": 5.299999999999999e-06,
      "loss": 3.1991,
      "step": 190
    },
    {
      "epoch": 0.49290322580645163,
      "grad_norm": 1.3627164363861084,
      "learning_rate": 4.77e-06,
      "loss": 3.0373,
      "step": 191
    },
    {
      "epoch": 0.49548387096774194,
      "grad_norm": 1.419520378112793,
      "learning_rate": 4.24e-06,
      "loss": 3.1531,
      "step": 192
    },
    {
      "epoch": 0.49806451612903224,
      "grad_norm": 1.5665879249572754,
      "learning_rate": 3.7099999999999996e-06,
      "loss": 3.0034,
      "step": 193
    },
    {
      "epoch": 0.5006451612903225,
      "grad_norm": 1.668223261833191,
      "learning_rate": 3.18e-06,
      "loss": 2.8947,
      "step": 194
    },
    {
      "epoch": 0.5032258064516129,
      "grad_norm": 2.2476518154144287,
      "learning_rate": 2.6499999999999996e-06,
      "loss": 3.6002,
      "step": 195
    },
    {
      "epoch": 0.5058064516129033,
      "grad_norm": 2.826122760772705,
      "learning_rate": 2.12e-06,
      "loss": 3.2501,
      "step": 196
    },
    {
      "epoch": 0.5083870967741936,
      "grad_norm": 2.145322561264038,
      "learning_rate": 1.59e-06,
      "loss": 3.3192,
      "step": 197
    },
    {
      "epoch": 0.5109677419354839,
      "grad_norm": 3.014976978302002,
      "learning_rate": 1.06e-06,
      "loss": 3.1429,
      "step": 198
    },
    {
      "epoch": 0.5135483870967742,
      "grad_norm": 3.8677399158477783,
      "learning_rate": 5.3e-07,
      "loss": 3.3806,
      "step": 199
    },
    {
      "epoch": 0.5161290322580645,
      "grad_norm": 9.486526489257812,
      "learning_rate": 0.0,
      "loss": 4.3408,
      "step": 200
    },
    {
      "epoch": 0.5161290322580645,
      "eval_loss": 2.979677438735962,
      "eval_runtime": 38.3406,
      "eval_samples_per_second": 4.251,
      "eval_steps_per_second": 1.069,
      "step": 200
    }
  ],
  "logging_steps": 1,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.676900394199941e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}