{ "best_metric": 0.517457902431488, "best_model_checkpoint": "./kd_results/microsoft/beit-base-finetuned-ade-640-640_alpha0.7_temp5.0_t2/checkpoint-3600", "epoch": 20.0, "eval_steps": 500, "global_step": 3600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1388888888888889, "grad_norm": 7.222753047943115, "learning_rate": 3.4722222222222224e-06, "loss": 1.1565, "step": 25 }, { "epoch": 0.2777777777777778, "grad_norm": 8.318293571472168, "learning_rate": 6.944444444444445e-06, "loss": 1.1518, "step": 50 }, { "epoch": 0.4166666666666667, "grad_norm": 5.613315582275391, "learning_rate": 1.0416666666666668e-05, "loss": 1.1668, "step": 75 }, { "epoch": 0.5555555555555556, "grad_norm": 6.886393070220947, "learning_rate": 1.388888888888889e-05, "loss": 1.1685, "step": 100 }, { "epoch": 0.6944444444444444, "grad_norm": 4.675456523895264, "learning_rate": 1.736111111111111e-05, "loss": 1.1258, "step": 125 }, { "epoch": 0.8333333333333334, "grad_norm": 6.820631504058838, "learning_rate": 2.0833333333333336e-05, "loss": 1.0927, "step": 150 }, { "epoch": 0.9722222222222222, "grad_norm": 9.431523323059082, "learning_rate": 2.4305555555555558e-05, "loss": 1.1235, "step": 175 }, { "epoch": 1.0, "eval_accuracy": 0.3932806324110672, "eval_loss": 1.3238857984542847, "eval_runtime": 72.8485, "eval_samples_per_second": 13.892, "eval_steps_per_second": 0.439, "step": 180 }, { "epoch": 1.1111111111111112, "grad_norm": 10.79791259765625, "learning_rate": 2.777777777777778e-05, "loss": 0.9992, "step": 200 }, { "epoch": 1.25, "grad_norm": 6.876852512359619, "learning_rate": 3.125e-05, "loss": 0.9959, "step": 225 }, { "epoch": 1.3888888888888888, "grad_norm": 5.41601037979126, "learning_rate": 3.472222222222222e-05, "loss": 0.9272, "step": 250 }, { "epoch": 1.5277777777777777, "grad_norm": 5.712553024291992, "learning_rate": 3.8194444444444444e-05, "loss": 0.8841, "step": 275 }, { "epoch": 1.6666666666666665, "grad_norm": 6.366713523864746, "learning_rate": 4.166666666666667e-05, "loss": 0.779, "step": 300 }, { "epoch": 1.8055555555555556, "grad_norm": 9.597274780273438, "learning_rate": 4.5138888888888894e-05, "loss": 0.6792, "step": 325 }, { "epoch": 1.9444444444444444, "grad_norm": 5.85896635055542, "learning_rate": 4.8611111111111115e-05, "loss": 0.6303, "step": 350 }, { "epoch": 2.0, "eval_accuracy": 0.6877470355731226, "eval_loss": 0.841946542263031, "eval_runtime": 68.9674, "eval_samples_per_second": 14.674, "eval_steps_per_second": 0.464, "step": 360 }, { "epoch": 2.0833333333333335, "grad_norm": 5.268279075622559, "learning_rate": 4.976851851851852e-05, "loss": 0.5683, "step": 375 }, { "epoch": 2.2222222222222223, "grad_norm": 7.627471923828125, "learning_rate": 4.938271604938271e-05, "loss": 0.5024, "step": 400 }, { "epoch": 2.361111111111111, "grad_norm": 8.074167251586914, "learning_rate": 4.899691358024692e-05, "loss": 0.5199, "step": 425 }, { "epoch": 2.5, "grad_norm": 6.42832612991333, "learning_rate": 4.8611111111111115e-05, "loss": 0.5179, "step": 450 }, { "epoch": 2.638888888888889, "grad_norm": 7.448732852935791, "learning_rate": 4.8225308641975306e-05, "loss": 0.528, "step": 475 }, { "epoch": 2.7777777777777777, "grad_norm": 7.955748558044434, "learning_rate": 4.783950617283951e-05, "loss": 0.4876, "step": 500 }, { "epoch": 2.9166666666666665, "grad_norm": 7.4806742668151855, "learning_rate": 4.745370370370371e-05, "loss": 0.471, "step": 525 }, { "epoch": 3.0, "eval_accuracy": 0.7322134387351779, "eval_loss": 0.7386972904205322, "eval_runtime": 71.6196, "eval_samples_per_second": 14.13, "eval_steps_per_second": 0.447, "step": 540 }, { "epoch": 3.0555555555555554, "grad_norm": 4.674981117248535, "learning_rate": 4.70679012345679e-05, "loss": 0.4377, "step": 550 }, { "epoch": 3.1944444444444446, "grad_norm": 4.392780303955078, "learning_rate": 4.66820987654321e-05, "loss": 0.4075, "step": 575 }, { "epoch": 3.3333333333333335, "grad_norm": 6.68107795715332, "learning_rate": 4.62962962962963e-05, "loss": 0.3958, "step": 600 }, { "epoch": 3.4722222222222223, "grad_norm": 8.04480266571045, "learning_rate": 4.591049382716049e-05, "loss": 0.411, "step": 625 }, { "epoch": 3.611111111111111, "grad_norm": 5.624967098236084, "learning_rate": 4.5524691358024696e-05, "loss": 0.3841, "step": 650 }, { "epoch": 3.75, "grad_norm": 5.001491546630859, "learning_rate": 4.5138888888888894e-05, "loss": 0.3445, "step": 675 }, { "epoch": 3.888888888888889, "grad_norm": 3.7939069271087646, "learning_rate": 4.4753086419753084e-05, "loss": 0.3553, "step": 700 }, { "epoch": 4.0, "eval_accuracy": 0.7737154150197628, "eval_loss": 0.6575068831443787, "eval_runtime": 73.802, "eval_samples_per_second": 13.712, "eval_steps_per_second": 0.434, "step": 720 }, { "epoch": 4.027777777777778, "grad_norm": 3.7097816467285156, "learning_rate": 4.436728395061729e-05, "loss": 0.367, "step": 725 }, { "epoch": 4.166666666666667, "grad_norm": 2.837089776992798, "learning_rate": 4.3981481481481486e-05, "loss": 0.3453, "step": 750 }, { "epoch": 4.305555555555555, "grad_norm": 3.6544575691223145, "learning_rate": 4.359567901234568e-05, "loss": 0.2949, "step": 775 }, { "epoch": 4.444444444444445, "grad_norm": 4.620490550994873, "learning_rate": 4.3209876543209875e-05, "loss": 0.3194, "step": 800 }, { "epoch": 4.583333333333333, "grad_norm": 4.1617937088012695, "learning_rate": 4.282407407407408e-05, "loss": 0.3188, "step": 825 }, { "epoch": 4.722222222222222, "grad_norm": 5.08017635345459, "learning_rate": 4.243827160493827e-05, "loss": 0.2805, "step": 850 }, { "epoch": 4.861111111111111, "grad_norm": 6.545626640319824, "learning_rate": 4.205246913580247e-05, "loss": 0.2989, "step": 875 }, { "epoch": 5.0, "grad_norm": 11.199858665466309, "learning_rate": 4.166666666666667e-05, "loss": 0.3071, "step": 900 }, { "epoch": 5.0, "eval_accuracy": 0.782608695652174, "eval_loss": 0.6260281205177307, "eval_runtime": 72.9279, "eval_samples_per_second": 13.877, "eval_steps_per_second": 0.439, "step": 900 }, { "epoch": 5.138888888888889, "grad_norm": 8.157869338989258, "learning_rate": 4.128086419753087e-05, "loss": 0.2608, "step": 925 }, { "epoch": 5.277777777777778, "grad_norm": 6.550694942474365, "learning_rate": 4.089506172839506e-05, "loss": 0.2635, "step": 950 }, { "epoch": 5.416666666666667, "grad_norm": 4.43126916885376, "learning_rate": 4.0509259259259265e-05, "loss": 0.2524, "step": 975 }, { "epoch": 5.555555555555555, "grad_norm": 5.895445823669434, "learning_rate": 4.012345679012346e-05, "loss": 0.2753, "step": 1000 }, { "epoch": 5.694444444444445, "grad_norm": 4.39314079284668, "learning_rate": 3.973765432098765e-05, "loss": 0.2798, "step": 1025 }, { "epoch": 5.833333333333333, "grad_norm": 7.890945911407471, "learning_rate": 3.935185185185186e-05, "loss": 0.2533, "step": 1050 }, { "epoch": 5.972222222222222, "grad_norm": 3.788822650909424, "learning_rate": 3.8966049382716055e-05, "loss": 0.2206, "step": 1075 }, { "epoch": 6.0, "eval_accuracy": 0.7964426877470355, "eval_loss": 0.60780268907547, "eval_runtime": 72.669, "eval_samples_per_second": 13.926, "eval_steps_per_second": 0.44, "step": 1080 }, { "epoch": 6.111111111111111, "grad_norm": 2.5203280448913574, "learning_rate": 3.8580246913580246e-05, "loss": 0.2321, "step": 1100 }, { "epoch": 6.25, "grad_norm": 4.587474822998047, "learning_rate": 3.8194444444444444e-05, "loss": 0.2245, "step": 1125 }, { "epoch": 6.388888888888889, "grad_norm": 3.7738234996795654, "learning_rate": 3.780864197530865e-05, "loss": 0.2278, "step": 1150 }, { "epoch": 6.527777777777778, "grad_norm": 3.1061084270477295, "learning_rate": 3.742283950617284e-05, "loss": 0.1915, "step": 1175 }, { "epoch": 6.666666666666667, "grad_norm": 3.019678831100464, "learning_rate": 3.7037037037037037e-05, "loss": 0.2292, "step": 1200 }, { "epoch": 6.805555555555555, "grad_norm": 2.68041729927063, "learning_rate": 3.665123456790124e-05, "loss": 0.2283, "step": 1225 }, { "epoch": 6.944444444444445, "grad_norm": 4.879120826721191, "learning_rate": 3.626543209876543e-05, "loss": 0.2027, "step": 1250 }, { "epoch": 7.0, "eval_accuracy": 0.8053359683794467, "eval_loss": 0.5722182393074036, "eval_runtime": 73.8232, "eval_samples_per_second": 13.708, "eval_steps_per_second": 0.433, "step": 1260 }, { "epoch": 7.083333333333333, "grad_norm": 3.7718591690063477, "learning_rate": 3.587962962962963e-05, "loss": 0.1954, "step": 1275 }, { "epoch": 7.222222222222222, "grad_norm": 2.43890118598938, "learning_rate": 3.5493827160493834e-05, "loss": 0.1957, "step": 1300 }, { "epoch": 7.361111111111111, "grad_norm": 2.6753463745117188, "learning_rate": 3.5108024691358025e-05, "loss": 0.1994, "step": 1325 }, { "epoch": 7.5, "grad_norm": 1.5570522546768188, "learning_rate": 3.472222222222222e-05, "loss": 0.1913, "step": 1350 }, { "epoch": 7.638888888888889, "grad_norm": 4.230154037475586, "learning_rate": 3.4336419753086427e-05, "loss": 0.1889, "step": 1375 }, { "epoch": 7.777777777777778, "grad_norm": 2.0178449153900146, "learning_rate": 3.395061728395062e-05, "loss": 0.1914, "step": 1400 }, { "epoch": 7.916666666666667, "grad_norm": 5.009683609008789, "learning_rate": 3.3564814814814815e-05, "loss": 0.1922, "step": 1425 }, { "epoch": 8.0, "eval_accuracy": 0.8092885375494071, "eval_loss": 0.5998839139938354, "eval_runtime": 72.2786, "eval_samples_per_second": 14.001, "eval_steps_per_second": 0.443, "step": 1440 }, { "epoch": 8.055555555555555, "grad_norm": 4.476706027984619, "learning_rate": 3.317901234567901e-05, "loss": 0.1881, "step": 1450 }, { "epoch": 8.194444444444445, "grad_norm": 3.35538649559021, "learning_rate": 3.279320987654321e-05, "loss": 0.1706, "step": 1475 }, { "epoch": 8.333333333333334, "grad_norm": 5.0826826095581055, "learning_rate": 3.240740740740741e-05, "loss": 0.1843, "step": 1500 }, { "epoch": 8.472222222222221, "grad_norm": 2.089994192123413, "learning_rate": 3.2021604938271605e-05, "loss": 0.1752, "step": 1525 }, { "epoch": 8.61111111111111, "grad_norm": 2.181107521057129, "learning_rate": 3.16358024691358e-05, "loss": 0.1795, "step": 1550 }, { "epoch": 8.75, "grad_norm": 2.4433932304382324, "learning_rate": 3.125e-05, "loss": 0.1806, "step": 1575 }, { "epoch": 8.88888888888889, "grad_norm": 2.5213308334350586, "learning_rate": 3.08641975308642e-05, "loss": 0.1801, "step": 1600 }, { "epoch": 9.0, "eval_accuracy": 0.7984189723320159, "eval_loss": 0.6050839424133301, "eval_runtime": 71.6254, "eval_samples_per_second": 14.129, "eval_steps_per_second": 0.447, "step": 1620 }, { "epoch": 9.027777777777779, "grad_norm": 2.464143753051758, "learning_rate": 3.04783950617284e-05, "loss": 0.1823, "step": 1625 }, { "epoch": 9.166666666666666, "grad_norm": 1.9522860050201416, "learning_rate": 3.0092592592592593e-05, "loss": 0.1697, "step": 1650 }, { "epoch": 9.305555555555555, "grad_norm": 3.385920524597168, "learning_rate": 2.970679012345679e-05, "loss": 0.1632, "step": 1675 }, { "epoch": 9.444444444444445, "grad_norm": 3.4966518878936768, "learning_rate": 2.9320987654320992e-05, "loss": 0.1725, "step": 1700 }, { "epoch": 9.583333333333334, "grad_norm": 2.042785406112671, "learning_rate": 2.8935185185185186e-05, "loss": 0.1652, "step": 1725 }, { "epoch": 9.722222222222221, "grad_norm": 1.7008278369903564, "learning_rate": 2.8549382716049384e-05, "loss": 0.1649, "step": 1750 }, { "epoch": 9.86111111111111, "grad_norm": 2.4193341732025146, "learning_rate": 2.8163580246913578e-05, "loss": 0.1575, "step": 1775 }, { "epoch": 10.0, "grad_norm": 9.32138442993164, "learning_rate": 2.777777777777778e-05, "loss": 0.1758, "step": 1800 }, { "epoch": 10.0, "eval_accuracy": 0.8063241106719368, "eval_loss": 0.572110652923584, "eval_runtime": 73.2828, "eval_samples_per_second": 13.81, "eval_steps_per_second": 0.437, "step": 1800 }, { "epoch": 10.13888888888889, "grad_norm": 2.2760612964630127, "learning_rate": 2.7391975308641977e-05, "loss": 0.1435, "step": 1825 }, { "epoch": 10.277777777777779, "grad_norm": 2.5033085346221924, "learning_rate": 2.700617283950617e-05, "loss": 0.1507, "step": 1850 }, { "epoch": 10.416666666666666, "grad_norm": 3.035231590270996, "learning_rate": 2.6620370370370372e-05, "loss": 0.1615, "step": 1875 }, { "epoch": 10.555555555555555, "grad_norm": 1.2022318840026855, "learning_rate": 2.623456790123457e-05, "loss": 0.1507, "step": 1900 }, { "epoch": 10.694444444444445, "grad_norm": 1.8878600597381592, "learning_rate": 2.5848765432098764e-05, "loss": 0.1605, "step": 1925 }, { "epoch": 10.833333333333334, "grad_norm": 1.6575684547424316, "learning_rate": 2.5462962962962965e-05, "loss": 0.1535, "step": 1950 }, { "epoch": 10.972222222222221, "grad_norm": 1.5876245498657227, "learning_rate": 2.5077160493827162e-05, "loss": 0.154, "step": 1975 }, { "epoch": 11.0, "eval_accuracy": 0.8102766798418972, "eval_loss": 0.5764340758323669, "eval_runtime": 69.3741, "eval_samples_per_second": 14.588, "eval_steps_per_second": 0.461, "step": 1980 }, { "epoch": 11.11111111111111, "grad_norm": 2.0368149280548096, "learning_rate": 2.4691358024691357e-05, "loss": 0.1449, "step": 2000 }, { "epoch": 11.25, "grad_norm": 1.995473027229309, "learning_rate": 2.4305555555555558e-05, "loss": 0.1484, "step": 2025 }, { "epoch": 11.38888888888889, "grad_norm": 2.1665401458740234, "learning_rate": 2.3919753086419755e-05, "loss": 0.145, "step": 2050 }, { "epoch": 11.527777777777779, "grad_norm": 1.5415103435516357, "learning_rate": 2.353395061728395e-05, "loss": 0.1472, "step": 2075 }, { "epoch": 11.666666666666666, "grad_norm": 1.7828869819641113, "learning_rate": 2.314814814814815e-05, "loss": 0.1468, "step": 2100 }, { "epoch": 11.805555555555555, "grad_norm": 1.8061931133270264, "learning_rate": 2.2762345679012348e-05, "loss": 0.1542, "step": 2125 }, { "epoch": 11.944444444444445, "grad_norm": 1.6420892477035522, "learning_rate": 2.2376543209876542e-05, "loss": 0.1495, "step": 2150 }, { "epoch": 12.0, "eval_accuracy": 0.817193675889328, "eval_loss": 0.5539035201072693, "eval_runtime": 72.0331, "eval_samples_per_second": 14.049, "eval_steps_per_second": 0.444, "step": 2160 }, { "epoch": 12.083333333333334, "grad_norm": 1.1013528108596802, "learning_rate": 2.1990740740740743e-05, "loss": 0.1324, "step": 2175 }, { "epoch": 12.222222222222221, "grad_norm": 1.1566153764724731, "learning_rate": 2.1604938271604937e-05, "loss": 0.1322, "step": 2200 }, { "epoch": 12.36111111111111, "grad_norm": 2.9402618408203125, "learning_rate": 2.1219135802469135e-05, "loss": 0.1395, "step": 2225 }, { "epoch": 12.5, "grad_norm": 1.4940177202224731, "learning_rate": 2.0833333333333336e-05, "loss": 0.149, "step": 2250 }, { "epoch": 12.63888888888889, "grad_norm": 1.2324851751327515, "learning_rate": 2.044753086419753e-05, "loss": 0.1446, "step": 2275 }, { "epoch": 12.777777777777779, "grad_norm": 1.6738492250442505, "learning_rate": 2.006172839506173e-05, "loss": 0.131, "step": 2300 }, { "epoch": 12.916666666666666, "grad_norm": 1.5494078397750854, "learning_rate": 1.967592592592593e-05, "loss": 0.1334, "step": 2325 }, { "epoch": 13.0, "eval_accuracy": 0.8389328063241107, "eval_loss": 0.527151346206665, "eval_runtime": 71.8766, "eval_samples_per_second": 14.08, "eval_steps_per_second": 0.445, "step": 2340 }, { "epoch": 13.055555555555555, "grad_norm": 1.056087613105774, "learning_rate": 1.9290123456790123e-05, "loss": 0.1444, "step": 2350 }, { "epoch": 13.194444444444445, "grad_norm": 1.6478755474090576, "learning_rate": 1.8904320987654324e-05, "loss": 0.1372, "step": 2375 }, { "epoch": 13.333333333333334, "grad_norm": 1.4524025917053223, "learning_rate": 1.8518518518518518e-05, "loss": 0.1338, "step": 2400 }, { "epoch": 13.472222222222221, "grad_norm": 2.1392762660980225, "learning_rate": 1.8132716049382716e-05, "loss": 0.1295, "step": 2425 }, { "epoch": 13.61111111111111, "grad_norm": 1.3661255836486816, "learning_rate": 1.7746913580246917e-05, "loss": 0.1326, "step": 2450 }, { "epoch": 13.75, "grad_norm": 1.6498640775680542, "learning_rate": 1.736111111111111e-05, "loss": 0.1299, "step": 2475 }, { "epoch": 13.88888888888889, "grad_norm": 2.753546714782715, "learning_rate": 1.697530864197531e-05, "loss": 0.132, "step": 2500 }, { "epoch": 14.0, "eval_accuracy": 0.8270750988142292, "eval_loss": 0.5426644682884216, "eval_runtime": 71.9593, "eval_samples_per_second": 14.063, "eval_steps_per_second": 0.445, "step": 2520 }, { "epoch": 14.027777777777779, "grad_norm": 1.3266198635101318, "learning_rate": 1.6589506172839506e-05, "loss": 0.1408, "step": 2525 }, { "epoch": 14.166666666666666, "grad_norm": 1.1106363534927368, "learning_rate": 1.6203703703703704e-05, "loss": 0.1331, "step": 2550 }, { "epoch": 14.305555555555555, "grad_norm": 1.399956226348877, "learning_rate": 1.58179012345679e-05, "loss": 0.1302, "step": 2575 }, { "epoch": 14.444444444444445, "grad_norm": 1.4343998432159424, "learning_rate": 1.54320987654321e-05, "loss": 0.1309, "step": 2600 }, { "epoch": 14.583333333333334, "grad_norm": 3.668091058731079, "learning_rate": 1.5046296296296297e-05, "loss": 0.1295, "step": 2625 }, { "epoch": 14.722222222222221, "grad_norm": 1.215031623840332, "learning_rate": 1.4660493827160496e-05, "loss": 0.1281, "step": 2650 }, { "epoch": 14.86111111111111, "grad_norm": 1.0562926530838013, "learning_rate": 1.4274691358024692e-05, "loss": 0.1215, "step": 2675 }, { "epoch": 15.0, "grad_norm": 5.751645565032959, "learning_rate": 1.388888888888889e-05, "loss": 0.1284, "step": 2700 }, { "epoch": 15.0, "eval_accuracy": 0.825098814229249, "eval_loss": 0.5345058441162109, "eval_runtime": 73.3445, "eval_samples_per_second": 13.798, "eval_steps_per_second": 0.436, "step": 2700 }, { "epoch": 15.13888888888889, "grad_norm": 2.1824536323547363, "learning_rate": 1.3503086419753085e-05, "loss": 0.1217, "step": 2725 }, { "epoch": 15.277777777777779, "grad_norm": 0.9013480544090271, "learning_rate": 1.3117283950617285e-05, "loss": 0.1232, "step": 2750 }, { "epoch": 15.416666666666666, "grad_norm": 1.235672950744629, "learning_rate": 1.2731481481481482e-05, "loss": 0.1213, "step": 2775 }, { "epoch": 15.555555555555555, "grad_norm": 1.3273124694824219, "learning_rate": 1.2345679012345678e-05, "loss": 0.1242, "step": 2800 }, { "epoch": 15.694444444444445, "grad_norm": 1.6189485788345337, "learning_rate": 1.1959876543209878e-05, "loss": 0.1275, "step": 2825 }, { "epoch": 15.833333333333334, "grad_norm": 1.5683832168579102, "learning_rate": 1.1574074074074075e-05, "loss": 0.126, "step": 2850 }, { "epoch": 15.972222222222221, "grad_norm": 1.721937656402588, "learning_rate": 1.1188271604938271e-05, "loss": 0.1205, "step": 2875 }, { "epoch": 16.0, "eval_accuracy": 0.8349802371541502, "eval_loss": 0.5285242795944214, "eval_runtime": 73.3035, "eval_samples_per_second": 13.806, "eval_steps_per_second": 0.437, "step": 2880 }, { "epoch": 16.11111111111111, "grad_norm": 0.7611171007156372, "learning_rate": 1.0802469135802469e-05, "loss": 0.1188, "step": 2900 }, { "epoch": 16.25, "grad_norm": 1.2971806526184082, "learning_rate": 1.0416666666666668e-05, "loss": 0.122, "step": 2925 }, { "epoch": 16.38888888888889, "grad_norm": 0.7731566429138184, "learning_rate": 1.0030864197530866e-05, "loss": 0.1203, "step": 2950 }, { "epoch": 16.52777777777778, "grad_norm": 1.075944185256958, "learning_rate": 9.645061728395062e-06, "loss": 0.1238, "step": 2975 }, { "epoch": 16.666666666666668, "grad_norm": 0.7787513136863708, "learning_rate": 9.259259259259259e-06, "loss": 0.1179, "step": 3000 }, { "epoch": 16.805555555555557, "grad_norm": 1.0278524160385132, "learning_rate": 8.873456790123458e-06, "loss": 0.1152, "step": 3025 }, { "epoch": 16.944444444444443, "grad_norm": 0.9401047229766846, "learning_rate": 8.487654320987654e-06, "loss": 0.1178, "step": 3050 }, { "epoch": 17.0, "eval_accuracy": 0.8300395256916996, "eval_loss": 0.535774827003479, "eval_runtime": 73.0785, "eval_samples_per_second": 13.848, "eval_steps_per_second": 0.438, "step": 3060 }, { "epoch": 17.083333333333332, "grad_norm": 0.7830631732940674, "learning_rate": 8.101851851851852e-06, "loss": 0.1165, "step": 3075 }, { "epoch": 17.22222222222222, "grad_norm": 0.6156997680664062, "learning_rate": 7.71604938271605e-06, "loss": 0.1172, "step": 3100 }, { "epoch": 17.36111111111111, "grad_norm": 0.7898209691047668, "learning_rate": 7.330246913580248e-06, "loss": 0.1109, "step": 3125 }, { "epoch": 17.5, "grad_norm": 0.995391845703125, "learning_rate": 6.944444444444445e-06, "loss": 0.1184, "step": 3150 }, { "epoch": 17.63888888888889, "grad_norm": 1.293843388557434, "learning_rate": 6.558641975308642e-06, "loss": 0.1128, "step": 3175 }, { "epoch": 17.77777777777778, "grad_norm": 1.179330587387085, "learning_rate": 6.172839506172839e-06, "loss": 0.1197, "step": 3200 }, { "epoch": 17.916666666666668, "grad_norm": 1.5338051319122314, "learning_rate": 5.787037037037038e-06, "loss": 0.1199, "step": 3225 }, { "epoch": 18.0, "eval_accuracy": 0.8349802371541502, "eval_loss": 0.5189934968948364, "eval_runtime": 72.4361, "eval_samples_per_second": 13.971, "eval_steps_per_second": 0.442, "step": 3240 }, { "epoch": 18.055555555555557, "grad_norm": 0.6209427118301392, "learning_rate": 5.401234567901234e-06, "loss": 0.1192, "step": 3250 }, { "epoch": 18.194444444444443, "grad_norm": 0.6023913025856018, "learning_rate": 5.015432098765433e-06, "loss": 0.109, "step": 3275 }, { "epoch": 18.333333333333332, "grad_norm": 1.2056450843811035, "learning_rate": 4.6296296296296296e-06, "loss": 0.1159, "step": 3300 }, { "epoch": 18.47222222222222, "grad_norm": 1.0217233896255493, "learning_rate": 4.243827160493827e-06, "loss": 0.1126, "step": 3325 }, { "epoch": 18.61111111111111, "grad_norm": 0.8297092318534851, "learning_rate": 3.858024691358025e-06, "loss": 0.1146, "step": 3350 }, { "epoch": 18.75, "grad_norm": 0.5455515384674072, "learning_rate": 3.4722222222222224e-06, "loss": 0.1192, "step": 3375 }, { "epoch": 18.88888888888889, "grad_norm": 0.5246239304542542, "learning_rate": 3.0864197530864196e-06, "loss": 0.112, "step": 3400 }, { "epoch": 19.0, "eval_accuracy": 0.8310276679841897, "eval_loss": 0.5260413885116577, "eval_runtime": 73.1437, "eval_samples_per_second": 13.836, "eval_steps_per_second": 0.437, "step": 3420 }, { "epoch": 19.02777777777778, "grad_norm": 0.4471115171909332, "learning_rate": 2.700617283950617e-06, "loss": 0.1185, "step": 3425 }, { "epoch": 19.166666666666668, "grad_norm": 2.6564996242523193, "learning_rate": 2.3148148148148148e-06, "loss": 0.1196, "step": 3450 }, { "epoch": 19.305555555555557, "grad_norm": 0.7345994710922241, "learning_rate": 1.9290123456790124e-06, "loss": 0.1108, "step": 3475 }, { "epoch": 19.444444444444443, "grad_norm": 0.7227876782417297, "learning_rate": 1.5432098765432098e-06, "loss": 0.113, "step": 3500 }, { "epoch": 19.583333333333332, "grad_norm": 0.8343745470046997, "learning_rate": 1.1574074074074074e-06, "loss": 0.1131, "step": 3525 }, { "epoch": 19.72222222222222, "grad_norm": 0.6085208058357239, "learning_rate": 7.716049382716049e-07, "loss": 0.1139, "step": 3550 }, { "epoch": 19.86111111111111, "grad_norm": 0.9748389720916748, "learning_rate": 3.8580246913580245e-07, "loss": 0.1093, "step": 3575 }, { "epoch": 20.0, "grad_norm": 0.821051836013794, "learning_rate": 0.0, "loss": 0.1087, "step": 3600 }, { "epoch": 20.0, "eval_accuracy": 0.83399209486166, "eval_loss": 0.517457902431488, "eval_runtime": 71.867, "eval_samples_per_second": 14.082, "eval_steps_per_second": 0.445, "step": 3600 }, { "epoch": 20.0, "step": 3600, "total_flos": 0.0, "train_loss": 0.2731148091952006, "train_runtime": 18326.3929, "train_samples_per_second": 6.253, "train_steps_per_second": 0.196 } ], "logging_steps": 25, "max_steps": 3600, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }