{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1000.0, "eval_steps": 100, "global_step": 75000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.3333333333333333, "eval_loss": 1.3757460117340088, "eval_runtime": 1.1281, "eval_samples_per_second": 31.912, "eval_steps_per_second": 7.978, "step": 100 }, { "epoch": 2.6666666666666665, "eval_loss": 1.6360117197036743, "eval_runtime": 0.5889, "eval_samples_per_second": 61.129, "eval_steps_per_second": 15.282, "step": 200 }, { "epoch": 4.0, "eval_loss": 1.3093128204345703, "eval_runtime": 0.5549, "eval_samples_per_second": 64.882, "eval_steps_per_second": 16.22, "step": 300 }, { "epoch": 5.333333333333333, "eval_loss": 1.647064208984375, "eval_runtime": 0.5606, "eval_samples_per_second": 64.216, "eval_steps_per_second": 16.054, "step": 400 }, { "epoch": 6.666666666666667, "grad_norm": 213.89942932128906, "learning_rate": 4.966666666666667e-06, "loss": 1.2974, "step": 500 }, { "epoch": 6.666666666666667, "eval_loss": 1.4340702295303345, "eval_runtime": 0.5525, "eval_samples_per_second": 65.162, "eval_steps_per_second": 16.29, "step": 500 }, { "epoch": 8.0, "eval_loss": 1.290921926498413, "eval_runtime": 0.5701, "eval_samples_per_second": 63.148, "eval_steps_per_second": 15.787, "step": 600 }, { "epoch": 9.333333333333334, "eval_loss": 1.5774412155151367, "eval_runtime": 0.5535, "eval_samples_per_second": 65.037, "eval_steps_per_second": 16.259, "step": 700 }, { "epoch": 10.666666666666666, "eval_loss": 1.5692601203918457, "eval_runtime": 0.5558, "eval_samples_per_second": 64.776, "eval_steps_per_second": 16.194, "step": 800 }, { "epoch": 12.0, "eval_loss": 1.3354053497314453, "eval_runtime": 0.5915, "eval_samples_per_second": 60.86, "eval_steps_per_second": 15.215, "step": 900 }, { "epoch": 13.333333333333334, "grad_norm": 48.9247932434082, "learning_rate": 4.933333333333334e-06, "loss": 1.0923, "step": 1000 }, { "epoch": 13.333333333333334, "eval_loss": 1.544665813446045, "eval_runtime": 0.5666, "eval_samples_per_second": 63.535, "eval_steps_per_second": 15.884, "step": 1000 }, { "epoch": 14.666666666666666, "eval_loss": 1.6668956279754639, "eval_runtime": 0.5619, "eval_samples_per_second": 64.072, "eval_steps_per_second": 16.018, "step": 1100 }, { "epoch": 16.0, "eval_loss": 1.6087732315063477, "eval_runtime": 0.547, "eval_samples_per_second": 65.808, "eval_steps_per_second": 16.452, "step": 1200 }, { "epoch": 17.333333333333332, "eval_loss": 1.7851026058197021, "eval_runtime": 0.5551, "eval_samples_per_second": 64.85, "eval_steps_per_second": 16.213, "step": 1300 }, { "epoch": 18.666666666666668, "eval_loss": 1.735277771949768, "eval_runtime": 0.5484, "eval_samples_per_second": 65.641, "eval_steps_per_second": 16.41, "step": 1400 }, { "epoch": 20.0, "grad_norm": 1.0645431280136108, "learning_rate": 4.9000000000000005e-06, "loss": 1.0658, "step": 1500 }, { "epoch": 20.0, "eval_loss": 1.7230966091156006, "eval_runtime": 0.562, "eval_samples_per_second": 64.054, "eval_steps_per_second": 16.014, "step": 1500 }, { "epoch": 21.333333333333332, "eval_loss": 1.9702562093734741, "eval_runtime": 0.7064, "eval_samples_per_second": 50.963, "eval_steps_per_second": 12.741, "step": 1600 }, { "epoch": 22.666666666666668, "eval_loss": 1.83578360080719, "eval_runtime": 0.5514, "eval_samples_per_second": 65.284, "eval_steps_per_second": 16.321, "step": 1700 }, { "epoch": 24.0, "eval_loss": 1.7885479927062988, "eval_runtime": 0.5467, "eval_samples_per_second": 65.855, "eval_steps_per_second": 16.464, "step": 1800 }, { "epoch": 25.333333333333332, "eval_loss": 1.7438658475875854, "eval_runtime": 0.5638, "eval_samples_per_second": 63.848, "eval_steps_per_second": 15.962, "step": 1900 }, { "epoch": 26.666666666666668, "grad_norm": 0.6635001301765442, "learning_rate": 4.866666666666667e-06, "loss": 1.0656, "step": 2000 }, { "epoch": 26.666666666666668, "eval_loss": 1.804993987083435, "eval_runtime": 0.552, "eval_samples_per_second": 65.213, "eval_steps_per_second": 16.303, "step": 2000 }, { "epoch": 28.0, "eval_loss": 1.5398216247558594, "eval_runtime": 0.5664, "eval_samples_per_second": 63.564, "eval_steps_per_second": 15.891, "step": 2100 }, { "epoch": 29.333333333333332, "eval_loss": 1.5121310949325562, "eval_runtime": 0.5505, "eval_samples_per_second": 65.392, "eval_steps_per_second": 16.348, "step": 2200 }, { "epoch": 30.666666666666668, "eval_loss": 1.5401850938796997, "eval_runtime": 0.604, "eval_samples_per_second": 59.599, "eval_steps_per_second": 14.9, "step": 2300 }, { "epoch": 32.0, "eval_loss": 1.719224214553833, "eval_runtime": 0.6796, "eval_samples_per_second": 52.976, "eval_steps_per_second": 13.244, "step": 2400 }, { "epoch": 33.333333333333336, "grad_norm": 0.9279197454452515, "learning_rate": 4.833333333333333e-06, "loss": 1.0312, "step": 2500 }, { "epoch": 33.333333333333336, "eval_loss": 1.7868709564208984, "eval_runtime": 0.5405, "eval_samples_per_second": 66.607, "eval_steps_per_second": 16.652, "step": 2500 }, { "epoch": 34.666666666666664, "eval_loss": 1.7577136754989624, "eval_runtime": 0.5292, "eval_samples_per_second": 68.029, "eval_steps_per_second": 17.007, "step": 2600 }, { "epoch": 36.0, "eval_loss": 1.7127821445465088, "eval_runtime": 0.625, "eval_samples_per_second": 57.596, "eval_steps_per_second": 14.399, "step": 2700 }, { "epoch": 37.333333333333336, "eval_loss": 1.7278578281402588, "eval_runtime": 0.5568, "eval_samples_per_second": 64.656, "eval_steps_per_second": 16.164, "step": 2800 }, { "epoch": 38.666666666666664, "eval_loss": 1.8015283346176147, "eval_runtime": 0.6267, "eval_samples_per_second": 57.439, "eval_steps_per_second": 14.36, "step": 2900 }, { "epoch": 40.0, "grad_norm": 0.12766413390636444, "learning_rate": 4.800000000000001e-06, "loss": 1.0194, "step": 3000 }, { "epoch": 40.0, "eval_loss": 1.7510027885437012, "eval_runtime": 0.5543, "eval_samples_per_second": 64.951, "eval_steps_per_second": 16.238, "step": 3000 }, { "epoch": 41.333333333333336, "eval_loss": 1.721305012702942, "eval_runtime": 0.5475, "eval_samples_per_second": 65.756, "eval_steps_per_second": 16.439, "step": 3100 }, { "epoch": 42.666666666666664, "eval_loss": 1.6871490478515625, "eval_runtime": 0.5643, "eval_samples_per_second": 63.796, "eval_steps_per_second": 15.949, "step": 3200 }, { "epoch": 44.0, "eval_loss": 1.7161273956298828, "eval_runtime": 0.5465, "eval_samples_per_second": 65.877, "eval_steps_per_second": 16.469, "step": 3300 }, { "epoch": 45.333333333333336, "eval_loss": 1.7033265829086304, "eval_runtime": 0.5391, "eval_samples_per_second": 66.778, "eval_steps_per_second": 16.695, "step": 3400 }, { "epoch": 46.666666666666664, "grad_norm": 0.5574055314064026, "learning_rate": 4.766666666666667e-06, "loss": 1.0267, "step": 3500 }, { "epoch": 46.666666666666664, "eval_loss": 1.6089798212051392, "eval_runtime": 0.7236, "eval_samples_per_second": 49.749, "eval_steps_per_second": 12.437, "step": 3500 }, { "epoch": 48.0, "eval_loss": 1.6582438945770264, "eval_runtime": 0.7063, "eval_samples_per_second": 50.972, "eval_steps_per_second": 12.743, "step": 3600 }, { "epoch": 49.333333333333336, "eval_loss": 1.8187174797058105, "eval_runtime": 0.534, "eval_samples_per_second": 67.41, "eval_steps_per_second": 16.852, "step": 3700 }, { "epoch": 50.666666666666664, "eval_loss": 1.7960256338119507, "eval_runtime": 0.5423, "eval_samples_per_second": 66.389, "eval_steps_per_second": 16.597, "step": 3800 }, { "epoch": 52.0, "eval_loss": 1.7180742025375366, "eval_runtime": 0.5771, "eval_samples_per_second": 62.382, "eval_steps_per_second": 15.595, "step": 3900 }, { "epoch": 53.333333333333336, "grad_norm": 0.24446183443069458, "learning_rate": 4.7333333333333335e-06, "loss": 1.0336, "step": 4000 }, { "epoch": 53.333333333333336, "eval_loss": 1.771078109741211, "eval_runtime": 0.5425, "eval_samples_per_second": 66.357, "eval_steps_per_second": 16.589, "step": 4000 }, { "epoch": 54.666666666666664, "eval_loss": 1.8512227535247803, "eval_runtime": 0.5526, "eval_samples_per_second": 65.141, "eval_steps_per_second": 16.285, "step": 4100 }, { "epoch": 56.0, "eval_loss": 1.616445541381836, "eval_runtime": 0.5597, "eval_samples_per_second": 64.322, "eval_steps_per_second": 16.081, "step": 4200 }, { "epoch": 57.333333333333336, "eval_loss": 1.7197967767715454, "eval_runtime": 0.5577, "eval_samples_per_second": 64.546, "eval_steps_per_second": 16.136, "step": 4300 }, { "epoch": 58.666666666666664, "eval_loss": 1.9392327070236206, "eval_runtime": 0.5753, "eval_samples_per_second": 62.574, "eval_steps_per_second": 15.643, "step": 4400 }, { "epoch": 60.0, "grad_norm": 0.9216328859329224, "learning_rate": 4.7e-06, "loss": 1.0152, "step": 4500 }, { "epoch": 60.0, "eval_loss": 1.7398226261138916, "eval_runtime": 0.7329, "eval_samples_per_second": 49.117, "eval_steps_per_second": 12.279, "step": 4500 }, { "epoch": 61.333333333333336, "eval_loss": 1.7675209045410156, "eval_runtime": 0.555, "eval_samples_per_second": 64.863, "eval_steps_per_second": 16.216, "step": 4600 }, { "epoch": 62.666666666666664, "eval_loss": 1.7337052822113037, "eval_runtime": 0.5679, "eval_samples_per_second": 63.388, "eval_steps_per_second": 15.847, "step": 4700 }, { "epoch": 64.0, "eval_loss": 1.9170137643814087, "eval_runtime": 0.5739, "eval_samples_per_second": 62.724, "eval_steps_per_second": 15.681, "step": 4800 }, { "epoch": 65.33333333333333, "eval_loss": 1.9818395376205444, "eval_runtime": 0.5633, "eval_samples_per_second": 63.914, "eval_steps_per_second": 15.979, "step": 4900 }, { "epoch": 66.66666666666667, "grad_norm": 0.20946620404720306, "learning_rate": 4.666666666666667e-06, "loss": 1.0139, "step": 5000 }, { "epoch": 66.66666666666667, "eval_loss": 1.7037873268127441, "eval_runtime": 0.697, "eval_samples_per_second": 51.648, "eval_steps_per_second": 12.912, "step": 5000 }, { "epoch": 68.0, "eval_loss": 1.8328310251235962, "eval_runtime": 0.5542, "eval_samples_per_second": 64.962, "eval_steps_per_second": 16.24, "step": 5100 }, { "epoch": 69.33333333333333, "eval_loss": 1.6285290718078613, "eval_runtime": 0.5758, "eval_samples_per_second": 62.519, "eval_steps_per_second": 15.63, "step": 5200 }, { "epoch": 70.66666666666667, "eval_loss": 1.82057785987854, "eval_runtime": 0.5544, "eval_samples_per_second": 64.937, "eval_steps_per_second": 16.234, "step": 5300 }, { "epoch": 72.0, "eval_loss": 1.8342149257659912, "eval_runtime": 0.5811, "eval_samples_per_second": 61.952, "eval_steps_per_second": 15.488, "step": 5400 }, { "epoch": 73.33333333333333, "grad_norm": 0.3138462007045746, "learning_rate": 4.633333333333334e-06, "loss": 1.0068, "step": 5500 }, { "epoch": 73.33333333333333, "eval_loss": 1.8047531843185425, "eval_runtime": 0.5723, "eval_samples_per_second": 62.903, "eval_steps_per_second": 15.726, "step": 5500 }, { "epoch": 74.66666666666667, "eval_loss": 1.7319685220718384, "eval_runtime": 0.558, "eval_samples_per_second": 64.515, "eval_steps_per_second": 16.129, "step": 5600 }, { "epoch": 76.0, "eval_loss": 1.714286208152771, "eval_runtime": 0.5521, "eval_samples_per_second": 65.209, "eval_steps_per_second": 16.302, "step": 5700 }, { "epoch": 77.33333333333333, "eval_loss": 2.0508413314819336, "eval_runtime": 0.5659, "eval_samples_per_second": 63.61, "eval_steps_per_second": 15.903, "step": 5800 }, { "epoch": 78.66666666666667, "eval_loss": 1.7538458108901978, "eval_runtime": 0.5425, "eval_samples_per_second": 66.364, "eval_steps_per_second": 16.591, "step": 5900 }, { "epoch": 80.0, "grad_norm": 0.19232626259326935, "learning_rate": 4.600000000000001e-06, "loss": 1.0056, "step": 6000 }, { "epoch": 80.0, "eval_loss": 1.7577366828918457, "eval_runtime": 0.5546, "eval_samples_per_second": 64.911, "eval_steps_per_second": 16.228, "step": 6000 }, { "epoch": 81.33333333333333, "eval_loss": 1.714871883392334, "eval_runtime": 0.5495, "eval_samples_per_second": 65.511, "eval_steps_per_second": 16.378, "step": 6100 }, { "epoch": 82.66666666666667, "eval_loss": 1.6159921884536743, "eval_runtime": 0.53, "eval_samples_per_second": 67.921, "eval_steps_per_second": 16.98, "step": 6200 }, { "epoch": 84.0, "eval_loss": 1.6219635009765625, "eval_runtime": 0.714, "eval_samples_per_second": 50.421, "eval_steps_per_second": 12.605, "step": 6300 }, { "epoch": 85.33333333333333, "eval_loss": 1.8384785652160645, "eval_runtime": 0.7485, "eval_samples_per_second": 48.098, "eval_steps_per_second": 12.024, "step": 6400 }, { "epoch": 86.66666666666667, "grad_norm": 0.7167218923568726, "learning_rate": 4.566666666666667e-06, "loss": 1.0003, "step": 6500 }, { "epoch": 86.66666666666667, "eval_loss": 1.6387672424316406, "eval_runtime": 0.5493, "eval_samples_per_second": 65.539, "eval_steps_per_second": 16.385, "step": 6500 }, { "epoch": 88.0, "eval_loss": 1.7860279083251953, "eval_runtime": 0.5698, "eval_samples_per_second": 63.179, "eval_steps_per_second": 15.795, "step": 6600 }, { "epoch": 89.33333333333333, "eval_loss": 1.84665846824646, "eval_runtime": 0.7359, "eval_samples_per_second": 48.919, "eval_steps_per_second": 12.23, "step": 6700 }, { "epoch": 90.66666666666667, "eval_loss": 1.8202018737792969, "eval_runtime": 0.563, "eval_samples_per_second": 63.938, "eval_steps_per_second": 15.985, "step": 6800 }, { "epoch": 92.0, "eval_loss": 1.783063292503357, "eval_runtime": 0.5553, "eval_samples_per_second": 64.829, "eval_steps_per_second": 16.207, "step": 6900 }, { "epoch": 93.33333333333333, "grad_norm": 0.30186399817466736, "learning_rate": 4.533333333333334e-06, "loss": 0.9939, "step": 7000 }, { "epoch": 93.33333333333333, "eval_loss": 1.5783860683441162, "eval_runtime": 0.5485, "eval_samples_per_second": 65.629, "eval_steps_per_second": 16.407, "step": 7000 }, { "epoch": 94.66666666666667, "eval_loss": 1.8026964664459229, "eval_runtime": 0.5317, "eval_samples_per_second": 67.713, "eval_steps_per_second": 16.928, "step": 7100 }, { "epoch": 96.0, "eval_loss": 1.7965619564056396, "eval_runtime": 0.5374, "eval_samples_per_second": 66.985, "eval_steps_per_second": 16.746, "step": 7200 }, { "epoch": 97.33333333333333, "eval_loss": 1.695617914199829, "eval_runtime": 0.5655, "eval_samples_per_second": 63.662, "eval_steps_per_second": 15.915, "step": 7300 }, { "epoch": 98.66666666666667, "eval_loss": 1.8203614950180054, "eval_runtime": 0.5314, "eval_samples_per_second": 67.743, "eval_steps_per_second": 16.936, "step": 7400 }, { "epoch": 100.0, "grad_norm": 0.3792312443256378, "learning_rate": 4.5e-06, "loss": 1.0131, "step": 7500 }, { "epoch": 100.0, "eval_loss": 1.7423609495162964, "eval_runtime": 0.5522, "eval_samples_per_second": 65.192, "eval_steps_per_second": 16.298, "step": 7500 }, { "epoch": 101.33333333333333, "eval_loss": 1.5216121673583984, "eval_runtime": 0.5403, "eval_samples_per_second": 66.631, "eval_steps_per_second": 16.658, "step": 7600 }, { "epoch": 102.66666666666667, "eval_loss": 1.6042466163635254, "eval_runtime": 0.677, "eval_samples_per_second": 53.177, "eval_steps_per_second": 13.294, "step": 7700 }, { "epoch": 104.0, "eval_loss": 1.72611665725708, "eval_runtime": 0.5396, "eval_samples_per_second": 66.715, "eval_steps_per_second": 16.679, "step": 7800 }, { "epoch": 105.33333333333333, "eval_loss": 1.664813756942749, "eval_runtime": 0.5584, "eval_samples_per_second": 64.472, "eval_steps_per_second": 16.118, "step": 7900 }, { "epoch": 106.66666666666667, "grad_norm": 0.9051163792610168, "learning_rate": 4.4666666666666665e-06, "loss": 1.0327, "step": 8000 }, { "epoch": 106.66666666666667, "eval_loss": 1.6512349843978882, "eval_runtime": 0.563, "eval_samples_per_second": 63.946, "eval_steps_per_second": 15.987, "step": 8000 }, { "epoch": 108.0, "eval_loss": 1.718727946281433, "eval_runtime": 0.5381, "eval_samples_per_second": 66.899, "eval_steps_per_second": 16.725, "step": 8100 }, { "epoch": 109.33333333333333, "eval_loss": 1.7837696075439453, "eval_runtime": 0.5508, "eval_samples_per_second": 65.362, "eval_steps_per_second": 16.34, "step": 8200 }, { "epoch": 110.66666666666667, "eval_loss": 1.690812110900879, "eval_runtime": 0.5601, "eval_samples_per_second": 64.273, "eval_steps_per_second": 16.068, "step": 8300 }, { "epoch": 112.0, "eval_loss": 1.5458675622940063, "eval_runtime": 0.5492, "eval_samples_per_second": 65.545, "eval_steps_per_second": 16.386, "step": 8400 }, { "epoch": 113.33333333333333, "grad_norm": 0.2740296423435211, "learning_rate": 4.433333333333334e-06, "loss": 1.004, "step": 8500 }, { "epoch": 113.33333333333333, "eval_loss": 1.8060648441314697, "eval_runtime": 0.6885, "eval_samples_per_second": 52.285, "eval_steps_per_second": 13.071, "step": 8500 }, { "epoch": 114.66666666666667, "eval_loss": 1.8637017011642456, "eval_runtime": 0.5514, "eval_samples_per_second": 65.291, "eval_steps_per_second": 16.323, "step": 8600 }, { "epoch": 116.0, "eval_loss": 1.6184149980545044, "eval_runtime": 0.7652, "eval_samples_per_second": 47.049, "eval_steps_per_second": 11.762, "step": 8700 }, { "epoch": 117.33333333333333, "eval_loss": 1.6870206594467163, "eval_runtime": 0.5477, "eval_samples_per_second": 65.726, "eval_steps_per_second": 16.431, "step": 8800 }, { "epoch": 118.66666666666667, "eval_loss": 1.7125325202941895, "eval_runtime": 0.54, "eval_samples_per_second": 66.662, "eval_steps_per_second": 16.666, "step": 8900 }, { "epoch": 120.0, "grad_norm": 0.21472270786762238, "learning_rate": 4.4e-06, "loss": 0.9897, "step": 9000 }, { "epoch": 120.0, "eval_loss": 1.645700216293335, "eval_runtime": 0.551, "eval_samples_per_second": 65.334, "eval_steps_per_second": 16.334, "step": 9000 }, { "epoch": 121.33333333333333, "eval_loss": 1.6648261547088623, "eval_runtime": 0.5497, "eval_samples_per_second": 65.49, "eval_steps_per_second": 16.372, "step": 9100 }, { "epoch": 122.66666666666667, "eval_loss": 1.6010613441467285, "eval_runtime": 0.5488, "eval_samples_per_second": 65.602, "eval_steps_per_second": 16.401, "step": 9200 }, { "epoch": 124.0, "eval_loss": 1.619473934173584, "eval_runtime": 0.5548, "eval_samples_per_second": 64.888, "eval_steps_per_second": 16.222, "step": 9300 }, { "epoch": 125.33333333333333, "eval_loss": 1.7012939453125, "eval_runtime": 0.5488, "eval_samples_per_second": 65.603, "eval_steps_per_second": 16.401, "step": 9400 }, { "epoch": 126.66666666666667, "grad_norm": 0.11142808943986893, "learning_rate": 4.366666666666667e-06, "loss": 0.9933, "step": 9500 }, { "epoch": 126.66666666666667, "eval_loss": 1.761753797531128, "eval_runtime": 0.5581, "eval_samples_per_second": 64.499, "eval_steps_per_second": 16.125, "step": 9500 }, { "epoch": 128.0, "eval_loss": 1.6933720111846924, "eval_runtime": 0.6676, "eval_samples_per_second": 53.928, "eval_steps_per_second": 13.482, "step": 9600 }, { "epoch": 129.33333333333334, "eval_loss": 1.703611969947815, "eval_runtime": 0.7569, "eval_samples_per_second": 47.563, "eval_steps_per_second": 11.891, "step": 9700 }, { "epoch": 130.66666666666666, "eval_loss": 1.665324091911316, "eval_runtime": 0.5623, "eval_samples_per_second": 64.022, "eval_steps_per_second": 16.005, "step": 9800 }, { "epoch": 132.0, "eval_loss": 1.6665911674499512, "eval_runtime": 0.7246, "eval_samples_per_second": 49.682, "eval_steps_per_second": 12.42, "step": 9900 }, { "epoch": 133.33333333333334, "grad_norm": 0.19828678667545319, "learning_rate": 4.333333333333334e-06, "loss": 0.9757, "step": 10000 }, { "epoch": 133.33333333333334, "eval_loss": 1.73139488697052, "eval_runtime": 0.5969, "eval_samples_per_second": 60.309, "eval_steps_per_second": 15.077, "step": 10000 }, { "epoch": 134.66666666666666, "eval_loss": 1.7937448024749756, "eval_runtime": 0.5259, "eval_samples_per_second": 68.456, "eval_steps_per_second": 17.114, "step": 10100 }, { "epoch": 136.0, "eval_loss": 1.646231770515442, "eval_runtime": 0.5443, "eval_samples_per_second": 66.139, "eval_steps_per_second": 16.535, "step": 10200 }, { "epoch": 137.33333333333334, "eval_loss": 1.543634057044983, "eval_runtime": 0.7301, "eval_samples_per_second": 49.31, "eval_steps_per_second": 12.328, "step": 10300 }, { "epoch": 138.66666666666666, "eval_loss": 2.01108717918396, "eval_runtime": 0.5721, "eval_samples_per_second": 62.929, "eval_steps_per_second": 15.732, "step": 10400 }, { "epoch": 140.0, "grad_norm": 2.6110074520111084, "learning_rate": 4.3e-06, "loss": 0.9935, "step": 10500 }, { "epoch": 140.0, "eval_loss": 1.7568095922470093, "eval_runtime": 0.7221, "eval_samples_per_second": 49.855, "eval_steps_per_second": 12.464, "step": 10500 }, { "epoch": 141.33333333333334, "eval_loss": 1.711548924446106, "eval_runtime": 0.4422, "eval_samples_per_second": 81.407, "eval_steps_per_second": 20.352, "step": 10600 }, { "epoch": 142.66666666666666, "eval_loss": 1.7388983964920044, "eval_runtime": 0.6076, "eval_samples_per_second": 59.252, "eval_steps_per_second": 14.813, "step": 10700 }, { "epoch": 144.0, "eval_loss": 1.7509812116622925, "eval_runtime": 0.4314, "eval_samples_per_second": 83.455, "eval_steps_per_second": 20.864, "step": 10800 }, { "epoch": 145.33333333333334, "eval_loss": 1.9284065961837769, "eval_runtime": 0.4415, "eval_samples_per_second": 81.531, "eval_steps_per_second": 20.383, "step": 10900 }, { "epoch": 146.66666666666666, "grad_norm": 0.9369779229164124, "learning_rate": 4.266666666666668e-06, "loss": 0.9858, "step": 11000 }, { "epoch": 146.66666666666666, "eval_loss": 1.8226513862609863, "eval_runtime": 0.6059, "eval_samples_per_second": 59.415, "eval_steps_per_second": 14.854, "step": 11000 }, { "epoch": 148.0, "eval_loss": 1.5214120149612427, "eval_runtime": 0.445, "eval_samples_per_second": 80.902, "eval_steps_per_second": 20.225, "step": 11100 }, { "epoch": 149.33333333333334, "eval_loss": 1.7091901302337646, "eval_runtime": 0.588, "eval_samples_per_second": 61.225, "eval_steps_per_second": 15.306, "step": 11200 }, { "epoch": 150.66666666666666, "eval_loss": 1.8387616872787476, "eval_runtime": 0.4733, "eval_samples_per_second": 76.061, "eval_steps_per_second": 19.015, "step": 11300 }, { "epoch": 152.0, "eval_loss": 1.6450871229171753, "eval_runtime": 0.4586, "eval_samples_per_second": 78.497, "eval_steps_per_second": 19.624, "step": 11400 }, { "epoch": 153.33333333333334, "grad_norm": 0.3782365918159485, "learning_rate": 4.233333333333334e-06, "loss": 1.0067, "step": 11500 }, { "epoch": 153.33333333333334, "eval_loss": 1.7066504955291748, "eval_runtime": 0.5172, "eval_samples_per_second": 69.608, "eval_steps_per_second": 17.402, "step": 11500 }, { "epoch": 154.66666666666666, "eval_loss": 1.604575514793396, "eval_runtime": 0.5122, "eval_samples_per_second": 70.285, "eval_steps_per_second": 17.571, "step": 11600 }, { "epoch": 156.0, "eval_loss": 1.6562608480453491, "eval_runtime": 0.4925, "eval_samples_per_second": 73.089, "eval_steps_per_second": 18.272, "step": 11700 }, { "epoch": 157.33333333333334, "eval_loss": 1.6087099313735962, "eval_runtime": 0.4819, "eval_samples_per_second": 74.702, "eval_steps_per_second": 18.675, "step": 11800 }, { "epoch": 158.66666666666666, "eval_loss": 1.6842621564865112, "eval_runtime": 0.4738, "eval_samples_per_second": 75.985, "eval_steps_per_second": 18.996, "step": 11900 }, { "epoch": 160.0, "grad_norm": 0.3049619495868683, "learning_rate": 4.2000000000000004e-06, "loss": 0.9771, "step": 12000 }, { "epoch": 160.0, "eval_loss": 1.7211833000183105, "eval_runtime": 0.488, "eval_samples_per_second": 73.77, "eval_steps_per_second": 18.442, "step": 12000 }, { "epoch": 161.33333333333334, "eval_loss": 1.761918306350708, "eval_runtime": 0.537, "eval_samples_per_second": 67.039, "eval_steps_per_second": 16.76, "step": 12100 }, { "epoch": 162.66666666666666, "eval_loss": 1.7758536338806152, "eval_runtime": 0.6667, "eval_samples_per_second": 53.996, "eval_steps_per_second": 13.499, "step": 12200 }, { "epoch": 164.0, "eval_loss": 1.858489751815796, "eval_runtime": 0.5186, "eval_samples_per_second": 69.412, "eval_steps_per_second": 17.353, "step": 12300 }, { "epoch": 165.33333333333334, "eval_loss": 1.8833012580871582, "eval_runtime": 0.5112, "eval_samples_per_second": 70.417, "eval_steps_per_second": 17.604, "step": 12400 }, { "epoch": 166.66666666666666, "grad_norm": 0.43115538358688354, "learning_rate": 4.166666666666667e-06, "loss": 1.0029, "step": 12500 }, { "epoch": 166.66666666666666, "eval_loss": 1.5844683647155762, "eval_runtime": 0.51, "eval_samples_per_second": 70.587, "eval_steps_per_second": 17.647, "step": 12500 }, { "epoch": 168.0, "eval_loss": 1.7249573469161987, "eval_runtime": 0.5094, "eval_samples_per_second": 70.678, "eval_steps_per_second": 17.67, "step": 12600 }, { "epoch": 169.33333333333334, "eval_loss": 1.6341278553009033, "eval_runtime": 0.6722, "eval_samples_per_second": 53.553, "eval_steps_per_second": 13.388, "step": 12700 }, { "epoch": 170.66666666666666, "eval_loss": 1.6907905340194702, "eval_runtime": 0.5494, "eval_samples_per_second": 65.527, "eval_steps_per_second": 16.382, "step": 12800 }, { "epoch": 172.0, "eval_loss": 1.7291339635849, "eval_runtime": 0.708, "eval_samples_per_second": 50.847, "eval_steps_per_second": 12.712, "step": 12900 }, { "epoch": 173.33333333333334, "grad_norm": 0.6758123636245728, "learning_rate": 4.133333333333333e-06, "loss": 1.0056, "step": 13000 }, { "epoch": 173.33333333333334, "eval_loss": 1.7933716773986816, "eval_runtime": 0.5444, "eval_samples_per_second": 66.125, "eval_steps_per_second": 16.531, "step": 13000 }, { "epoch": 174.66666666666666, "eval_loss": 1.686177372932434, "eval_runtime": 0.5321, "eval_samples_per_second": 67.655, "eval_steps_per_second": 16.914, "step": 13100 }, { "epoch": 176.0, "eval_loss": 1.6645054817199707, "eval_runtime": 0.5433, "eval_samples_per_second": 66.266, "eval_steps_per_second": 16.566, "step": 13200 }, { "epoch": 177.33333333333334, "eval_loss": 1.626644492149353, "eval_runtime": 0.5324, "eval_samples_per_second": 67.614, "eval_steps_per_second": 16.904, "step": 13300 }, { "epoch": 178.66666666666666, "eval_loss": 1.459790587425232, "eval_runtime": 0.5525, "eval_samples_per_second": 65.159, "eval_steps_per_second": 16.29, "step": 13400 }, { "epoch": 180.0, "grad_norm": 5.249213218688965, "learning_rate": 4.1e-06, "loss": 0.9797, "step": 13500 }, { "epoch": 180.0, "eval_loss": 1.5745149850845337, "eval_runtime": 0.7629, "eval_samples_per_second": 47.189, "eval_steps_per_second": 11.797, "step": 13500 }, { "epoch": 181.33333333333334, "eval_loss": 1.4517251253128052, "eval_runtime": 0.5467, "eval_samples_per_second": 65.844, "eval_steps_per_second": 16.461, "step": 13600 }, { "epoch": 182.66666666666666, "eval_loss": 1.7139960527420044, "eval_runtime": 0.5288, "eval_samples_per_second": 68.079, "eval_steps_per_second": 17.02, "step": 13700 }, { "epoch": 184.0, "eval_loss": 1.7389436960220337, "eval_runtime": 0.5445, "eval_samples_per_second": 66.111, "eval_steps_per_second": 16.528, "step": 13800 }, { "epoch": 185.33333333333334, "eval_loss": 1.7348161935806274, "eval_runtime": 0.5689, "eval_samples_per_second": 63.277, "eval_steps_per_second": 15.819, "step": 13900 }, { "epoch": 186.66666666666666, "grad_norm": 0.29905539751052856, "learning_rate": 4.066666666666667e-06, "loss": 0.9865, "step": 14000 }, { "epoch": 186.66666666666666, "eval_loss": 1.5543586015701294, "eval_runtime": 0.5462, "eval_samples_per_second": 65.904, "eval_steps_per_second": 16.476, "step": 14000 }, { "epoch": 188.0, "eval_loss": 1.8664463758468628, "eval_runtime": 0.5575, "eval_samples_per_second": 64.57, "eval_steps_per_second": 16.143, "step": 14100 }, { "epoch": 189.33333333333334, "eval_loss": 1.5657261610031128, "eval_runtime": 0.5514, "eval_samples_per_second": 65.283, "eval_steps_per_second": 16.321, "step": 14200 }, { "epoch": 190.66666666666666, "eval_loss": 1.7389534711837769, "eval_runtime": 0.7354, "eval_samples_per_second": 48.951, "eval_steps_per_second": 12.238, "step": 14300 }, { "epoch": 192.0, "eval_loss": 2.0766286849975586, "eval_runtime": 0.712, "eval_samples_per_second": 50.564, "eval_steps_per_second": 12.641, "step": 14400 }, { "epoch": 193.33333333333334, "grad_norm": 0.31173381209373474, "learning_rate": 4.033333333333333e-06, "loss": 0.9972, "step": 14500 }, { "epoch": 193.33333333333334, "eval_loss": 1.7838109731674194, "eval_runtime": 0.5439, "eval_samples_per_second": 66.194, "eval_steps_per_second": 16.548, "step": 14500 }, { "epoch": 194.66666666666666, "eval_loss": 1.7652925252914429, "eval_runtime": 0.5885, "eval_samples_per_second": 61.171, "eval_steps_per_second": 15.293, "step": 14600 }, { "epoch": 196.0, "eval_loss": 1.6499571800231934, "eval_runtime": 0.581, "eval_samples_per_second": 61.967, "eval_steps_per_second": 15.492, "step": 14700 }, { "epoch": 197.33333333333334, "eval_loss": 1.6213910579681396, "eval_runtime": 0.6153, "eval_samples_per_second": 58.503, "eval_steps_per_second": 14.626, "step": 14800 }, { "epoch": 198.66666666666666, "eval_loss": 1.6204463243484497, "eval_runtime": 0.5544, "eval_samples_per_second": 64.929, "eval_steps_per_second": 16.232, "step": 14900 }, { "epoch": 200.0, "grad_norm": 0.23956339061260223, "learning_rate": 4.000000000000001e-06, "loss": 0.9843, "step": 15000 }, { "epoch": 200.0, "eval_loss": 1.880735158920288, "eval_runtime": 0.5707, "eval_samples_per_second": 63.081, "eval_steps_per_second": 15.77, "step": 15000 }, { "epoch": 201.33333333333334, "eval_loss": 1.7442986965179443, "eval_runtime": 0.5538, "eval_samples_per_second": 65.0, "eval_steps_per_second": 16.25, "step": 15100 }, { "epoch": 202.66666666666666, "eval_loss": 1.6454511880874634, "eval_runtime": 0.7049, "eval_samples_per_second": 51.072, "eval_steps_per_second": 12.768, "step": 15200 }, { "epoch": 204.0, "eval_loss": 1.7235594987869263, "eval_runtime": 0.7481, "eval_samples_per_second": 48.121, "eval_steps_per_second": 12.03, "step": 15300 }, { "epoch": 205.33333333333334, "eval_loss": 1.6651794910430908, "eval_runtime": 0.5533, "eval_samples_per_second": 65.06, "eval_steps_per_second": 16.265, "step": 15400 }, { "epoch": 206.66666666666666, "grad_norm": 0.1084405705332756, "learning_rate": 3.966666666666667e-06, "loss": 0.9878, "step": 15500 }, { "epoch": 206.66666666666666, "eval_loss": 1.8001526594161987, "eval_runtime": 0.6208, "eval_samples_per_second": 57.992, "eval_steps_per_second": 14.498, "step": 15500 }, { "epoch": 208.0, "eval_loss": 1.8208098411560059, "eval_runtime": 0.5518, "eval_samples_per_second": 65.242, "eval_steps_per_second": 16.31, "step": 15600 }, { "epoch": 209.33333333333334, "eval_loss": 1.6776872873306274, "eval_runtime": 0.5461, "eval_samples_per_second": 65.917, "eval_steps_per_second": 16.479, "step": 15700 }, { "epoch": 210.66666666666666, "eval_loss": 2.075190782546997, "eval_runtime": 0.5458, "eval_samples_per_second": 65.961, "eval_steps_per_second": 16.49, "step": 15800 }, { "epoch": 212.0, "eval_loss": 1.7014814615249634, "eval_runtime": 0.5583, "eval_samples_per_second": 64.48, "eval_steps_per_second": 16.12, "step": 15900 }, { "epoch": 213.33333333333334, "grad_norm": 0.1353149712085724, "learning_rate": 3.9333333333333335e-06, "loss": 0.9926, "step": 16000 }, { "epoch": 213.33333333333334, "eval_loss": 1.657114028930664, "eval_runtime": 0.5583, "eval_samples_per_second": 64.477, "eval_steps_per_second": 16.119, "step": 16000 }, { "epoch": 214.66666666666666, "eval_loss": 1.7757220268249512, "eval_runtime": 0.5618, "eval_samples_per_second": 64.081, "eval_steps_per_second": 16.02, "step": 16100 }, { "epoch": 216.0, "eval_loss": 1.7986140251159668, "eval_runtime": 0.5504, "eval_samples_per_second": 65.408, "eval_steps_per_second": 16.352, "step": 16200 }, { "epoch": 217.33333333333334, "eval_loss": 1.6359658241271973, "eval_runtime": 0.5573, "eval_samples_per_second": 64.598, "eval_steps_per_second": 16.15, "step": 16300 }, { "epoch": 218.66666666666666, "eval_loss": 1.7862355709075928, "eval_runtime": 0.5545, "eval_samples_per_second": 64.919, "eval_steps_per_second": 16.23, "step": 16400 }, { "epoch": 220.0, "grad_norm": 0.2572534382343292, "learning_rate": 3.900000000000001e-06, "loss": 0.9993, "step": 16500 }, { "epoch": 220.0, "eval_loss": 1.8996151685714722, "eval_runtime": 0.5561, "eval_samples_per_second": 64.739, "eval_steps_per_second": 16.185, "step": 16500 }, { "epoch": 221.33333333333334, "eval_loss": 1.689023494720459, "eval_runtime": 0.5391, "eval_samples_per_second": 66.782, "eval_steps_per_second": 16.695, "step": 16600 }, { "epoch": 222.66666666666666, "eval_loss": 2.096752643585205, "eval_runtime": 0.5676, "eval_samples_per_second": 63.426, "eval_steps_per_second": 15.856, "step": 16700 }, { "epoch": 224.0, "eval_loss": 1.6648435592651367, "eval_runtime": 0.5501, "eval_samples_per_second": 65.442, "eval_steps_per_second": 16.36, "step": 16800 }, { "epoch": 225.33333333333334, "eval_loss": 1.5713462829589844, "eval_runtime": 0.5401, "eval_samples_per_second": 66.658, "eval_steps_per_second": 16.665, "step": 16900 }, { "epoch": 226.66666666666666, "grad_norm": 10.063590049743652, "learning_rate": 3.866666666666667e-06, "loss": 0.9925, "step": 17000 }, { "epoch": 226.66666666666666, "eval_loss": 1.8458077907562256, "eval_runtime": 0.5553, "eval_samples_per_second": 64.826, "eval_steps_per_second": 16.207, "step": 17000 }, { "epoch": 228.0, "eval_loss": 1.770579218864441, "eval_runtime": 0.772, "eval_samples_per_second": 46.629, "eval_steps_per_second": 11.657, "step": 17100 }, { "epoch": 229.33333333333334, "eval_loss": 1.9782880544662476, "eval_runtime": 0.5365, "eval_samples_per_second": 67.105, "eval_steps_per_second": 16.776, "step": 17200 }, { "epoch": 230.66666666666666, "eval_loss": 1.9077173471450806, "eval_runtime": 0.5759, "eval_samples_per_second": 62.508, "eval_steps_per_second": 15.627, "step": 17300 }, { "epoch": 232.0, "eval_loss": 1.4449816942214966, "eval_runtime": 0.5399, "eval_samples_per_second": 66.676, "eval_steps_per_second": 16.669, "step": 17400 }, { "epoch": 233.33333333333334, "grad_norm": 0.3613598346710205, "learning_rate": 3.833333333333334e-06, "loss": 0.9964, "step": 17500 }, { "epoch": 233.33333333333334, "eval_loss": 1.6719191074371338, "eval_runtime": 0.5616, "eval_samples_per_second": 64.105, "eval_steps_per_second": 16.026, "step": 17500 }, { "epoch": 234.66666666666666, "eval_loss": 2.0009090900421143, "eval_runtime": 0.5905, "eval_samples_per_second": 60.962, "eval_steps_per_second": 15.24, "step": 17600 }, { "epoch": 236.0, "eval_loss": 1.8131402730941772, "eval_runtime": 0.5372, "eval_samples_per_second": 67.016, "eval_steps_per_second": 16.754, "step": 17700 }, { "epoch": 237.33333333333334, "eval_loss": 1.9489625692367554, "eval_runtime": 0.5226, "eval_samples_per_second": 68.888, "eval_steps_per_second": 17.222, "step": 17800 }, { "epoch": 238.66666666666666, "eval_loss": 1.960273265838623, "eval_runtime": 0.5499, "eval_samples_per_second": 65.465, "eval_steps_per_second": 16.366, "step": 17900 }, { "epoch": 240.0, "grad_norm": 0.07147068530321121, "learning_rate": 3.8000000000000005e-06, "loss": 1.0044, "step": 18000 }, { "epoch": 240.0, "eval_loss": 1.9288198947906494, "eval_runtime": 0.7323, "eval_samples_per_second": 49.163, "eval_steps_per_second": 12.291, "step": 18000 }, { "epoch": 241.33333333333334, "eval_loss": 1.57040536403656, "eval_runtime": 0.5674, "eval_samples_per_second": 63.447, "eval_steps_per_second": 15.862, "step": 18100 }, { "epoch": 242.66666666666666, "eval_loss": 1.6067564487457275, "eval_runtime": 0.5562, "eval_samples_per_second": 64.729, "eval_steps_per_second": 16.182, "step": 18200 }, { "epoch": 244.0, "eval_loss": 1.5850058794021606, "eval_runtime": 0.5313, "eval_samples_per_second": 67.762, "eval_steps_per_second": 16.94, "step": 18300 }, { "epoch": 245.33333333333334, "eval_loss": 2.0469565391540527, "eval_runtime": 0.5511, "eval_samples_per_second": 65.324, "eval_steps_per_second": 16.331, "step": 18400 }, { "epoch": 246.66666666666666, "grad_norm": 0.25046324729919434, "learning_rate": 3.766666666666667e-06, "loss": 0.9872, "step": 18500 }, { "epoch": 246.66666666666666, "eval_loss": 1.722219467163086, "eval_runtime": 0.7141, "eval_samples_per_second": 50.41, "eval_steps_per_second": 12.603, "step": 18500 }, { "epoch": 248.0, "eval_loss": 1.8908432722091675, "eval_runtime": 0.5481, "eval_samples_per_second": 65.684, "eval_steps_per_second": 16.421, "step": 18600 }, { "epoch": 249.33333333333334, "eval_loss": 1.9145712852478027, "eval_runtime": 0.5498, "eval_samples_per_second": 65.48, "eval_steps_per_second": 16.37, "step": 18700 }, { "epoch": 250.66666666666666, "eval_loss": 1.598602056503296, "eval_runtime": 0.5567, "eval_samples_per_second": 64.669, "eval_steps_per_second": 16.167, "step": 18800 }, { "epoch": 252.0, "eval_loss": 1.6978509426116943, "eval_runtime": 0.552, "eval_samples_per_second": 65.213, "eval_steps_per_second": 16.303, "step": 18900 }, { "epoch": 253.33333333333334, "grad_norm": 0.09876473248004913, "learning_rate": 3.7333333333333337e-06, "loss": 0.9792, "step": 19000 }, { "epoch": 253.33333333333334, "eval_loss": 1.6713515520095825, "eval_runtime": 0.5377, "eval_samples_per_second": 66.956, "eval_steps_per_second": 16.739, "step": 19000 }, { "epoch": 254.66666666666666, "eval_loss": 1.8374097347259521, "eval_runtime": 0.5587, "eval_samples_per_second": 64.437, "eval_steps_per_second": 16.109, "step": 19100 }, { "epoch": 256.0, "eval_loss": 1.644196629524231, "eval_runtime": 0.5366, "eval_samples_per_second": 67.089, "eval_steps_per_second": 16.772, "step": 19200 }, { "epoch": 257.3333333333333, "eval_loss": 1.7039121389389038, "eval_runtime": 0.5633, "eval_samples_per_second": 63.911, "eval_steps_per_second": 15.978, "step": 19300 }, { "epoch": 258.6666666666667, "eval_loss": 1.8680918216705322, "eval_runtime": 0.5507, "eval_samples_per_second": 65.374, "eval_steps_per_second": 16.344, "step": 19400 }, { "epoch": 260.0, "grad_norm": 0.08226627856492996, "learning_rate": 3.7e-06, "loss": 0.9874, "step": 19500 }, { "epoch": 260.0, "eval_loss": 1.9040900468826294, "eval_runtime": 0.5417, "eval_samples_per_second": 66.46, "eval_steps_per_second": 16.615, "step": 19500 }, { "epoch": 261.3333333333333, "eval_loss": 2.081179618835449, "eval_runtime": 0.5509, "eval_samples_per_second": 65.351, "eval_steps_per_second": 16.338, "step": 19600 }, { "epoch": 262.6666666666667, "eval_loss": 2.159592628479004, "eval_runtime": 0.719, "eval_samples_per_second": 50.072, "eval_steps_per_second": 12.518, "step": 19700 }, { "epoch": 264.0, "eval_loss": 1.889189600944519, "eval_runtime": 0.5471, "eval_samples_per_second": 65.8, "eval_steps_per_second": 16.45, "step": 19800 }, { "epoch": 265.3333333333333, "eval_loss": 2.0072860717773438, "eval_runtime": 0.7128, "eval_samples_per_second": 50.502, "eval_steps_per_second": 12.626, "step": 19900 }, { "epoch": 266.6666666666667, "grad_norm": 1.5182013511657715, "learning_rate": 3.6666666666666666e-06, "loss": 0.9658, "step": 20000 }, { "epoch": 266.6666666666667, "eval_loss": 2.1017017364501953, "eval_runtime": 0.5523, "eval_samples_per_second": 65.181, "eval_steps_per_second": 16.295, "step": 20000 }, { "epoch": 268.0, "eval_loss": 1.9787592887878418, "eval_runtime": 0.552, "eval_samples_per_second": 65.221, "eval_steps_per_second": 16.305, "step": 20100 }, { "epoch": 269.3333333333333, "eval_loss": 1.9468598365783691, "eval_runtime": 0.5502, "eval_samples_per_second": 65.436, "eval_steps_per_second": 16.359, "step": 20200 }, { "epoch": 270.6666666666667, "eval_loss": 1.7929949760437012, "eval_runtime": 0.5784, "eval_samples_per_second": 62.244, "eval_steps_per_second": 15.561, "step": 20300 }, { "epoch": 272.0, "eval_loss": 2.047785758972168, "eval_runtime": 0.547, "eval_samples_per_second": 65.818, "eval_steps_per_second": 16.455, "step": 20400 }, { "epoch": 273.3333333333333, "grad_norm": 1.3280600309371948, "learning_rate": 3.633333333333334e-06, "loss": 0.9922, "step": 20500 }, { "epoch": 273.3333333333333, "eval_loss": 1.8701856136322021, "eval_runtime": 0.5292, "eval_samples_per_second": 68.029, "eval_steps_per_second": 17.007, "step": 20500 }, { "epoch": 274.6666666666667, "eval_loss": 1.7104636430740356, "eval_runtime": 0.5623, "eval_samples_per_second": 64.025, "eval_steps_per_second": 16.006, "step": 20600 }, { "epoch": 276.0, "eval_loss": 1.707152247428894, "eval_runtime": 0.545, "eval_samples_per_second": 66.051, "eval_steps_per_second": 16.513, "step": 20700 }, { "epoch": 277.3333333333333, "eval_loss": 1.6416645050048828, "eval_runtime": 0.647, "eval_samples_per_second": 55.639, "eval_steps_per_second": 13.91, "step": 20800 }, { "epoch": 278.6666666666667, "eval_loss": 1.7585358619689941, "eval_runtime": 0.5328, "eval_samples_per_second": 67.571, "eval_steps_per_second": 16.893, "step": 20900 }, { "epoch": 280.0, "grad_norm": 0.09741370379924774, "learning_rate": 3.6000000000000003e-06, "loss": 0.9838, "step": 21000 }, { "epoch": 280.0, "eval_loss": 1.7427773475646973, "eval_runtime": 0.5657, "eval_samples_per_second": 63.638, "eval_steps_per_second": 15.909, "step": 21000 }, { "epoch": 281.3333333333333, "eval_loss": 1.791472315788269, "eval_runtime": 0.549, "eval_samples_per_second": 65.577, "eval_steps_per_second": 16.394, "step": 21100 }, { "epoch": 282.6666666666667, "eval_loss": 1.7449997663497925, "eval_runtime": 0.5306, "eval_samples_per_second": 67.848, "eval_steps_per_second": 16.962, "step": 21200 }, { "epoch": 284.0, "eval_loss": 1.8439627885818481, "eval_runtime": 0.5204, "eval_samples_per_second": 69.182, "eval_steps_per_second": 17.295, "step": 21300 }, { "epoch": 285.3333333333333, "eval_loss": 1.8163613080978394, "eval_runtime": 0.5763, "eval_samples_per_second": 62.472, "eval_steps_per_second": 15.618, "step": 21400 }, { "epoch": 286.6666666666667, "grad_norm": 0.11030168831348419, "learning_rate": 3.566666666666667e-06, "loss": 0.9815, "step": 21500 }, { "epoch": 286.6666666666667, "eval_loss": 1.7035778760910034, "eval_runtime": 0.5376, "eval_samples_per_second": 66.965, "eval_steps_per_second": 16.741, "step": 21500 }, { "epoch": 288.0, "eval_loss": 1.832694172859192, "eval_runtime": 0.5689, "eval_samples_per_second": 63.276, "eval_steps_per_second": 15.819, "step": 21600 }, { "epoch": 289.3333333333333, "eval_loss": 1.6627203226089478, "eval_runtime": 0.5297, "eval_samples_per_second": 67.961, "eval_steps_per_second": 16.99, "step": 21700 }, { "epoch": 290.6666666666667, "eval_loss": 1.6602559089660645, "eval_runtime": 0.7206, "eval_samples_per_second": 49.962, "eval_steps_per_second": 12.49, "step": 21800 }, { "epoch": 292.0, "eval_loss": 1.670076608657837, "eval_runtime": 0.5544, "eval_samples_per_second": 64.931, "eval_steps_per_second": 16.233, "step": 21900 }, { "epoch": 293.3333333333333, "grad_norm": 0.10531683266162872, "learning_rate": 3.5333333333333335e-06, "loss": 0.9836, "step": 22000 }, { "epoch": 293.3333333333333, "eval_loss": 1.7467204332351685, "eval_runtime": 0.5744, "eval_samples_per_second": 62.671, "eval_steps_per_second": 15.668, "step": 22000 }, { "epoch": 294.6666666666667, "eval_loss": 1.8038678169250488, "eval_runtime": 0.5674, "eval_samples_per_second": 63.447, "eval_steps_per_second": 15.862, "step": 22100 }, { "epoch": 296.0, "eval_loss": 1.6190006732940674, "eval_runtime": 0.5479, "eval_samples_per_second": 65.701, "eval_steps_per_second": 16.425, "step": 22200 }, { "epoch": 297.3333333333333, "eval_loss": 1.6193716526031494, "eval_runtime": 0.5508, "eval_samples_per_second": 65.36, "eval_steps_per_second": 16.34, "step": 22300 }, { "epoch": 298.6666666666667, "eval_loss": 1.5753411054611206, "eval_runtime": 0.5655, "eval_samples_per_second": 63.655, "eval_steps_per_second": 15.914, "step": 22400 }, { "epoch": 300.0, "grad_norm": 0.06573054939508438, "learning_rate": 3.5e-06, "loss": 0.9888, "step": 22500 }, { "epoch": 300.0, "eval_loss": 1.4922865629196167, "eval_runtime": 0.5485, "eval_samples_per_second": 65.63, "eval_steps_per_second": 16.408, "step": 22500 }, { "epoch": 301.3333333333333, "eval_loss": 1.5360524654388428, "eval_runtime": 0.5592, "eval_samples_per_second": 64.381, "eval_steps_per_second": 16.095, "step": 22600 }, { "epoch": 302.6666666666667, "eval_loss": 1.887408971786499, "eval_runtime": 0.5424, "eval_samples_per_second": 66.374, "eval_steps_per_second": 16.594, "step": 22700 }, { "epoch": 304.0, "eval_loss": 1.8452906608581543, "eval_runtime": 0.5388, "eval_samples_per_second": 66.815, "eval_steps_per_second": 16.704, "step": 22800 }, { "epoch": 305.3333333333333, "eval_loss": 1.672760009765625, "eval_runtime": 0.5616, "eval_samples_per_second": 64.099, "eval_steps_per_second": 16.025, "step": 22900 }, { "epoch": 306.6666666666667, "grad_norm": 0.11862615495920181, "learning_rate": 3.4666666666666672e-06, "loss": 0.9797, "step": 23000 }, { "epoch": 306.6666666666667, "eval_loss": 1.8921773433685303, "eval_runtime": 0.5475, "eval_samples_per_second": 65.75, "eval_steps_per_second": 16.438, "step": 23000 }, { "epoch": 308.0, "eval_loss": 1.86057710647583, "eval_runtime": 0.5526, "eval_samples_per_second": 65.152, "eval_steps_per_second": 16.288, "step": 23100 }, { "epoch": 309.3333333333333, "eval_loss": 1.9504114389419556, "eval_runtime": 0.541, "eval_samples_per_second": 66.542, "eval_steps_per_second": 16.635, "step": 23200 }, { "epoch": 310.6666666666667, "eval_loss": 1.6589155197143555, "eval_runtime": 0.547, "eval_samples_per_second": 65.809, "eval_steps_per_second": 16.452, "step": 23300 }, { "epoch": 312.0, "eval_loss": 1.5259159803390503, "eval_runtime": 0.5302, "eval_samples_per_second": 67.894, "eval_steps_per_second": 16.974, "step": 23400 }, { "epoch": 313.3333333333333, "grad_norm": 0.060514457523822784, "learning_rate": 3.4333333333333336e-06, "loss": 0.9917, "step": 23500 }, { "epoch": 313.3333333333333, "eval_loss": 1.5227484703063965, "eval_runtime": 0.5278, "eval_samples_per_second": 68.203, "eval_steps_per_second": 17.051, "step": 23500 }, { "epoch": 314.6666666666667, "eval_loss": 1.6789741516113281, "eval_runtime": 0.5415, "eval_samples_per_second": 66.476, "eval_steps_per_second": 16.619, "step": 23600 }, { "epoch": 316.0, "eval_loss": 1.7229197025299072, "eval_runtime": 0.5589, "eval_samples_per_second": 64.407, "eval_steps_per_second": 16.102, "step": 23700 }, { "epoch": 317.3333333333333, "eval_loss": 1.6771472692489624, "eval_runtime": 0.5686, "eval_samples_per_second": 63.313, "eval_steps_per_second": 15.828, "step": 23800 }, { "epoch": 318.6666666666667, "eval_loss": 1.6256533861160278, "eval_runtime": 0.5317, "eval_samples_per_second": 67.706, "eval_steps_per_second": 16.927, "step": 23900 }, { "epoch": 320.0, "grad_norm": 0.12644121050834656, "learning_rate": 3.4000000000000005e-06, "loss": 0.9926, "step": 24000 }, { "epoch": 320.0, "eval_loss": 1.9987858533859253, "eval_runtime": 0.7133, "eval_samples_per_second": 50.469, "eval_steps_per_second": 12.617, "step": 24000 }, { "epoch": 321.3333333333333, "eval_loss": 1.5779814720153809, "eval_runtime": 0.5829, "eval_samples_per_second": 61.765, "eval_steps_per_second": 15.441, "step": 24100 }, { "epoch": 322.6666666666667, "eval_loss": 1.6263625621795654, "eval_runtime": 0.542, "eval_samples_per_second": 66.424, "eval_steps_per_second": 16.606, "step": 24200 }, { "epoch": 324.0, "eval_loss": 1.8259631395339966, "eval_runtime": 0.5525, "eval_samples_per_second": 65.153, "eval_steps_per_second": 16.288, "step": 24300 }, { "epoch": 325.3333333333333, "eval_loss": 1.8610644340515137, "eval_runtime": 0.7041, "eval_samples_per_second": 51.126, "eval_steps_per_second": 12.782, "step": 24400 }, { "epoch": 326.6666666666667, "grad_norm": 0.0809217169880867, "learning_rate": 3.366666666666667e-06, "loss": 0.9907, "step": 24500 }, { "epoch": 326.6666666666667, "eval_loss": 1.8522862195968628, "eval_runtime": 0.5782, "eval_samples_per_second": 62.261, "eval_steps_per_second": 15.565, "step": 24500 }, { "epoch": 328.0, "eval_loss": 1.6931966543197632, "eval_runtime": 0.5755, "eval_samples_per_second": 62.552, "eval_steps_per_second": 15.638, "step": 24600 }, { "epoch": 329.3333333333333, "eval_loss": 1.7654081583023071, "eval_runtime": 0.5618, "eval_samples_per_second": 64.084, "eval_steps_per_second": 16.021, "step": 24700 }, { "epoch": 330.6666666666667, "eval_loss": 1.7334980964660645, "eval_runtime": 0.5597, "eval_samples_per_second": 64.318, "eval_steps_per_second": 16.08, "step": 24800 }, { "epoch": 332.0, "eval_loss": 1.7039364576339722, "eval_runtime": 0.5577, "eval_samples_per_second": 64.548, "eval_steps_per_second": 16.137, "step": 24900 }, { "epoch": 333.3333333333333, "grad_norm": 0.14974096417427063, "learning_rate": 3.3333333333333333e-06, "loss": 0.9724, "step": 25000 }, { "epoch": 333.3333333333333, "eval_loss": 1.661787748336792, "eval_runtime": 0.5621, "eval_samples_per_second": 64.04, "eval_steps_per_second": 16.01, "step": 25000 }, { "epoch": 334.6666666666667, "eval_loss": 1.96225106716156, "eval_runtime": 0.7494, "eval_samples_per_second": 48.038, "eval_steps_per_second": 12.01, "step": 25100 }, { "epoch": 336.0, "eval_loss": 1.936820149421692, "eval_runtime": 0.5708, "eval_samples_per_second": 63.072, "eval_steps_per_second": 15.768, "step": 25200 }, { "epoch": 337.3333333333333, "eval_loss": 1.7732367515563965, "eval_runtime": 0.5727, "eval_samples_per_second": 62.861, "eval_steps_per_second": 15.715, "step": 25300 }, { "epoch": 338.6666666666667, "eval_loss": 1.956594705581665, "eval_runtime": 0.5603, "eval_samples_per_second": 64.255, "eval_steps_per_second": 16.064, "step": 25400 }, { "epoch": 340.0, "grad_norm": 0.11350958794355392, "learning_rate": 3.3000000000000006e-06, "loss": 0.9798, "step": 25500 }, { "epoch": 340.0, "eval_loss": 1.8826667070388794, "eval_runtime": 0.5453, "eval_samples_per_second": 66.023, "eval_steps_per_second": 16.506, "step": 25500 }, { "epoch": 341.3333333333333, "eval_loss": 1.7523289918899536, "eval_runtime": 0.5736, "eval_samples_per_second": 62.761, "eval_steps_per_second": 15.69, "step": 25600 }, { "epoch": 342.6666666666667, "eval_loss": 1.7630534172058105, "eval_runtime": 0.5947, "eval_samples_per_second": 60.532, "eval_steps_per_second": 15.133, "step": 25700 }, { "epoch": 344.0, "eval_loss": 1.8616409301757812, "eval_runtime": 0.5692, "eval_samples_per_second": 63.251, "eval_steps_per_second": 15.813, "step": 25800 }, { "epoch": 345.3333333333333, "eval_loss": 1.6151959896087646, "eval_runtime": 0.5699, "eval_samples_per_second": 63.165, "eval_steps_per_second": 15.791, "step": 25900 }, { "epoch": 346.6666666666667, "grad_norm": 0.0736708790063858, "learning_rate": 3.266666666666667e-06, "loss": 1.0, "step": 26000 }, { "epoch": 346.6666666666667, "eval_loss": 1.711294174194336, "eval_runtime": 0.6966, "eval_samples_per_second": 51.678, "eval_steps_per_second": 12.92, "step": 26000 }, { "epoch": 348.0, "eval_loss": 1.7062689065933228, "eval_runtime": 0.7279, "eval_samples_per_second": 49.456, "eval_steps_per_second": 12.364, "step": 26100 }, { "epoch": 349.3333333333333, "eval_loss": 1.586897373199463, "eval_runtime": 0.5751, "eval_samples_per_second": 62.594, "eval_steps_per_second": 15.648, "step": 26200 }, { "epoch": 350.6666666666667, "eval_loss": 1.687790870666504, "eval_runtime": 0.7576, "eval_samples_per_second": 47.519, "eval_steps_per_second": 11.88, "step": 26300 }, { "epoch": 352.0, "eval_loss": 1.5914911031723022, "eval_runtime": 0.5702, "eval_samples_per_second": 63.138, "eval_steps_per_second": 15.785, "step": 26400 }, { "epoch": 353.3333333333333, "grad_norm": 0.1014912948012352, "learning_rate": 3.2333333333333334e-06, "loss": 0.9816, "step": 26500 }, { "epoch": 353.3333333333333, "eval_loss": 1.5613493919372559, "eval_runtime": 0.5667, "eval_samples_per_second": 63.52, "eval_steps_per_second": 15.88, "step": 26500 }, { "epoch": 354.6666666666667, "eval_loss": 1.7842038869857788, "eval_runtime": 0.5578, "eval_samples_per_second": 64.543, "eval_steps_per_second": 16.136, "step": 26600 }, { "epoch": 356.0, "eval_loss": 1.681370496749878, "eval_runtime": 0.7335, "eval_samples_per_second": 49.081, "eval_steps_per_second": 12.27, "step": 26700 }, { "epoch": 357.3333333333333, "eval_loss": 1.7293224334716797, "eval_runtime": 0.5473, "eval_samples_per_second": 65.783, "eval_steps_per_second": 16.446, "step": 26800 }, { "epoch": 358.6666666666667, "eval_loss": 1.888627529144287, "eval_runtime": 0.5641, "eval_samples_per_second": 63.821, "eval_steps_per_second": 15.955, "step": 26900 }, { "epoch": 360.0, "grad_norm": 0.008342635817825794, "learning_rate": 3.2000000000000003e-06, "loss": 0.9845, "step": 27000 }, { "epoch": 360.0, "eval_loss": 1.9464117288589478, "eval_runtime": 0.5603, "eval_samples_per_second": 64.253, "eval_steps_per_second": 16.063, "step": 27000 }, { "epoch": 361.3333333333333, "eval_loss": 1.7939698696136475, "eval_runtime": 0.583, "eval_samples_per_second": 61.745, "eval_steps_per_second": 15.436, "step": 27100 }, { "epoch": 362.6666666666667, "eval_loss": 1.7593424320220947, "eval_runtime": 0.5744, "eval_samples_per_second": 62.679, "eval_steps_per_second": 15.67, "step": 27200 }, { "epoch": 364.0, "eval_loss": 1.6186896562576294, "eval_runtime": 0.583, "eval_samples_per_second": 61.745, "eval_steps_per_second": 15.436, "step": 27300 }, { "epoch": 365.3333333333333, "eval_loss": 1.6947681903839111, "eval_runtime": 0.569, "eval_samples_per_second": 63.269, "eval_steps_per_second": 15.817, "step": 27400 }, { "epoch": 366.6666666666667, "grad_norm": 0.0887143537402153, "learning_rate": 3.1666666666666667e-06, "loss": 0.9705, "step": 27500 }, { "epoch": 366.6666666666667, "eval_loss": 1.7642453908920288, "eval_runtime": 0.5865, "eval_samples_per_second": 61.378, "eval_steps_per_second": 15.345, "step": 27500 }, { "epoch": 368.0, "eval_loss": 1.6307570934295654, "eval_runtime": 0.6295, "eval_samples_per_second": 57.191, "eval_steps_per_second": 14.298, "step": 27600 }, { "epoch": 369.3333333333333, "eval_loss": 1.5529420375823975, "eval_runtime": 0.5763, "eval_samples_per_second": 62.47, "eval_steps_per_second": 15.617, "step": 27700 }, { "epoch": 370.6666666666667, "eval_loss": 1.641685962677002, "eval_runtime": 0.6285, "eval_samples_per_second": 57.281, "eval_steps_per_second": 14.32, "step": 27800 }, { "epoch": 372.0, "eval_loss": 1.6802366971969604, "eval_runtime": 0.604, "eval_samples_per_second": 59.603, "eval_steps_per_second": 14.901, "step": 27900 }, { "epoch": 373.3333333333333, "grad_norm": 0.1064884141087532, "learning_rate": 3.133333333333334e-06, "loss": 0.975, "step": 28000 }, { "epoch": 373.3333333333333, "eval_loss": 1.78561270236969, "eval_runtime": 0.5924, "eval_samples_per_second": 60.773, "eval_steps_per_second": 15.193, "step": 28000 }, { "epoch": 374.6666666666667, "eval_loss": 1.5540636777877808, "eval_runtime": 0.5962, "eval_samples_per_second": 60.378, "eval_steps_per_second": 15.095, "step": 28100 }, { "epoch": 376.0, "eval_loss": 1.6259450912475586, "eval_runtime": 0.565, "eval_samples_per_second": 63.713, "eval_steps_per_second": 15.928, "step": 28200 }, { "epoch": 377.3333333333333, "eval_loss": 1.7255088090896606, "eval_runtime": 0.5895, "eval_samples_per_second": 61.068, "eval_steps_per_second": 15.267, "step": 28300 }, { "epoch": 378.6666666666667, "eval_loss": 1.78397798538208, "eval_runtime": 0.6285, "eval_samples_per_second": 57.283, "eval_steps_per_second": 14.321, "step": 28400 }, { "epoch": 380.0, "grad_norm": 0.08971001207828522, "learning_rate": 3.1000000000000004e-06, "loss": 0.99, "step": 28500 }, { "epoch": 380.0, "eval_loss": 1.6973234415054321, "eval_runtime": 0.5976, "eval_samples_per_second": 60.24, "eval_steps_per_second": 15.06, "step": 28500 }, { "epoch": 381.3333333333333, "eval_loss": 1.719337821006775, "eval_runtime": 0.5946, "eval_samples_per_second": 60.543, "eval_steps_per_second": 15.136, "step": 28600 }, { "epoch": 382.6666666666667, "eval_loss": 1.6016268730163574, "eval_runtime": 0.5927, "eval_samples_per_second": 60.74, "eval_steps_per_second": 15.185, "step": 28700 }, { "epoch": 384.0, "eval_loss": 1.8772557973861694, "eval_runtime": 0.5978, "eval_samples_per_second": 60.225, "eval_steps_per_second": 15.056, "step": 28800 }, { "epoch": 385.3333333333333, "eval_loss": 1.6752413511276245, "eval_runtime": 0.5941, "eval_samples_per_second": 60.595, "eval_steps_per_second": 15.149, "step": 28900 }, { "epoch": 386.6666666666667, "grad_norm": 0.14836561679840088, "learning_rate": 3.066666666666667e-06, "loss": 0.9784, "step": 29000 }, { "epoch": 386.6666666666667, "eval_loss": 1.6632412672042847, "eval_runtime": 0.5759, "eval_samples_per_second": 62.508, "eval_steps_per_second": 15.627, "step": 29000 }, { "epoch": 388.0, "eval_loss": 1.5473161935806274, "eval_runtime": 0.6027, "eval_samples_per_second": 59.727, "eval_steps_per_second": 14.932, "step": 29100 }, { "epoch": 389.3333333333333, "eval_loss": 1.5605320930480957, "eval_runtime": 0.6164, "eval_samples_per_second": 58.406, "eval_steps_per_second": 14.602, "step": 29200 }, { "epoch": 390.6666666666667, "eval_loss": 1.6579989194869995, "eval_runtime": 0.7778, "eval_samples_per_second": 46.283, "eval_steps_per_second": 11.571, "step": 29300 }, { "epoch": 392.0, "eval_loss": 1.775632381439209, "eval_runtime": 0.5912, "eval_samples_per_second": 60.894, "eval_steps_per_second": 15.223, "step": 29400 }, { "epoch": 393.3333333333333, "grad_norm": 1.308126449584961, "learning_rate": 3.0333333333333337e-06, "loss": 0.9945, "step": 29500 }, { "epoch": 393.3333333333333, "eval_loss": 1.609598994255066, "eval_runtime": 0.6101, "eval_samples_per_second": 59.011, "eval_steps_per_second": 14.753, "step": 29500 }, { "epoch": 394.6666666666667, "eval_loss": 1.7556321620941162, "eval_runtime": 0.62, "eval_samples_per_second": 58.065, "eval_steps_per_second": 14.516, "step": 29600 }, { "epoch": 396.0, "eval_loss": 2.056849718093872, "eval_runtime": 0.6441, "eval_samples_per_second": 55.891, "eval_steps_per_second": 13.973, "step": 29700 }, { "epoch": 397.3333333333333, "eval_loss": 1.6420453786849976, "eval_runtime": 0.7613, "eval_samples_per_second": 47.29, "eval_steps_per_second": 11.822, "step": 29800 }, { "epoch": 398.6666666666667, "eval_loss": 1.7073485851287842, "eval_runtime": 0.601, "eval_samples_per_second": 59.9, "eval_steps_per_second": 14.975, "step": 29900 }, { "epoch": 400.0, "grad_norm": 0.07207546383142471, "learning_rate": 3e-06, "loss": 0.9859, "step": 30000 }, { "epoch": 400.0, "eval_loss": 1.598061442375183, "eval_runtime": 0.6175, "eval_samples_per_second": 58.299, "eval_steps_per_second": 14.575, "step": 30000 }, { "epoch": 401.3333333333333, "eval_loss": 1.5672425031661987, "eval_runtime": 0.7347, "eval_samples_per_second": 49.0, "eval_steps_per_second": 12.25, "step": 30100 }, { "epoch": 402.6666666666667, "eval_loss": 1.5389965772628784, "eval_runtime": 0.5765, "eval_samples_per_second": 62.441, "eval_steps_per_second": 15.61, "step": 30200 }, { "epoch": 404.0, "eval_loss": 1.5045199394226074, "eval_runtime": 0.5878, "eval_samples_per_second": 61.247, "eval_steps_per_second": 15.312, "step": 30300 }, { "epoch": 405.3333333333333, "eval_loss": 1.63267183303833, "eval_runtime": 0.6042, "eval_samples_per_second": 59.583, "eval_steps_per_second": 14.896, "step": 30400 }, { "epoch": 406.6666666666667, "grad_norm": 0.13364674150943756, "learning_rate": 2.9666666666666673e-06, "loss": 0.987, "step": 30500 }, { "epoch": 406.6666666666667, "eval_loss": 1.4833016395568848, "eval_runtime": 0.6301, "eval_samples_per_second": 57.13, "eval_steps_per_second": 14.283, "step": 30500 }, { "epoch": 408.0, "eval_loss": 1.7449281215667725, "eval_runtime": 0.5681, "eval_samples_per_second": 63.371, "eval_steps_per_second": 15.843, "step": 30600 }, { "epoch": 409.3333333333333, "eval_loss": 1.7675163745880127, "eval_runtime": 0.6316, "eval_samples_per_second": 56.997, "eval_steps_per_second": 14.249, "step": 30700 }, { "epoch": 410.6666666666667, "eval_loss": 1.7203176021575928, "eval_runtime": 0.6245, "eval_samples_per_second": 57.645, "eval_steps_per_second": 14.411, "step": 30800 }, { "epoch": 412.0, "eval_loss": 1.6196057796478271, "eval_runtime": 0.5741, "eval_samples_per_second": 62.702, "eval_steps_per_second": 15.675, "step": 30900 }, { "epoch": 413.3333333333333, "grad_norm": 0.09078539907932281, "learning_rate": 2.9333333333333338e-06, "loss": 0.9725, "step": 31000 }, { "epoch": 413.3333333333333, "eval_loss": 1.6556806564331055, "eval_runtime": 0.6074, "eval_samples_per_second": 59.268, "eval_steps_per_second": 14.817, "step": 31000 }, { "epoch": 414.6666666666667, "eval_loss": 1.77994966506958, "eval_runtime": 0.7217, "eval_samples_per_second": 49.884, "eval_steps_per_second": 12.471, "step": 31100 }, { "epoch": 416.0, "eval_loss": 1.9889085292816162, "eval_runtime": 0.5512, "eval_samples_per_second": 65.317, "eval_steps_per_second": 16.329, "step": 31200 }, { "epoch": 417.3333333333333, "eval_loss": 1.7929357290267944, "eval_runtime": 0.5718, "eval_samples_per_second": 62.957, "eval_steps_per_second": 15.739, "step": 31300 }, { "epoch": 418.6666666666667, "eval_loss": 1.7601008415222168, "eval_runtime": 0.5454, "eval_samples_per_second": 66.012, "eval_steps_per_second": 16.503, "step": 31400 }, { "epoch": 420.0, "grad_norm": 0.10652674734592438, "learning_rate": 2.9e-06, "loss": 0.9783, "step": 31500 }, { "epoch": 420.0, "eval_loss": 1.7172222137451172, "eval_runtime": 0.5324, "eval_samples_per_second": 67.616, "eval_steps_per_second": 16.904, "step": 31500 }, { "epoch": 421.3333333333333, "eval_loss": 1.875618815422058, "eval_runtime": 0.5734, "eval_samples_per_second": 62.781, "eval_steps_per_second": 15.695, "step": 31600 }, { "epoch": 422.6666666666667, "eval_loss": 1.774788498878479, "eval_runtime": 0.5939, "eval_samples_per_second": 60.616, "eval_steps_per_second": 15.154, "step": 31700 }, { "epoch": 424.0, "eval_loss": 1.7908679246902466, "eval_runtime": 0.5675, "eval_samples_per_second": 63.431, "eval_steps_per_second": 15.858, "step": 31800 }, { "epoch": 425.3333333333333, "eval_loss": 1.8519622087478638, "eval_runtime": 0.5622, "eval_samples_per_second": 64.035, "eval_steps_per_second": 16.009, "step": 31900 }, { "epoch": 426.6666666666667, "grad_norm": 0.08114868402481079, "learning_rate": 2.866666666666667e-06, "loss": 0.9849, "step": 32000 }, { "epoch": 426.6666666666667, "eval_loss": 1.4917383193969727, "eval_runtime": 0.5683, "eval_samples_per_second": 63.35, "eval_steps_per_second": 15.838, "step": 32000 }, { "epoch": 428.0, "eval_loss": 1.5645169019699097, "eval_runtime": 0.5611, "eval_samples_per_second": 64.158, "eval_steps_per_second": 16.039, "step": 32100 }, { "epoch": 429.3333333333333, "eval_loss": 1.5803431272506714, "eval_runtime": 0.5558, "eval_samples_per_second": 64.772, "eval_steps_per_second": 16.193, "step": 32200 }, { "epoch": 430.6666666666667, "eval_loss": 1.6522114276885986, "eval_runtime": 0.5646, "eval_samples_per_second": 63.759, "eval_steps_per_second": 15.94, "step": 32300 }, { "epoch": 432.0, "eval_loss": 1.9999171495437622, "eval_runtime": 0.5723, "eval_samples_per_second": 62.905, "eval_steps_per_second": 15.726, "step": 32400 }, { "epoch": 433.3333333333333, "grad_norm": 0.15275415778160095, "learning_rate": 2.8333333333333335e-06, "loss": 0.9772, "step": 32500 }, { "epoch": 433.3333333333333, "eval_loss": 1.6718401908874512, "eval_runtime": 0.5687, "eval_samples_per_second": 63.299, "eval_steps_per_second": 15.825, "step": 32500 }, { "epoch": 434.6666666666667, "eval_loss": 1.6779158115386963, "eval_runtime": 0.5589, "eval_samples_per_second": 64.409, "eval_steps_per_second": 16.102, "step": 32600 }, { "epoch": 436.0, "eval_loss": 1.5872241258621216, "eval_runtime": 0.5603, "eval_samples_per_second": 64.253, "eval_steps_per_second": 16.063, "step": 32700 }, { "epoch": 437.3333333333333, "eval_loss": 1.7565385103225708, "eval_runtime": 0.5405, "eval_samples_per_second": 66.606, "eval_steps_per_second": 16.652, "step": 32800 }, { "epoch": 438.6666666666667, "eval_loss": 1.7727419137954712, "eval_runtime": 0.5678, "eval_samples_per_second": 63.403, "eval_steps_per_second": 15.851, "step": 32900 }, { "epoch": 440.0, "grad_norm": 0.07925722002983093, "learning_rate": 2.8000000000000003e-06, "loss": 0.9756, "step": 33000 }, { "epoch": 440.0, "eval_loss": 1.7909719944000244, "eval_runtime": 0.5463, "eval_samples_per_second": 65.903, "eval_steps_per_second": 16.476, "step": 33000 }, { "epoch": 441.3333333333333, "eval_loss": 1.7867096662521362, "eval_runtime": 0.5502, "eval_samples_per_second": 65.428, "eval_steps_per_second": 16.357, "step": 33100 }, { "epoch": 442.6666666666667, "eval_loss": 1.7775189876556396, "eval_runtime": 0.556, "eval_samples_per_second": 64.748, "eval_steps_per_second": 16.187, "step": 33200 }, { "epoch": 444.0, "eval_loss": 1.797627329826355, "eval_runtime": 0.5434, "eval_samples_per_second": 66.245, "eval_steps_per_second": 16.561, "step": 33300 }, { "epoch": 445.3333333333333, "eval_loss": 1.5831018686294556, "eval_runtime": 0.7145, "eval_samples_per_second": 50.388, "eval_steps_per_second": 12.597, "step": 33400 }, { "epoch": 446.6666666666667, "grad_norm": 0.053291406482458115, "learning_rate": 2.766666666666667e-06, "loss": 0.9806, "step": 33500 }, { "epoch": 446.6666666666667, "eval_loss": 1.7222111225128174, "eval_runtime": 0.5475, "eval_samples_per_second": 65.75, "eval_steps_per_second": 16.438, "step": 33500 }, { "epoch": 448.0, "eval_loss": 1.7587040662765503, "eval_runtime": 0.7128, "eval_samples_per_second": 50.504, "eval_steps_per_second": 12.626, "step": 33600 }, { "epoch": 449.3333333333333, "eval_loss": 1.771296501159668, "eval_runtime": 0.5572, "eval_samples_per_second": 64.613, "eval_steps_per_second": 16.153, "step": 33700 }, { "epoch": 450.6666666666667, "eval_loss": 1.7805464267730713, "eval_runtime": 0.5416, "eval_samples_per_second": 66.472, "eval_steps_per_second": 16.618, "step": 33800 }, { "epoch": 452.0, "eval_loss": 1.782245397567749, "eval_runtime": 0.5575, "eval_samples_per_second": 64.569, "eval_steps_per_second": 16.142, "step": 33900 }, { "epoch": 453.3333333333333, "grad_norm": 0.14553554356098175, "learning_rate": 2.7333333333333336e-06, "loss": 0.9821, "step": 34000 }, { "epoch": 453.3333333333333, "eval_loss": 1.7681078910827637, "eval_runtime": 0.5653, "eval_samples_per_second": 63.687, "eval_steps_per_second": 15.922, "step": 34000 }, { "epoch": 454.6666666666667, "eval_loss": 1.8011735677719116, "eval_runtime": 0.5369, "eval_samples_per_second": 67.05, "eval_steps_per_second": 16.762, "step": 34100 }, { "epoch": 456.0, "eval_loss": 1.78472900390625, "eval_runtime": 0.5623, "eval_samples_per_second": 64.017, "eval_steps_per_second": 16.004, "step": 34200 }, { "epoch": 457.3333333333333, "eval_loss": 1.778428554534912, "eval_runtime": 0.563, "eval_samples_per_second": 63.944, "eval_steps_per_second": 15.986, "step": 34300 }, { "epoch": 458.6666666666667, "eval_loss": 1.734017014503479, "eval_runtime": 0.5603, "eval_samples_per_second": 64.247, "eval_steps_per_second": 16.062, "step": 34400 }, { "epoch": 460.0, "grad_norm": 0.0946693867444992, "learning_rate": 2.7000000000000004e-06, "loss": 0.9786, "step": 34500 }, { "epoch": 460.0, "eval_loss": 1.7655307054519653, "eval_runtime": 0.5437, "eval_samples_per_second": 66.218, "eval_steps_per_second": 16.554, "step": 34500 }, { "epoch": 461.3333333333333, "eval_loss": 1.7260526418685913, "eval_runtime": 0.5568, "eval_samples_per_second": 64.655, "eval_steps_per_second": 16.164, "step": 34600 }, { "epoch": 462.6666666666667, "eval_loss": 1.7453144788742065, "eval_runtime": 0.5474, "eval_samples_per_second": 65.761, "eval_steps_per_second": 16.44, "step": 34700 }, { "epoch": 464.0, "eval_loss": 1.6568406820297241, "eval_runtime": 0.7661, "eval_samples_per_second": 46.988, "eval_steps_per_second": 11.747, "step": 34800 }, { "epoch": 465.3333333333333, "eval_loss": 1.6578307151794434, "eval_runtime": 0.7742, "eval_samples_per_second": 46.5, "eval_steps_per_second": 11.625, "step": 34900 }, { "epoch": 466.6666666666667, "grad_norm": 0.0641135424375534, "learning_rate": 2.666666666666667e-06, "loss": 0.972, "step": 35000 }, { "epoch": 466.6666666666667, "eval_loss": 1.6701782941818237, "eval_runtime": 0.5662, "eval_samples_per_second": 63.583, "eval_steps_per_second": 15.896, "step": 35000 }, { "epoch": 468.0, "eval_loss": 1.675178050994873, "eval_runtime": 0.5493, "eval_samples_per_second": 65.538, "eval_steps_per_second": 16.385, "step": 35100 }, { "epoch": 469.3333333333333, "eval_loss": 1.9994406700134277, "eval_runtime": 0.5546, "eval_samples_per_second": 64.909, "eval_steps_per_second": 16.227, "step": 35200 }, { "epoch": 470.6666666666667, "eval_loss": 1.6725106239318848, "eval_runtime": 0.5887, "eval_samples_per_second": 61.151, "eval_steps_per_second": 15.288, "step": 35300 }, { "epoch": 472.0, "eval_loss": 1.662109375, "eval_runtime": 0.55, "eval_samples_per_second": 65.459, "eval_steps_per_second": 16.365, "step": 35400 }, { "epoch": 473.3333333333333, "grad_norm": 0.04419280216097832, "learning_rate": 2.6333333333333332e-06, "loss": 0.9736, "step": 35500 }, { "epoch": 473.3333333333333, "eval_loss": 1.703607201576233, "eval_runtime": 0.5546, "eval_samples_per_second": 64.914, "eval_steps_per_second": 16.229, "step": 35500 }, { "epoch": 474.6666666666667, "eval_loss": 1.6911473274230957, "eval_runtime": 0.5445, "eval_samples_per_second": 66.114, "eval_steps_per_second": 16.528, "step": 35600 }, { "epoch": 476.0, "eval_loss": 1.6890218257904053, "eval_runtime": 0.5522, "eval_samples_per_second": 65.194, "eval_steps_per_second": 16.298, "step": 35700 }, { "epoch": 477.3333333333333, "eval_loss": 1.730622410774231, "eval_runtime": 0.5611, "eval_samples_per_second": 64.157, "eval_steps_per_second": 16.039, "step": 35800 }, { "epoch": 478.6666666666667, "eval_loss": 1.728278636932373, "eval_runtime": 0.5489, "eval_samples_per_second": 65.585, "eval_steps_per_second": 16.396, "step": 35900 }, { "epoch": 480.0, "grad_norm": 0.077215276658535, "learning_rate": 2.6e-06, "loss": 0.9742, "step": 36000 }, { "epoch": 480.0, "eval_loss": 1.4973845481872559, "eval_runtime": 0.7454, "eval_samples_per_second": 48.298, "eval_steps_per_second": 12.074, "step": 36000 }, { "epoch": 481.3333333333333, "eval_loss": 1.7059286832809448, "eval_runtime": 0.7505, "eval_samples_per_second": 47.969, "eval_steps_per_second": 11.992, "step": 36100 }, { "epoch": 482.6666666666667, "eval_loss": 1.7060019969940186, "eval_runtime": 0.6182, "eval_samples_per_second": 58.229, "eval_steps_per_second": 14.557, "step": 36200 }, { "epoch": 484.0, "eval_loss": 1.7079213857650757, "eval_runtime": 0.5704, "eval_samples_per_second": 63.118, "eval_steps_per_second": 15.779, "step": 36300 }, { "epoch": 485.3333333333333, "eval_loss": 1.7405229806900024, "eval_runtime": 0.5624, "eval_samples_per_second": 64.017, "eval_steps_per_second": 16.004, "step": 36400 }, { "epoch": 486.6666666666667, "grad_norm": 0.03689847141504288, "learning_rate": 2.566666666666667e-06, "loss": 0.9902, "step": 36500 }, { "epoch": 486.6666666666667, "eval_loss": 1.7769001722335815, "eval_runtime": 0.7521, "eval_samples_per_second": 47.869, "eval_steps_per_second": 11.967, "step": 36500 }, { "epoch": 488.0, "eval_loss": 1.651777744293213, "eval_runtime": 0.5716, "eval_samples_per_second": 62.981, "eval_steps_per_second": 15.745, "step": 36600 }, { "epoch": 489.3333333333333, "eval_loss": 1.766890048980713, "eval_runtime": 0.5397, "eval_samples_per_second": 66.702, "eval_steps_per_second": 16.675, "step": 36700 }, { "epoch": 490.6666666666667, "eval_loss": 1.8105168342590332, "eval_runtime": 0.5502, "eval_samples_per_second": 65.426, "eval_steps_per_second": 16.357, "step": 36800 }, { "epoch": 492.0, "eval_loss": 1.6839150190353394, "eval_runtime": 0.5472, "eval_samples_per_second": 65.788, "eval_steps_per_second": 16.447, "step": 36900 }, { "epoch": 493.3333333333333, "grad_norm": 0.11671959608793259, "learning_rate": 2.5333333333333338e-06, "loss": 0.9767, "step": 37000 }, { "epoch": 493.3333333333333, "eval_loss": 1.8279399871826172, "eval_runtime": 0.5748, "eval_samples_per_second": 62.634, "eval_steps_per_second": 15.658, "step": 37000 }, { "epoch": 494.6666666666667, "eval_loss": 1.7738555669784546, "eval_runtime": 0.5591, "eval_samples_per_second": 64.386, "eval_steps_per_second": 16.096, "step": 37100 }, { "epoch": 496.0, "eval_loss": 1.6554198265075684, "eval_runtime": 0.6198, "eval_samples_per_second": 58.085, "eval_steps_per_second": 14.521, "step": 37200 }, { "epoch": 497.3333333333333, "eval_loss": 1.6143044233322144, "eval_runtime": 0.553, "eval_samples_per_second": 65.105, "eval_steps_per_second": 16.276, "step": 37300 }, { "epoch": 498.6666666666667, "eval_loss": 1.9613202810287476, "eval_runtime": 0.5513, "eval_samples_per_second": 65.3, "eval_steps_per_second": 16.325, "step": 37400 }, { "epoch": 500.0, "grad_norm": 0.053611986339092255, "learning_rate": 2.5e-06, "loss": 0.9845, "step": 37500 }, { "epoch": 500.0, "eval_loss": 1.7611021995544434, "eval_runtime": 0.5501, "eval_samples_per_second": 65.443, "eval_steps_per_second": 16.361, "step": 37500 }, { "epoch": 501.3333333333333, "eval_loss": 1.7701151371002197, "eval_runtime": 0.5555, "eval_samples_per_second": 64.805, "eval_steps_per_second": 16.201, "step": 37600 }, { "epoch": 502.6666666666667, "eval_loss": 1.8047661781311035, "eval_runtime": 0.5305, "eval_samples_per_second": 67.857, "eval_steps_per_second": 16.964, "step": 37700 }, { "epoch": 504.0, "eval_loss": 1.7464929819107056, "eval_runtime": 0.5556, "eval_samples_per_second": 64.79, "eval_steps_per_second": 16.198, "step": 37800 }, { "epoch": 505.3333333333333, "eval_loss": 2.023526906967163, "eval_runtime": 0.5456, "eval_samples_per_second": 65.983, "eval_steps_per_second": 16.496, "step": 37900 }, { "epoch": 506.6666666666667, "grad_norm": 0.07890354096889496, "learning_rate": 2.466666666666667e-06, "loss": 0.9934, "step": 38000 }, { "epoch": 506.6666666666667, "eval_loss": 1.6972378492355347, "eval_runtime": 0.591, "eval_samples_per_second": 60.914, "eval_steps_per_second": 15.228, "step": 38000 }, { "epoch": 508.0, "eval_loss": 1.7221157550811768, "eval_runtime": 0.5622, "eval_samples_per_second": 64.037, "eval_steps_per_second": 16.009, "step": 38100 }, { "epoch": 509.3333333333333, "eval_loss": 1.690049409866333, "eval_runtime": 0.5475, "eval_samples_per_second": 65.748, "eval_steps_per_second": 16.437, "step": 38200 }, { "epoch": 510.6666666666667, "eval_loss": 1.69050133228302, "eval_runtime": 0.5518, "eval_samples_per_second": 65.239, "eval_steps_per_second": 16.31, "step": 38300 }, { "epoch": 512.0, "eval_loss": 1.7761505842208862, "eval_runtime": 0.521, "eval_samples_per_second": 69.102, "eval_steps_per_second": 17.276, "step": 38400 }, { "epoch": 513.3333333333334, "grad_norm": 0.04196887090802193, "learning_rate": 2.4333333333333335e-06, "loss": 0.9837, "step": 38500 }, { "epoch": 513.3333333333334, "eval_loss": 1.738215446472168, "eval_runtime": 0.5443, "eval_samples_per_second": 66.142, "eval_steps_per_second": 16.536, "step": 38500 }, { "epoch": 514.6666666666666, "eval_loss": 1.7434948682785034, "eval_runtime": 0.5433, "eval_samples_per_second": 66.258, "eval_steps_per_second": 16.564, "step": 38600 }, { "epoch": 516.0, "eval_loss": 1.73280668258667, "eval_runtime": 0.5668, "eval_samples_per_second": 63.509, "eval_steps_per_second": 15.877, "step": 38700 }, { "epoch": 517.3333333333334, "eval_loss": 1.75234854221344, "eval_runtime": 0.555, "eval_samples_per_second": 64.863, "eval_steps_per_second": 16.216, "step": 38800 }, { "epoch": 518.6666666666666, "eval_loss": 1.774735927581787, "eval_runtime": 0.5421, "eval_samples_per_second": 66.41, "eval_steps_per_second": 16.602, "step": 38900 }, { "epoch": 520.0, "grad_norm": 0.004215251188725233, "learning_rate": 2.4000000000000003e-06, "loss": 0.9842, "step": 39000 }, { "epoch": 520.0, "eval_loss": 1.8171730041503906, "eval_runtime": 0.5582, "eval_samples_per_second": 64.492, "eval_steps_per_second": 16.123, "step": 39000 }, { "epoch": 521.3333333333334, "eval_loss": 1.7907897233963013, "eval_runtime": 0.5509, "eval_samples_per_second": 65.35, "eval_steps_per_second": 16.337, "step": 39100 }, { "epoch": 522.6666666666666, "eval_loss": 1.7846055030822754, "eval_runtime": 0.5302, "eval_samples_per_second": 67.893, "eval_steps_per_second": 16.973, "step": 39200 }, { "epoch": 524.0, "eval_loss": 1.7872586250305176, "eval_runtime": 0.5429, "eval_samples_per_second": 66.315, "eval_steps_per_second": 16.579, "step": 39300 }, { "epoch": 525.3333333333334, "eval_loss": 1.7076282501220703, "eval_runtime": 0.5692, "eval_samples_per_second": 63.25, "eval_steps_per_second": 15.813, "step": 39400 }, { "epoch": 526.6666666666666, "grad_norm": 0.04876990243792534, "learning_rate": 2.3666666666666667e-06, "loss": 0.9665, "step": 39500 }, { "epoch": 526.6666666666666, "eval_loss": 1.6819778680801392, "eval_runtime": 0.561, "eval_samples_per_second": 64.168, "eval_steps_per_second": 16.042, "step": 39500 }, { "epoch": 528.0, "eval_loss": 1.7303131818771362, "eval_runtime": 0.5636, "eval_samples_per_second": 63.873, "eval_steps_per_second": 15.968, "step": 39600 }, { "epoch": 529.3333333333334, "eval_loss": 1.7035993337631226, "eval_runtime": 0.5549, "eval_samples_per_second": 64.88, "eval_steps_per_second": 16.22, "step": 39700 }, { "epoch": 530.6666666666666, "eval_loss": 1.7115942239761353, "eval_runtime": 0.5492, "eval_samples_per_second": 65.554, "eval_steps_per_second": 16.388, "step": 39800 }, { "epoch": 532.0, "eval_loss": 1.7333368062973022, "eval_runtime": 0.5221, "eval_samples_per_second": 68.949, "eval_steps_per_second": 17.237, "step": 39900 }, { "epoch": 533.3333333333334, "grad_norm": 0.07899472862482071, "learning_rate": 2.3333333333333336e-06, "loss": 0.9951, "step": 40000 }, { "epoch": 533.3333333333334, "eval_loss": 1.7402613162994385, "eval_runtime": 0.5595, "eval_samples_per_second": 64.346, "eval_steps_per_second": 16.086, "step": 40000 }, { "epoch": 534.6666666666666, "eval_loss": 1.6699178218841553, "eval_runtime": 0.5461, "eval_samples_per_second": 65.918, "eval_steps_per_second": 16.48, "step": 40100 }, { "epoch": 536.0, "eval_loss": 1.6886802911758423, "eval_runtime": 0.5401, "eval_samples_per_second": 66.66, "eval_steps_per_second": 16.665, "step": 40200 }, { "epoch": 537.3333333333334, "eval_loss": 1.686381220817566, "eval_runtime": 0.5558, "eval_samples_per_second": 64.775, "eval_steps_per_second": 16.194, "step": 40300 }, { "epoch": 538.6666666666666, "eval_loss": 1.6653372049331665, "eval_runtime": 0.5679, "eval_samples_per_second": 63.391, "eval_steps_per_second": 15.848, "step": 40400 }, { "epoch": 540.0, "grad_norm": 0.10551416128873825, "learning_rate": 2.3000000000000004e-06, "loss": 0.9753, "step": 40500 }, { "epoch": 540.0, "eval_loss": 1.6738240718841553, "eval_runtime": 0.5453, "eval_samples_per_second": 66.023, "eval_steps_per_second": 16.506, "step": 40500 }, { "epoch": 541.3333333333334, "eval_loss": 1.6468851566314697, "eval_runtime": 0.5848, "eval_samples_per_second": 61.558, "eval_steps_per_second": 15.39, "step": 40600 }, { "epoch": 542.6666666666666, "eval_loss": 1.8661946058273315, "eval_runtime": 0.5667, "eval_samples_per_second": 63.529, "eval_steps_per_second": 15.882, "step": 40700 }, { "epoch": 544.0, "eval_loss": 2.1967360973358154, "eval_runtime": 0.5484, "eval_samples_per_second": 65.643, "eval_steps_per_second": 16.411, "step": 40800 }, { "epoch": 545.3333333333334, "eval_loss": 1.7642713785171509, "eval_runtime": 0.5703, "eval_samples_per_second": 63.128, "eval_steps_per_second": 15.782, "step": 40900 }, { "epoch": 546.6666666666666, "grad_norm": 3.1107699871063232, "learning_rate": 2.266666666666667e-06, "loss": 0.9893, "step": 41000 }, { "epoch": 546.6666666666666, "eval_loss": 1.7935450077056885, "eval_runtime": 0.5464, "eval_samples_per_second": 65.884, "eval_steps_per_second": 16.471, "step": 41000 }, { "epoch": 548.0, "eval_loss": 1.8633232116699219, "eval_runtime": 0.5461, "eval_samples_per_second": 65.917, "eval_steps_per_second": 16.479, "step": 41100 }, { "epoch": 549.3333333333334, "eval_loss": 1.8705569505691528, "eval_runtime": 0.5584, "eval_samples_per_second": 64.475, "eval_steps_per_second": 16.119, "step": 41200 }, { "epoch": 550.6666666666666, "eval_loss": 1.8711670637130737, "eval_runtime": 0.5532, "eval_samples_per_second": 65.08, "eval_steps_per_second": 16.27, "step": 41300 }, { "epoch": 552.0, "eval_loss": 1.8823400735855103, "eval_runtime": 0.5308, "eval_samples_per_second": 67.821, "eval_steps_per_second": 16.955, "step": 41400 }, { "epoch": 553.3333333333334, "grad_norm": 0.0894642025232315, "learning_rate": 2.2333333333333333e-06, "loss": 0.9922, "step": 41500 }, { "epoch": 553.3333333333334, "eval_loss": 1.866148829460144, "eval_runtime": 0.5421, "eval_samples_per_second": 66.407, "eval_steps_per_second": 16.602, "step": 41500 }, { "epoch": 554.6666666666666, "eval_loss": 1.8561292886734009, "eval_runtime": 0.5468, "eval_samples_per_second": 65.843, "eval_steps_per_second": 16.461, "step": 41600 }, { "epoch": 556.0, "eval_loss": 1.8343923091888428, "eval_runtime": 0.5439, "eval_samples_per_second": 66.191, "eval_steps_per_second": 16.548, "step": 41700 }, { "epoch": 557.3333333333334, "eval_loss": 1.8317135572433472, "eval_runtime": 0.5606, "eval_samples_per_second": 64.218, "eval_steps_per_second": 16.055, "step": 41800 }, { "epoch": 558.6666666666666, "eval_loss": 1.805317759513855, "eval_runtime": 0.5452, "eval_samples_per_second": 66.033, "eval_steps_per_second": 16.508, "step": 41900 }, { "epoch": 560.0, "grad_norm": 0.07705187052488327, "learning_rate": 2.2e-06, "loss": 0.969, "step": 42000 }, { "epoch": 560.0, "eval_loss": 1.748168706893921, "eval_runtime": 0.5517, "eval_samples_per_second": 65.249, "eval_steps_per_second": 16.312, "step": 42000 }, { "epoch": 561.3333333333334, "eval_loss": 1.767458438873291, "eval_runtime": 0.5576, "eval_samples_per_second": 64.565, "eval_steps_per_second": 16.141, "step": 42100 }, { "epoch": 562.6666666666666, "eval_loss": 1.7875497341156006, "eval_runtime": 0.5294, "eval_samples_per_second": 68.0, "eval_steps_per_second": 17.0, "step": 42200 }, { "epoch": 564.0, "eval_loss": 1.8815007209777832, "eval_runtime": 0.5386, "eval_samples_per_second": 66.838, "eval_steps_per_second": 16.709, "step": 42300 }, { "epoch": 565.3333333333334, "eval_loss": 1.795351266860962, "eval_runtime": 0.5679, "eval_samples_per_second": 63.387, "eval_steps_per_second": 15.847, "step": 42400 }, { "epoch": 566.6666666666666, "grad_norm": 0.08833177387714386, "learning_rate": 2.166666666666667e-06, "loss": 0.9774, "step": 42500 }, { "epoch": 566.6666666666666, "eval_loss": 1.8543084859848022, "eval_runtime": 0.5525, "eval_samples_per_second": 65.153, "eval_steps_per_second": 16.288, "step": 42500 }, { "epoch": 568.0, "eval_loss": 1.8677423000335693, "eval_runtime": 0.533, "eval_samples_per_second": 67.543, "eval_steps_per_second": 16.886, "step": 42600 }, { "epoch": 569.3333333333334, "eval_loss": 1.8582379817962646, "eval_runtime": 0.5678, "eval_samples_per_second": 63.407, "eval_steps_per_second": 15.852, "step": 42700 }, { "epoch": 570.6666666666666, "eval_loss": 1.7221639156341553, "eval_runtime": 0.5323, "eval_samples_per_second": 67.63, "eval_steps_per_second": 16.908, "step": 42800 }, { "epoch": 572.0, "eval_loss": 1.8003919124603271, "eval_runtime": 0.5599, "eval_samples_per_second": 64.301, "eval_steps_per_second": 16.075, "step": 42900 }, { "epoch": 573.3333333333334, "grad_norm": 0.2233036607503891, "learning_rate": 2.133333333333334e-06, "loss": 0.9801, "step": 43000 }, { "epoch": 573.3333333333334, "eval_loss": 1.7879968881607056, "eval_runtime": 0.5335, "eval_samples_per_second": 67.48, "eval_steps_per_second": 16.87, "step": 43000 }, { "epoch": 574.6666666666666, "eval_loss": 1.88143789768219, "eval_runtime": 0.5428, "eval_samples_per_second": 66.318, "eval_steps_per_second": 16.58, "step": 43100 }, { "epoch": 576.0, "eval_loss": 1.8420606851577759, "eval_runtime": 0.5379, "eval_samples_per_second": 66.925, "eval_steps_per_second": 16.731, "step": 43200 }, { "epoch": 577.3333333333334, "eval_loss": 1.8551725149154663, "eval_runtime": 0.5496, "eval_samples_per_second": 65.506, "eval_steps_per_second": 16.376, "step": 43300 }, { "epoch": 578.6666666666666, "eval_loss": 2.1513121128082275, "eval_runtime": 0.5206, "eval_samples_per_second": 69.148, "eval_steps_per_second": 17.287, "step": 43400 }, { "epoch": 580.0, "grad_norm": 0.1499890685081482, "learning_rate": 2.1000000000000002e-06, "loss": 0.9812, "step": 43500 }, { "epoch": 580.0, "eval_loss": 2.1082208156585693, "eval_runtime": 0.5712, "eval_samples_per_second": 63.02, "eval_steps_per_second": 15.755, "step": 43500 }, { "epoch": 581.3333333333334, "eval_loss": 2.1574254035949707, "eval_runtime": 0.5705, "eval_samples_per_second": 63.101, "eval_steps_per_second": 15.775, "step": 43600 }, { "epoch": 582.6666666666666, "eval_loss": 2.1666057109832764, "eval_runtime": 0.5199, "eval_samples_per_second": 69.243, "eval_steps_per_second": 17.311, "step": 43700 }, { "epoch": 584.0, "eval_loss": 1.9116859436035156, "eval_runtime": 0.5547, "eval_samples_per_second": 64.9, "eval_steps_per_second": 16.225, "step": 43800 }, { "epoch": 585.3333333333334, "eval_loss": 1.837393045425415, "eval_runtime": 0.5498, "eval_samples_per_second": 65.476, "eval_steps_per_second": 16.369, "step": 43900 }, { "epoch": 586.6666666666666, "grad_norm": 0.05849548429250717, "learning_rate": 2.0666666666666666e-06, "loss": 0.979, "step": 44000 }, { "epoch": 586.6666666666666, "eval_loss": 2.0417330265045166, "eval_runtime": 0.5437, "eval_samples_per_second": 66.207, "eval_steps_per_second": 16.552, "step": 44000 }, { "epoch": 588.0, "eval_loss": 2.031531810760498, "eval_runtime": 0.5452, "eval_samples_per_second": 66.031, "eval_steps_per_second": 16.508, "step": 44100 }, { "epoch": 589.3333333333334, "eval_loss": 1.9730753898620605, "eval_runtime": 0.5495, "eval_samples_per_second": 65.515, "eval_steps_per_second": 16.379, "step": 44200 }, { "epoch": 590.6666666666666, "eval_loss": 1.857057809829712, "eval_runtime": 0.5681, "eval_samples_per_second": 63.364, "eval_steps_per_second": 15.841, "step": 44300 }, { "epoch": 592.0, "eval_loss": 2.0701072216033936, "eval_runtime": 0.544, "eval_samples_per_second": 66.179, "eval_steps_per_second": 16.545, "step": 44400 }, { "epoch": 593.3333333333334, "grad_norm": 0.08897858113050461, "learning_rate": 2.0333333333333335e-06, "loss": 0.9794, "step": 44500 }, { "epoch": 593.3333333333334, "eval_loss": 2.0198774337768555, "eval_runtime": 0.5825, "eval_samples_per_second": 61.804, "eval_steps_per_second": 15.451, "step": 44500 }, { "epoch": 594.6666666666666, "eval_loss": 2.0371484756469727, "eval_runtime": 0.5618, "eval_samples_per_second": 64.076, "eval_steps_per_second": 16.019, "step": 44600 }, { "epoch": 596.0, "eval_loss": 2.072148561477661, "eval_runtime": 0.5241, "eval_samples_per_second": 68.691, "eval_steps_per_second": 17.173, "step": 44700 }, { "epoch": 597.3333333333334, "eval_loss": 1.8098841905593872, "eval_runtime": 0.7237, "eval_samples_per_second": 49.744, "eval_steps_per_second": 12.436, "step": 44800 }, { "epoch": 598.6666666666666, "eval_loss": 1.8729602098464966, "eval_runtime": 0.5292, "eval_samples_per_second": 68.03, "eval_steps_per_second": 17.007, "step": 44900 }, { "epoch": 600.0, "grad_norm": 0.18704155087471008, "learning_rate": 2.0000000000000003e-06, "loss": 0.9798, "step": 45000 }, { "epoch": 600.0, "eval_loss": 1.8336758613586426, "eval_runtime": 0.5503, "eval_samples_per_second": 65.42, "eval_steps_per_second": 16.355, "step": 45000 }, { "epoch": 601.3333333333334, "eval_loss": 1.798879623413086, "eval_runtime": 0.5501, "eval_samples_per_second": 65.44, "eval_steps_per_second": 16.36, "step": 45100 }, { "epoch": 602.6666666666666, "eval_loss": 1.838456630706787, "eval_runtime": 0.5318, "eval_samples_per_second": 67.691, "eval_steps_per_second": 16.923, "step": 45200 }, { "epoch": 604.0, "eval_loss": 1.855639100074768, "eval_runtime": 0.5367, "eval_samples_per_second": 67.079, "eval_steps_per_second": 16.77, "step": 45300 }, { "epoch": 605.3333333333334, "eval_loss": 1.8457268476486206, "eval_runtime": 0.5452, "eval_samples_per_second": 66.036, "eval_steps_per_second": 16.509, "step": 45400 }, { "epoch": 606.6666666666666, "grad_norm": 0.11866223812103271, "learning_rate": 1.9666666666666668e-06, "loss": 0.9685, "step": 45500 }, { "epoch": 606.6666666666666, "eval_loss": 1.822603464126587, "eval_runtime": 0.5537, "eval_samples_per_second": 65.013, "eval_steps_per_second": 16.253, "step": 45500 }, { "epoch": 608.0, "eval_loss": 1.8561204671859741, "eval_runtime": 0.5625, "eval_samples_per_second": 64.001, "eval_steps_per_second": 16.0, "step": 45600 }, { "epoch": 609.3333333333334, "eval_loss": 1.8446394205093384, "eval_runtime": 0.5482, "eval_samples_per_second": 65.671, "eval_steps_per_second": 16.418, "step": 45700 }, { "epoch": 610.6666666666666, "eval_loss": 1.8913522958755493, "eval_runtime": 0.5296, "eval_samples_per_second": 67.973, "eval_steps_per_second": 16.993, "step": 45800 }, { "epoch": 612.0, "eval_loss": 1.9162565469741821, "eval_runtime": 0.5568, "eval_samples_per_second": 64.65, "eval_steps_per_second": 16.162, "step": 45900 }, { "epoch": 613.3333333333334, "grad_norm": 0.1157788634300232, "learning_rate": 1.9333333333333336e-06, "loss": 0.9883, "step": 46000 }, { "epoch": 613.3333333333334, "eval_loss": 2.072524309158325, "eval_runtime": 0.5534, "eval_samples_per_second": 65.048, "eval_steps_per_second": 16.262, "step": 46000 }, { "epoch": 614.6666666666666, "eval_loss": 1.8508360385894775, "eval_runtime": 0.5491, "eval_samples_per_second": 65.564, "eval_steps_per_second": 16.391, "step": 46100 }, { "epoch": 616.0, "eval_loss": 1.7464029788970947, "eval_runtime": 0.5615, "eval_samples_per_second": 64.119, "eval_steps_per_second": 16.03, "step": 46200 }, { "epoch": 617.3333333333334, "eval_loss": 1.748913288116455, "eval_runtime": 0.5479, "eval_samples_per_second": 65.7, "eval_steps_per_second": 16.425, "step": 46300 }, { "epoch": 618.6666666666666, "eval_loss": 1.8173909187316895, "eval_runtime": 0.563, "eval_samples_per_second": 63.938, "eval_steps_per_second": 15.984, "step": 46400 }, { "epoch": 620.0, "grad_norm": 0.0547378808259964, "learning_rate": 1.9000000000000002e-06, "loss": 0.9681, "step": 46500 }, { "epoch": 620.0, "eval_loss": 1.843059778213501, "eval_runtime": 0.5565, "eval_samples_per_second": 64.685, "eval_steps_per_second": 16.171, "step": 46500 }, { "epoch": 621.3333333333334, "eval_loss": 1.8340284824371338, "eval_runtime": 0.5779, "eval_samples_per_second": 62.296, "eval_steps_per_second": 15.574, "step": 46600 }, { "epoch": 622.6666666666666, "eval_loss": 1.8259119987487793, "eval_runtime": 0.5498, "eval_samples_per_second": 65.483, "eval_steps_per_second": 16.371, "step": 46700 }, { "epoch": 624.0, "eval_loss": 1.7972196340560913, "eval_runtime": 0.5663, "eval_samples_per_second": 63.576, "eval_steps_per_second": 15.894, "step": 46800 }, { "epoch": 625.3333333333334, "eval_loss": 1.7690060138702393, "eval_runtime": 0.5871, "eval_samples_per_second": 61.32, "eval_steps_per_second": 15.33, "step": 46900 }, { "epoch": 626.6666666666666, "grad_norm": 0.07820368558168411, "learning_rate": 1.8666666666666669e-06, "loss": 0.9857, "step": 47000 }, { "epoch": 626.6666666666666, "eval_loss": 1.665734052658081, "eval_runtime": 0.5766, "eval_samples_per_second": 62.434, "eval_steps_per_second": 15.609, "step": 47000 }, { "epoch": 628.0, "eval_loss": 1.6628824472427368, "eval_runtime": 0.5658, "eval_samples_per_second": 63.628, "eval_steps_per_second": 15.907, "step": 47100 }, { "epoch": 629.3333333333334, "eval_loss": 1.6513375043869019, "eval_runtime": 0.579, "eval_samples_per_second": 62.179, "eval_steps_per_second": 15.545, "step": 47200 }, { "epoch": 630.6666666666666, "eval_loss": 1.7079135179519653, "eval_runtime": 0.5595, "eval_samples_per_second": 64.339, "eval_steps_per_second": 16.085, "step": 47300 }, { "epoch": 632.0, "eval_loss": 1.747074007987976, "eval_runtime": 0.5655, "eval_samples_per_second": 63.656, "eval_steps_per_second": 15.914, "step": 47400 }, { "epoch": 633.3333333333334, "grad_norm": 0.06210293248295784, "learning_rate": 1.8333333333333333e-06, "loss": 0.9682, "step": 47500 }, { "epoch": 633.3333333333334, "eval_loss": 1.7287406921386719, "eval_runtime": 0.5597, "eval_samples_per_second": 64.325, "eval_steps_per_second": 16.081, "step": 47500 }, { "epoch": 634.6666666666666, "eval_loss": 1.727621078491211, "eval_runtime": 0.5613, "eval_samples_per_second": 64.133, "eval_steps_per_second": 16.033, "step": 47600 }, { "epoch": 636.0, "eval_loss": 1.7201074361801147, "eval_runtime": 0.5681, "eval_samples_per_second": 63.373, "eval_steps_per_second": 15.843, "step": 47700 }, { "epoch": 637.3333333333334, "eval_loss": 1.7353777885437012, "eval_runtime": 0.5933, "eval_samples_per_second": 60.682, "eval_steps_per_second": 15.171, "step": 47800 }, { "epoch": 638.6666666666666, "eval_loss": 1.7249058485031128, "eval_runtime": 0.6384, "eval_samples_per_second": 56.388, "eval_steps_per_second": 14.097, "step": 47900 }, { "epoch": 640.0, "grad_norm": 0.052343543618917465, "learning_rate": 1.8000000000000001e-06, "loss": 0.9808, "step": 48000 }, { "epoch": 640.0, "eval_loss": 1.71388840675354, "eval_runtime": 0.58, "eval_samples_per_second": 62.073, "eval_steps_per_second": 15.518, "step": 48000 }, { "epoch": 641.3333333333334, "eval_loss": 1.7039189338684082, "eval_runtime": 0.571, "eval_samples_per_second": 63.044, "eval_steps_per_second": 15.761, "step": 48100 }, { "epoch": 642.6666666666666, "eval_loss": 1.6999964714050293, "eval_runtime": 0.562, "eval_samples_per_second": 64.058, "eval_steps_per_second": 16.014, "step": 48200 }, { "epoch": 644.0, "eval_loss": 1.7216050624847412, "eval_runtime": 0.7542, "eval_samples_per_second": 47.735, "eval_steps_per_second": 11.934, "step": 48300 }, { "epoch": 645.3333333333334, "eval_loss": 1.722676157951355, "eval_runtime": 0.5543, "eval_samples_per_second": 64.949, "eval_steps_per_second": 16.237, "step": 48400 }, { "epoch": 646.6666666666666, "grad_norm": 2.488457441329956, "learning_rate": 1.7666666666666668e-06, "loss": 0.9715, "step": 48500 }, { "epoch": 646.6666666666666, "eval_loss": 1.7267606258392334, "eval_runtime": 0.5695, "eval_samples_per_second": 63.212, "eval_steps_per_second": 15.803, "step": 48500 }, { "epoch": 648.0, "eval_loss": 1.7102618217468262, "eval_runtime": 0.5809, "eval_samples_per_second": 61.969, "eval_steps_per_second": 15.492, "step": 48600 }, { "epoch": 649.3333333333334, "eval_loss": 1.782753825187683, "eval_runtime": 0.562, "eval_samples_per_second": 64.055, "eval_steps_per_second": 16.014, "step": 48700 }, { "epoch": 650.6666666666666, "eval_loss": 1.762971043586731, "eval_runtime": 0.5751, "eval_samples_per_second": 62.597, "eval_steps_per_second": 15.649, "step": 48800 }, { "epoch": 652.0, "eval_loss": 1.7744052410125732, "eval_runtime": 0.5745, "eval_samples_per_second": 62.668, "eval_steps_per_second": 15.667, "step": 48900 }, { "epoch": 653.3333333333334, "grad_norm": 0.09381379187107086, "learning_rate": 1.7333333333333336e-06, "loss": 0.9787, "step": 49000 }, { "epoch": 653.3333333333334, "eval_loss": 1.6837247610092163, "eval_runtime": 0.5642, "eval_samples_per_second": 63.806, "eval_steps_per_second": 15.952, "step": 49000 }, { "epoch": 654.6666666666666, "eval_loss": 1.6776472330093384, "eval_runtime": 0.7372, "eval_samples_per_second": 48.836, "eval_steps_per_second": 12.209, "step": 49100 }, { "epoch": 656.0, "eval_loss": 1.6681586503982544, "eval_runtime": 0.5579, "eval_samples_per_second": 64.525, "eval_steps_per_second": 16.131, "step": 49200 }, { "epoch": 657.3333333333334, "eval_loss": 1.6666615009307861, "eval_runtime": 0.5645, "eval_samples_per_second": 63.771, "eval_steps_per_second": 15.943, "step": 49300 }, { "epoch": 658.6666666666666, "eval_loss": 1.6990092992782593, "eval_runtime": 0.5638, "eval_samples_per_second": 63.853, "eval_steps_per_second": 15.963, "step": 49400 }, { "epoch": 660.0, "grad_norm": 0.06082923337817192, "learning_rate": 1.7000000000000002e-06, "loss": 0.9841, "step": 49500 }, { "epoch": 660.0, "eval_loss": 1.7287834882736206, "eval_runtime": 0.5687, "eval_samples_per_second": 63.304, "eval_steps_per_second": 15.826, "step": 49500 }, { "epoch": 661.3333333333334, "eval_loss": 1.742866039276123, "eval_runtime": 0.5945, "eval_samples_per_second": 60.554, "eval_steps_per_second": 15.138, "step": 49600 }, { "epoch": 662.6666666666666, "eval_loss": 1.8161176443099976, "eval_runtime": 0.5713, "eval_samples_per_second": 63.017, "eval_steps_per_second": 15.754, "step": 49700 }, { "epoch": 664.0, "eval_loss": 1.7449095249176025, "eval_runtime": 0.5546, "eval_samples_per_second": 64.911, "eval_steps_per_second": 16.228, "step": 49800 }, { "epoch": 665.3333333333334, "eval_loss": 1.6393769979476929, "eval_runtime": 0.5539, "eval_samples_per_second": 64.99, "eval_steps_per_second": 16.248, "step": 49900 }, { "epoch": 666.6666666666666, "grad_norm": 0.10215742141008377, "learning_rate": 1.6666666666666667e-06, "loss": 0.9763, "step": 50000 }, { "epoch": 666.6666666666666, "eval_loss": 1.6390786170959473, "eval_runtime": 0.559, "eval_samples_per_second": 64.403, "eval_steps_per_second": 16.101, "step": 50000 }, { "epoch": 668.0, "eval_loss": 1.7599010467529297, "eval_runtime": 0.553, "eval_samples_per_second": 65.095, "eval_steps_per_second": 16.274, "step": 50100 }, { "epoch": 669.3333333333334, "eval_loss": 1.7627545595169067, "eval_runtime": 0.5815, "eval_samples_per_second": 61.905, "eval_steps_per_second": 15.476, "step": 50200 }, { "epoch": 670.6666666666666, "eval_loss": 1.754063606262207, "eval_runtime": 0.5621, "eval_samples_per_second": 64.043, "eval_steps_per_second": 16.011, "step": 50300 }, { "epoch": 672.0, "eval_loss": 1.754131555557251, "eval_runtime": 0.5584, "eval_samples_per_second": 64.475, "eval_steps_per_second": 16.119, "step": 50400 }, { "epoch": 673.3333333333334, "grad_norm": 0.29858145117759705, "learning_rate": 1.6333333333333335e-06, "loss": 0.9701, "step": 50500 }, { "epoch": 673.3333333333334, "eval_loss": 1.760127067565918, "eval_runtime": 0.5699, "eval_samples_per_second": 63.173, "eval_steps_per_second": 15.793, "step": 50500 }, { "epoch": 674.6666666666666, "eval_loss": 1.7486934661865234, "eval_runtime": 0.6341, "eval_samples_per_second": 56.778, "eval_steps_per_second": 14.194, "step": 50600 }, { "epoch": 676.0, "eval_loss": 1.757436752319336, "eval_runtime": 0.5908, "eval_samples_per_second": 60.933, "eval_steps_per_second": 15.233, "step": 50700 }, { "epoch": 677.3333333333334, "eval_loss": 1.771299958229065, "eval_runtime": 0.5756, "eval_samples_per_second": 62.544, "eval_steps_per_second": 15.636, "step": 50800 }, { "epoch": 678.6666666666666, "eval_loss": 1.7428710460662842, "eval_runtime": 0.5481, "eval_samples_per_second": 65.678, "eval_steps_per_second": 16.419, "step": 50900 }, { "epoch": 680.0, "grad_norm": 0.14955511689186096, "learning_rate": 1.6000000000000001e-06, "loss": 0.9776, "step": 51000 }, { "epoch": 680.0, "eval_loss": 1.7331171035766602, "eval_runtime": 0.5709, "eval_samples_per_second": 63.058, "eval_steps_per_second": 15.764, "step": 51000 }, { "epoch": 681.3333333333334, "eval_loss": 1.7390122413635254, "eval_runtime": 0.5624, "eval_samples_per_second": 64.017, "eval_steps_per_second": 16.004, "step": 51100 }, { "epoch": 682.6666666666666, "eval_loss": 1.7458256483078003, "eval_runtime": 0.5831, "eval_samples_per_second": 61.738, "eval_steps_per_second": 15.435, "step": 51200 }, { "epoch": 684.0, "eval_loss": 1.779284119606018, "eval_runtime": 0.6067, "eval_samples_per_second": 59.341, "eval_steps_per_second": 14.835, "step": 51300 }, { "epoch": 685.3333333333334, "eval_loss": 1.7581945657730103, "eval_runtime": 0.5799, "eval_samples_per_second": 62.082, "eval_steps_per_second": 15.521, "step": 51400 }, { "epoch": 686.6666666666666, "grad_norm": 0.13983415067195892, "learning_rate": 1.566666666666667e-06, "loss": 0.9879, "step": 51500 }, { "epoch": 686.6666666666666, "eval_loss": 1.6377967596054077, "eval_runtime": 0.5937, "eval_samples_per_second": 60.633, "eval_steps_per_second": 15.158, "step": 51500 }, { "epoch": 688.0, "eval_loss": 1.7654058933258057, "eval_runtime": 0.5534, "eval_samples_per_second": 65.051, "eval_steps_per_second": 16.263, "step": 51600 }, { "epoch": 689.3333333333334, "eval_loss": 1.803229808807373, "eval_runtime": 0.5827, "eval_samples_per_second": 61.784, "eval_steps_per_second": 15.446, "step": 51700 }, { "epoch": 690.6666666666666, "eval_loss": 1.7764018774032593, "eval_runtime": 0.5767, "eval_samples_per_second": 62.428, "eval_steps_per_second": 15.607, "step": 51800 }, { "epoch": 692.0, "eval_loss": 1.7827038764953613, "eval_runtime": 0.5615, "eval_samples_per_second": 64.111, "eval_steps_per_second": 16.028, "step": 51900 }, { "epoch": 693.3333333333334, "grad_norm": 0.09914771467447281, "learning_rate": 1.5333333333333334e-06, "loss": 0.972, "step": 52000 }, { "epoch": 693.3333333333334, "eval_loss": 1.785254955291748, "eval_runtime": 0.5651, "eval_samples_per_second": 63.706, "eval_steps_per_second": 15.926, "step": 52000 }, { "epoch": 694.6666666666666, "eval_loss": 1.760009765625, "eval_runtime": 0.5548, "eval_samples_per_second": 64.884, "eval_steps_per_second": 16.221, "step": 52100 }, { "epoch": 696.0, "eval_loss": 1.7468301057815552, "eval_runtime": 0.5749, "eval_samples_per_second": 62.614, "eval_steps_per_second": 15.654, "step": 52200 }, { "epoch": 697.3333333333334, "eval_loss": 1.7260651588439941, "eval_runtime": 0.5698, "eval_samples_per_second": 63.183, "eval_steps_per_second": 15.796, "step": 52300 }, { "epoch": 698.6666666666666, "eval_loss": 1.7251561880111694, "eval_runtime": 0.5709, "eval_samples_per_second": 63.054, "eval_steps_per_second": 15.763, "step": 52400 }, { "epoch": 700.0, "grad_norm": 11.26338005065918, "learning_rate": 1.5e-06, "loss": 0.984, "step": 52500 }, { "epoch": 700.0, "eval_loss": 1.6725823879241943, "eval_runtime": 0.5728, "eval_samples_per_second": 62.845, "eval_steps_per_second": 15.711, "step": 52500 }, { "epoch": 701.3333333333334, "eval_loss": 1.7238188982009888, "eval_runtime": 0.5501, "eval_samples_per_second": 65.438, "eval_steps_per_second": 16.359, "step": 52600 }, { "epoch": 702.6666666666666, "eval_loss": 1.7595596313476562, "eval_runtime": 0.5473, "eval_samples_per_second": 65.779, "eval_steps_per_second": 16.445, "step": 52700 }, { "epoch": 704.0, "eval_loss": 1.7719124555587769, "eval_runtime": 0.5464, "eval_samples_per_second": 65.891, "eval_steps_per_second": 16.473, "step": 52800 }, { "epoch": 705.3333333333334, "eval_loss": 1.78886878490448, "eval_runtime": 0.5638, "eval_samples_per_second": 63.857, "eval_steps_per_second": 15.964, "step": 52900 }, { "epoch": 706.6666666666666, "grad_norm": 0.08296012878417969, "learning_rate": 1.4666666666666669e-06, "loss": 0.9784, "step": 53000 }, { "epoch": 706.6666666666666, "eval_loss": 1.8150266408920288, "eval_runtime": 0.5446, "eval_samples_per_second": 66.102, "eval_steps_per_second": 16.525, "step": 53000 }, { "epoch": 708.0, "eval_loss": 1.7816083431243896, "eval_runtime": 0.5776, "eval_samples_per_second": 62.322, "eval_steps_per_second": 15.58, "step": 53100 }, { "epoch": 709.3333333333334, "eval_loss": 1.7753357887268066, "eval_runtime": 0.5719, "eval_samples_per_second": 62.944, "eval_steps_per_second": 15.736, "step": 53200 }, { "epoch": 710.6666666666666, "eval_loss": 1.7961137294769287, "eval_runtime": 0.5638, "eval_samples_per_second": 63.852, "eval_steps_per_second": 15.963, "step": 53300 }, { "epoch": 712.0, "eval_loss": 1.7825465202331543, "eval_runtime": 0.5765, "eval_samples_per_second": 62.448, "eval_steps_per_second": 15.612, "step": 53400 }, { "epoch": 713.3333333333334, "grad_norm": 0.21331265568733215, "learning_rate": 1.4333333333333335e-06, "loss": 0.9961, "step": 53500 }, { "epoch": 713.3333333333334, "eval_loss": 1.7892060279846191, "eval_runtime": 0.5669, "eval_samples_per_second": 63.506, "eval_steps_per_second": 15.876, "step": 53500 }, { "epoch": 714.6666666666666, "eval_loss": 1.7976782321929932, "eval_runtime": 0.5821, "eval_samples_per_second": 61.841, "eval_steps_per_second": 15.46, "step": 53600 }, { "epoch": 716.0, "eval_loss": 1.789144515991211, "eval_runtime": 0.5799, "eval_samples_per_second": 62.084, "eval_steps_per_second": 15.521, "step": 53700 }, { "epoch": 717.3333333333334, "eval_loss": 1.7800437211990356, "eval_runtime": 0.5728, "eval_samples_per_second": 62.852, "eval_steps_per_second": 15.713, "step": 53800 }, { "epoch": 718.6666666666666, "eval_loss": 1.8110613822937012, "eval_runtime": 0.5656, "eval_samples_per_second": 63.649, "eval_steps_per_second": 15.912, "step": 53900 }, { "epoch": 720.0, "grad_norm": 0.08738431334495544, "learning_rate": 1.4000000000000001e-06, "loss": 0.9735, "step": 54000 }, { "epoch": 720.0, "eval_loss": 1.8085912466049194, "eval_runtime": 0.5731, "eval_samples_per_second": 62.821, "eval_steps_per_second": 15.705, "step": 54000 }, { "epoch": 721.3333333333334, "eval_loss": 1.782285213470459, "eval_runtime": 0.5462, "eval_samples_per_second": 65.907, "eval_steps_per_second": 16.477, "step": 54100 }, { "epoch": 722.6666666666666, "eval_loss": 1.7697291374206543, "eval_runtime": 0.5647, "eval_samples_per_second": 63.747, "eval_steps_per_second": 15.937, "step": 54200 }, { "epoch": 724.0, "eval_loss": 1.85777747631073, "eval_runtime": 0.5746, "eval_samples_per_second": 62.65, "eval_steps_per_second": 15.662, "step": 54300 }, { "epoch": 725.3333333333334, "eval_loss": 1.7846357822418213, "eval_runtime": 0.5795, "eval_samples_per_second": 62.124, "eval_steps_per_second": 15.531, "step": 54400 }, { "epoch": 726.6666666666666, "grad_norm": 0.11066329479217529, "learning_rate": 1.3666666666666668e-06, "loss": 0.9722, "step": 54500 }, { "epoch": 726.6666666666666, "eval_loss": 1.7788323163986206, "eval_runtime": 0.5969, "eval_samples_per_second": 60.315, "eval_steps_per_second": 15.079, "step": 54500 }, { "epoch": 728.0, "eval_loss": 1.792999267578125, "eval_runtime": 0.5606, "eval_samples_per_second": 64.22, "eval_steps_per_second": 16.055, "step": 54600 }, { "epoch": 729.3333333333334, "eval_loss": 1.79288649559021, "eval_runtime": 0.5722, "eval_samples_per_second": 62.917, "eval_steps_per_second": 15.729, "step": 54700 }, { "epoch": 730.6666666666666, "eval_loss": 1.7887053489685059, "eval_runtime": 0.6716, "eval_samples_per_second": 53.601, "eval_steps_per_second": 13.4, "step": 54800 }, { "epoch": 732.0, "eval_loss": 1.777323842048645, "eval_runtime": 0.5665, "eval_samples_per_second": 63.552, "eval_steps_per_second": 15.888, "step": 54900 }, { "epoch": 733.3333333333334, "grad_norm": 0.15428341925144196, "learning_rate": 1.3333333333333334e-06, "loss": 0.9764, "step": 55000 }, { "epoch": 733.3333333333334, "eval_loss": 1.7724252939224243, "eval_runtime": 0.5596, "eval_samples_per_second": 64.331, "eval_steps_per_second": 16.083, "step": 55000 }, { "epoch": 734.6666666666666, "eval_loss": 1.7710621356964111, "eval_runtime": 0.6494, "eval_samples_per_second": 55.436, "eval_steps_per_second": 13.859, "step": 55100 }, { "epoch": 736.0, "eval_loss": 1.767142653465271, "eval_runtime": 0.5678, "eval_samples_per_second": 63.403, "eval_steps_per_second": 15.851, "step": 55200 }, { "epoch": 737.3333333333334, "eval_loss": 1.7742037773132324, "eval_runtime": 0.5806, "eval_samples_per_second": 62.008, "eval_steps_per_second": 15.502, "step": 55300 }, { "epoch": 738.6666666666666, "eval_loss": 1.7692784070968628, "eval_runtime": 0.5872, "eval_samples_per_second": 61.306, "eval_steps_per_second": 15.327, "step": 55400 }, { "epoch": 740.0, "grad_norm": 0.23935452103614807, "learning_rate": 1.3e-06, "loss": 0.9724, "step": 55500 }, { "epoch": 740.0, "eval_loss": 1.769684076309204, "eval_runtime": 0.5858, "eval_samples_per_second": 61.452, "eval_steps_per_second": 15.363, "step": 55500 }, { "epoch": 741.3333333333334, "eval_loss": 1.7725666761398315, "eval_runtime": 0.5631, "eval_samples_per_second": 63.931, "eval_steps_per_second": 15.983, "step": 55600 }, { "epoch": 742.6666666666666, "eval_loss": 1.7719544172286987, "eval_runtime": 0.5719, "eval_samples_per_second": 62.952, "eval_steps_per_second": 15.738, "step": 55700 }, { "epoch": 744.0, "eval_loss": 1.7681567668914795, "eval_runtime": 0.568, "eval_samples_per_second": 63.375, "eval_steps_per_second": 15.844, "step": 55800 }, { "epoch": 745.3333333333334, "eval_loss": 1.7703454494476318, "eval_runtime": 0.7271, "eval_samples_per_second": 49.509, "eval_steps_per_second": 12.377, "step": 55900 }, { "epoch": 746.6666666666666, "grad_norm": 0.06856387108564377, "learning_rate": 1.2666666666666669e-06, "loss": 0.9801, "step": 56000 }, { "epoch": 746.6666666666666, "eval_loss": 1.763393759727478, "eval_runtime": 0.5845, "eval_samples_per_second": 61.593, "eval_steps_per_second": 15.398, "step": 56000 }, { "epoch": 748.0, "eval_loss": 1.7663676738739014, "eval_runtime": 0.5739, "eval_samples_per_second": 62.731, "eval_steps_per_second": 15.683, "step": 56100 }, { "epoch": 749.3333333333334, "eval_loss": 1.7590817213058472, "eval_runtime": 0.5715, "eval_samples_per_second": 62.989, "eval_steps_per_second": 15.747, "step": 56200 }, { "epoch": 750.6666666666666, "eval_loss": 1.76565682888031, "eval_runtime": 0.5536, "eval_samples_per_second": 65.026, "eval_steps_per_second": 16.257, "step": 56300 }, { "epoch": 752.0, "eval_loss": 1.7648284435272217, "eval_runtime": 0.5627, "eval_samples_per_second": 63.972, "eval_steps_per_second": 15.993, "step": 56400 }, { "epoch": 753.3333333333334, "grad_norm": 0.08953717350959778, "learning_rate": 1.2333333333333335e-06, "loss": 0.9817, "step": 56500 }, { "epoch": 753.3333333333334, "eval_loss": 1.7627615928649902, "eval_runtime": 0.5453, "eval_samples_per_second": 66.022, "eval_steps_per_second": 16.506, "step": 56500 }, { "epoch": 754.6666666666666, "eval_loss": 1.7614855766296387, "eval_runtime": 0.5517, "eval_samples_per_second": 65.256, "eval_steps_per_second": 16.314, "step": 56600 }, { "epoch": 756.0, "eval_loss": 1.7645567655563354, "eval_runtime": 0.5905, "eval_samples_per_second": 60.964, "eval_steps_per_second": 15.241, "step": 56700 }, { "epoch": 757.3333333333334, "eval_loss": 1.7689096927642822, "eval_runtime": 0.5515, "eval_samples_per_second": 65.274, "eval_steps_per_second": 16.318, "step": 56800 }, { "epoch": 758.6666666666666, "eval_loss": 1.8117413520812988, "eval_runtime": 0.6219, "eval_samples_per_second": 57.883, "eval_steps_per_second": 14.471, "step": 56900 }, { "epoch": 760.0, "grad_norm": 11.162915229797363, "learning_rate": 1.2000000000000002e-06, "loss": 0.9688, "step": 57000 }, { "epoch": 760.0, "eval_loss": 1.8123798370361328, "eval_runtime": 0.5876, "eval_samples_per_second": 61.268, "eval_steps_per_second": 15.317, "step": 57000 }, { "epoch": 761.3333333333334, "eval_loss": 1.7966175079345703, "eval_runtime": 0.5691, "eval_samples_per_second": 63.262, "eval_steps_per_second": 15.815, "step": 57100 }, { "epoch": 762.6666666666666, "eval_loss": 1.990064024925232, "eval_runtime": 0.6374, "eval_samples_per_second": 56.481, "eval_steps_per_second": 14.12, "step": 57200 }, { "epoch": 764.0, "eval_loss": 1.7721017599105835, "eval_runtime": 0.5931, "eval_samples_per_second": 60.703, "eval_steps_per_second": 15.176, "step": 57300 }, { "epoch": 765.3333333333334, "eval_loss": 1.8064172267913818, "eval_runtime": 0.5656, "eval_samples_per_second": 63.654, "eval_steps_per_second": 15.913, "step": 57400 }, { "epoch": 766.6666666666666, "grad_norm": 0.0940064936876297, "learning_rate": 1.1666666666666668e-06, "loss": 0.9712, "step": 57500 }, { "epoch": 766.6666666666666, "eval_loss": 1.7841684818267822, "eval_runtime": 0.598, "eval_samples_per_second": 60.196, "eval_steps_per_second": 15.049, "step": 57500 }, { "epoch": 768.0, "eval_loss": 1.762632131576538, "eval_runtime": 0.5807, "eval_samples_per_second": 61.99, "eval_steps_per_second": 15.498, "step": 57600 }, { "epoch": 769.3333333333334, "eval_loss": 1.7605628967285156, "eval_runtime": 0.567, "eval_samples_per_second": 63.489, "eval_steps_per_second": 15.872, "step": 57700 }, { "epoch": 770.6666666666666, "eval_loss": 1.7560803890228271, "eval_runtime": 0.5673, "eval_samples_per_second": 63.458, "eval_steps_per_second": 15.864, "step": 57800 }, { "epoch": 772.0, "eval_loss": 1.8055216073989868, "eval_runtime": 0.5835, "eval_samples_per_second": 61.7, "eval_steps_per_second": 15.425, "step": 57900 }, { "epoch": 773.3333333333334, "grad_norm": 0.06758938729763031, "learning_rate": 1.1333333333333334e-06, "loss": 0.9802, "step": 58000 }, { "epoch": 773.3333333333334, "eval_loss": 1.7569754123687744, "eval_runtime": 0.7778, "eval_samples_per_second": 46.284, "eval_steps_per_second": 11.571, "step": 58000 }, { "epoch": 774.6666666666666, "eval_loss": 1.627150058746338, "eval_runtime": 0.5942, "eval_samples_per_second": 60.588, "eval_steps_per_second": 15.147, "step": 58100 }, { "epoch": 776.0, "eval_loss": 1.763795256614685, "eval_runtime": 0.5941, "eval_samples_per_second": 60.599, "eval_steps_per_second": 15.15, "step": 58200 }, { "epoch": 777.3333333333334, "eval_loss": 1.7548481225967407, "eval_runtime": 0.5679, "eval_samples_per_second": 63.391, "eval_steps_per_second": 15.848, "step": 58300 }, { "epoch": 778.6666666666666, "eval_loss": 1.7789323329925537, "eval_runtime": 0.5591, "eval_samples_per_second": 64.39, "eval_steps_per_second": 16.097, "step": 58400 }, { "epoch": 780.0, "grad_norm": 0.06792610138654709, "learning_rate": 1.1e-06, "loss": 0.9779, "step": 58500 }, { "epoch": 780.0, "eval_loss": 1.897459864616394, "eval_runtime": 0.5703, "eval_samples_per_second": 63.12, "eval_steps_per_second": 15.78, "step": 58500 }, { "epoch": 781.3333333333334, "eval_loss": 1.7796739339828491, "eval_runtime": 0.5625, "eval_samples_per_second": 63.995, "eval_steps_per_second": 15.999, "step": 58600 }, { "epoch": 782.6666666666666, "eval_loss": 1.7793612480163574, "eval_runtime": 0.5705, "eval_samples_per_second": 63.105, "eval_steps_per_second": 15.776, "step": 58700 }, { "epoch": 784.0, "eval_loss": 1.783490538597107, "eval_runtime": 0.586, "eval_samples_per_second": 61.43, "eval_steps_per_second": 15.357, "step": 58800 }, { "epoch": 785.3333333333334, "eval_loss": 1.7745269536972046, "eval_runtime": 0.588, "eval_samples_per_second": 61.228, "eval_steps_per_second": 15.307, "step": 58900 }, { "epoch": 786.6666666666666, "grad_norm": 0.05725092440843582, "learning_rate": 1.066666666666667e-06, "loss": 0.9772, "step": 59000 }, { "epoch": 786.6666666666666, "eval_loss": 1.5808134078979492, "eval_runtime": 0.5854, "eval_samples_per_second": 61.498, "eval_steps_per_second": 15.375, "step": 59000 }, { "epoch": 788.0, "eval_loss": 1.7920979261398315, "eval_runtime": 0.5645, "eval_samples_per_second": 63.773, "eval_steps_per_second": 15.943, "step": 59100 }, { "epoch": 789.3333333333334, "eval_loss": 1.8161612749099731, "eval_runtime": 0.6094, "eval_samples_per_second": 59.073, "eval_steps_per_second": 14.768, "step": 59200 }, { "epoch": 790.6666666666666, "eval_loss": 1.785134196281433, "eval_runtime": 0.5949, "eval_samples_per_second": 60.511, "eval_steps_per_second": 15.128, "step": 59300 }, { "epoch": 792.0, "eval_loss": 1.8116917610168457, "eval_runtime": 0.5795, "eval_samples_per_second": 62.119, "eval_steps_per_second": 15.53, "step": 59400 }, { "epoch": 793.3333333333334, "grad_norm": 0.09758958965539932, "learning_rate": 1.0333333333333333e-06, "loss": 0.9736, "step": 59500 }, { "epoch": 793.3333333333334, "eval_loss": 1.8234097957611084, "eval_runtime": 0.5593, "eval_samples_per_second": 64.367, "eval_steps_per_second": 16.092, "step": 59500 }, { "epoch": 794.6666666666666, "eval_loss": 1.804282307624817, "eval_runtime": 0.6764, "eval_samples_per_second": 53.224, "eval_steps_per_second": 13.306, "step": 59600 }, { "epoch": 796.0, "eval_loss": 1.8073744773864746, "eval_runtime": 0.5915, "eval_samples_per_second": 60.863, "eval_steps_per_second": 15.216, "step": 59700 }, { "epoch": 797.3333333333334, "eval_loss": 1.7647827863693237, "eval_runtime": 0.5659, "eval_samples_per_second": 63.61, "eval_steps_per_second": 15.903, "step": 59800 }, { "epoch": 798.6666666666666, "eval_loss": 1.7625632286071777, "eval_runtime": 0.5668, "eval_samples_per_second": 63.51, "eval_steps_per_second": 15.878, "step": 59900 }, { "epoch": 800.0, "grad_norm": 0.03814810514450073, "learning_rate": 1.0000000000000002e-06, "loss": 0.9771, "step": 60000 }, { "epoch": 800.0, "eval_loss": 1.8124709129333496, "eval_runtime": 0.583, "eval_samples_per_second": 61.746, "eval_steps_per_second": 15.437, "step": 60000 }, { "epoch": 801.3333333333334, "eval_loss": 1.8087245225906372, "eval_runtime": 0.6055, "eval_samples_per_second": 59.454, "eval_steps_per_second": 14.864, "step": 60100 }, { "epoch": 802.6666666666666, "eval_loss": 1.8008049726486206, "eval_runtime": 0.5533, "eval_samples_per_second": 65.059, "eval_steps_per_second": 16.265, "step": 60200 }, { "epoch": 804.0, "eval_loss": 1.829994797706604, "eval_runtime": 0.5694, "eval_samples_per_second": 63.229, "eval_steps_per_second": 15.807, "step": 60300 }, { "epoch": 805.3333333333334, "eval_loss": 1.7893199920654297, "eval_runtime": 0.5781, "eval_samples_per_second": 62.276, "eval_steps_per_second": 15.569, "step": 60400 }, { "epoch": 806.6666666666666, "grad_norm": 0.061076752841472626, "learning_rate": 9.666666666666668e-07, "loss": 0.9765, "step": 60500 }, { "epoch": 806.6666666666666, "eval_loss": 1.7675565481185913, "eval_runtime": 0.5838, "eval_samples_per_second": 61.669, "eval_steps_per_second": 15.417, "step": 60500 }, { "epoch": 808.0, "eval_loss": 1.6955784559249878, "eval_runtime": 0.5825, "eval_samples_per_second": 61.799, "eval_steps_per_second": 15.45, "step": 60600 }, { "epoch": 809.3333333333334, "eval_loss": 1.7824609279632568, "eval_runtime": 0.5739, "eval_samples_per_second": 62.733, "eval_steps_per_second": 15.683, "step": 60700 }, { "epoch": 810.6666666666666, "eval_loss": 1.6787306070327759, "eval_runtime": 0.5865, "eval_samples_per_second": 61.384, "eval_steps_per_second": 15.346, "step": 60800 }, { "epoch": 812.0, "eval_loss": 1.7716015577316284, "eval_runtime": 0.5861, "eval_samples_per_second": 61.421, "eval_steps_per_second": 15.355, "step": 60900 }, { "epoch": 813.3333333333334, "grad_norm": 0.07845693826675415, "learning_rate": 9.333333333333334e-07, "loss": 0.984, "step": 61000 }, { "epoch": 813.3333333333334, "eval_loss": 1.803680658340454, "eval_runtime": 0.5991, "eval_samples_per_second": 60.086, "eval_steps_per_second": 15.022, "step": 61000 }, { "epoch": 814.6666666666666, "eval_loss": 1.8008581399917603, "eval_runtime": 0.5742, "eval_samples_per_second": 62.701, "eval_steps_per_second": 15.675, "step": 61100 }, { "epoch": 816.0, "eval_loss": 1.7911458015441895, "eval_runtime": 0.5834, "eval_samples_per_second": 61.703, "eval_steps_per_second": 15.426, "step": 61200 }, { "epoch": 817.3333333333334, "eval_loss": 1.8005414009094238, "eval_runtime": 0.5702, "eval_samples_per_second": 63.131, "eval_steps_per_second": 15.783, "step": 61300 }, { "epoch": 818.6666666666666, "eval_loss": 1.786150574684143, "eval_runtime": 0.553, "eval_samples_per_second": 65.094, "eval_steps_per_second": 16.273, "step": 61400 }, { "epoch": 820.0, "grad_norm": 3.19755220413208, "learning_rate": 9.000000000000001e-07, "loss": 0.9678, "step": 61500 }, { "epoch": 820.0, "eval_loss": 1.7462598085403442, "eval_runtime": 0.5626, "eval_samples_per_second": 63.984, "eval_steps_per_second": 15.996, "step": 61500 }, { "epoch": 821.3333333333334, "eval_loss": 1.7832919359207153, "eval_runtime": 0.5863, "eval_samples_per_second": 61.406, "eval_steps_per_second": 15.351, "step": 61600 }, { "epoch": 822.6666666666666, "eval_loss": 1.783091425895691, "eval_runtime": 0.5727, "eval_samples_per_second": 62.862, "eval_steps_per_second": 15.715, "step": 61700 }, { "epoch": 824.0, "eval_loss": 1.7775006294250488, "eval_runtime": 0.5839, "eval_samples_per_second": 61.655, "eval_steps_per_second": 15.414, "step": 61800 }, { "epoch": 825.3333333333334, "eval_loss": 1.7872780561447144, "eval_runtime": 0.571, "eval_samples_per_second": 63.048, "eval_steps_per_second": 15.762, "step": 61900 }, { "epoch": 826.6666666666666, "grad_norm": 0.05740072578191757, "learning_rate": 8.666666666666668e-07, "loss": 0.9803, "step": 62000 }, { "epoch": 826.6666666666666, "eval_loss": 1.7958818674087524, "eval_runtime": 0.586, "eval_samples_per_second": 61.429, "eval_steps_per_second": 15.357, "step": 62000 }, { "epoch": 828.0, "eval_loss": 1.8055405616760254, "eval_runtime": 0.5826, "eval_samples_per_second": 61.789, "eval_steps_per_second": 15.447, "step": 62100 }, { "epoch": 829.3333333333334, "eval_loss": 1.809425950050354, "eval_runtime": 0.5805, "eval_samples_per_second": 62.013, "eval_steps_per_second": 15.503, "step": 62200 }, { "epoch": 830.6666666666666, "eval_loss": 1.8108429908752441, "eval_runtime": 0.5467, "eval_samples_per_second": 65.846, "eval_steps_per_second": 16.461, "step": 62300 }, { "epoch": 832.0, "eval_loss": 1.820143222808838, "eval_runtime": 0.5404, "eval_samples_per_second": 66.621, "eval_steps_per_second": 16.655, "step": 62400 }, { "epoch": 833.3333333333334, "grad_norm": 0.051857877522706985, "learning_rate": 8.333333333333333e-07, "loss": 0.9756, "step": 62500 }, { "epoch": 833.3333333333334, "eval_loss": 1.8310799598693848, "eval_runtime": 0.7677, "eval_samples_per_second": 46.891, "eval_steps_per_second": 11.723, "step": 62500 }, { "epoch": 834.6666666666666, "eval_loss": 1.817750096321106, "eval_runtime": 0.5507, "eval_samples_per_second": 65.37, "eval_steps_per_second": 16.343, "step": 62600 }, { "epoch": 836.0, "eval_loss": 1.7940722703933716, "eval_runtime": 0.5607, "eval_samples_per_second": 64.207, "eval_steps_per_second": 16.052, "step": 62700 }, { "epoch": 837.3333333333334, "eval_loss": 1.7446445226669312, "eval_runtime": 0.5594, "eval_samples_per_second": 64.353, "eval_steps_per_second": 16.088, "step": 62800 }, { "epoch": 838.6666666666666, "eval_loss": 1.7396012544631958, "eval_runtime": 0.5638, "eval_samples_per_second": 63.854, "eval_steps_per_second": 15.963, "step": 62900 }, { "epoch": 840.0, "grad_norm": 0.0870683565735817, "learning_rate": 8.000000000000001e-07, "loss": 0.9814, "step": 63000 }, { "epoch": 840.0, "eval_loss": 1.736430287361145, "eval_runtime": 0.5515, "eval_samples_per_second": 65.272, "eval_steps_per_second": 16.318, "step": 63000 }, { "epoch": 841.3333333333334, "eval_loss": 1.756650686264038, "eval_runtime": 0.5618, "eval_samples_per_second": 64.081, "eval_steps_per_second": 16.02, "step": 63100 }, { "epoch": 842.6666666666666, "eval_loss": 1.7455768585205078, "eval_runtime": 0.5353, "eval_samples_per_second": 67.252, "eval_steps_per_second": 16.813, "step": 63200 }, { "epoch": 844.0, "eval_loss": 1.725757122039795, "eval_runtime": 0.5426, "eval_samples_per_second": 66.35, "eval_steps_per_second": 16.588, "step": 63300 }, { "epoch": 845.3333333333334, "eval_loss": 1.75327730178833, "eval_runtime": 0.5463, "eval_samples_per_second": 65.902, "eval_steps_per_second": 16.476, "step": 63400 }, { "epoch": 846.6666666666666, "grad_norm": 0.07694292813539505, "learning_rate": 7.666666666666667e-07, "loss": 0.9788, "step": 63500 }, { "epoch": 846.6666666666666, "eval_loss": 1.7578186988830566, "eval_runtime": 0.5728, "eval_samples_per_second": 62.845, "eval_steps_per_second": 15.711, "step": 63500 }, { "epoch": 848.0, "eval_loss": 1.7655243873596191, "eval_runtime": 0.5488, "eval_samples_per_second": 65.602, "eval_steps_per_second": 16.4, "step": 63600 }, { "epoch": 849.3333333333334, "eval_loss": 1.7662264108657837, "eval_runtime": 0.557, "eval_samples_per_second": 64.628, "eval_steps_per_second": 16.157, "step": 63700 }, { "epoch": 850.6666666666666, "eval_loss": 1.7633532285690308, "eval_runtime": 0.8072, "eval_samples_per_second": 44.597, "eval_steps_per_second": 11.149, "step": 63800 }, { "epoch": 852.0, "eval_loss": 1.7632039785385132, "eval_runtime": 0.5594, "eval_samples_per_second": 64.355, "eval_steps_per_second": 16.089, "step": 63900 }, { "epoch": 853.3333333333334, "grad_norm": 0.1147078350186348, "learning_rate": 7.333333333333334e-07, "loss": 0.9764, "step": 64000 }, { "epoch": 853.3333333333334, "eval_loss": 1.7601314783096313, "eval_runtime": 0.5665, "eval_samples_per_second": 63.546, "eval_steps_per_second": 15.887, "step": 64000 }, { "epoch": 854.6666666666666, "eval_loss": 1.7543902397155762, "eval_runtime": 0.561, "eval_samples_per_second": 64.175, "eval_steps_per_second": 16.044, "step": 64100 }, { "epoch": 856.0, "eval_loss": 1.749356746673584, "eval_runtime": 0.5482, "eval_samples_per_second": 65.667, "eval_steps_per_second": 16.417, "step": 64200 }, { "epoch": 857.3333333333334, "eval_loss": 1.750355839729309, "eval_runtime": 0.552, "eval_samples_per_second": 65.214, "eval_steps_per_second": 16.303, "step": 64300 }, { "epoch": 858.6666666666666, "eval_loss": 1.7478199005126953, "eval_runtime": 0.5532, "eval_samples_per_second": 65.071, "eval_steps_per_second": 16.268, "step": 64400 }, { "epoch": 860.0, "grad_norm": 2.3960113525390625, "learning_rate": 7.000000000000001e-07, "loss": 0.9679, "step": 64500 }, { "epoch": 860.0, "eval_loss": 1.7417558431625366, "eval_runtime": 0.5565, "eval_samples_per_second": 64.691, "eval_steps_per_second": 16.173, "step": 64500 }, { "epoch": 861.3333333333334, "eval_loss": 1.7488526105880737, "eval_runtime": 0.565, "eval_samples_per_second": 63.722, "eval_steps_per_second": 15.93, "step": 64600 }, { "epoch": 862.6666666666666, "eval_loss": 1.7406384944915771, "eval_runtime": 0.5645, "eval_samples_per_second": 63.772, "eval_steps_per_second": 15.943, "step": 64700 }, { "epoch": 864.0, "eval_loss": 1.74196457862854, "eval_runtime": 0.5416, "eval_samples_per_second": 66.474, "eval_steps_per_second": 16.618, "step": 64800 }, { "epoch": 865.3333333333334, "eval_loss": 1.7383569478988647, "eval_runtime": 0.6052, "eval_samples_per_second": 59.486, "eval_steps_per_second": 14.872, "step": 64900 }, { "epoch": 866.6666666666666, "grad_norm": 0.15001507103443146, "learning_rate": 6.666666666666667e-07, "loss": 0.9712, "step": 65000 }, { "epoch": 866.6666666666666, "eval_loss": 1.7424674034118652, "eval_runtime": 0.6201, "eval_samples_per_second": 58.056, "eval_steps_per_second": 14.514, "step": 65000 }, { "epoch": 868.0, "eval_loss": 1.7455703020095825, "eval_runtime": 0.6699, "eval_samples_per_second": 53.738, "eval_steps_per_second": 13.435, "step": 65100 }, { "epoch": 869.3333333333334, "eval_loss": 1.743823528289795, "eval_runtime": 0.5399, "eval_samples_per_second": 66.675, "eval_steps_per_second": 16.669, "step": 65200 }, { "epoch": 870.6666666666666, "eval_loss": 1.7426486015319824, "eval_runtime": 0.5528, "eval_samples_per_second": 65.122, "eval_steps_per_second": 16.28, "step": 65300 }, { "epoch": 872.0, "eval_loss": 1.7444124221801758, "eval_runtime": 0.5363, "eval_samples_per_second": 67.128, "eval_steps_per_second": 16.782, "step": 65400 }, { "epoch": 873.3333333333334, "grad_norm": 0.0696408748626709, "learning_rate": 6.333333333333334e-07, "loss": 0.9858, "step": 65500 }, { "epoch": 873.3333333333334, "eval_loss": 1.745002269744873, "eval_runtime": 0.5338, "eval_samples_per_second": 67.439, "eval_steps_per_second": 16.86, "step": 65500 }, { "epoch": 874.6666666666666, "eval_loss": 1.74554443359375, "eval_runtime": 0.758, "eval_samples_per_second": 47.492, "eval_steps_per_second": 11.873, "step": 65600 }, { "epoch": 876.0, "eval_loss": 1.7522801160812378, "eval_runtime": 0.5875, "eval_samples_per_second": 61.276, "eval_steps_per_second": 15.319, "step": 65700 }, { "epoch": 877.3333333333334, "eval_loss": 1.7494921684265137, "eval_runtime": 0.5617, "eval_samples_per_second": 64.095, "eval_steps_per_second": 16.024, "step": 65800 }, { "epoch": 878.6666666666666, "eval_loss": 1.748443365097046, "eval_runtime": 0.5763, "eval_samples_per_second": 62.463, "eval_steps_per_second": 15.616, "step": 65900 }, { "epoch": 880.0, "grad_norm": 0.10572971403598785, "learning_rate": 6.000000000000001e-07, "loss": 0.9741, "step": 66000 }, { "epoch": 880.0, "eval_loss": 1.7534348964691162, "eval_runtime": 0.5707, "eval_samples_per_second": 63.076, "eval_steps_per_second": 15.769, "step": 66000 }, { "epoch": 881.3333333333334, "eval_loss": 1.7553422451019287, "eval_runtime": 0.5435, "eval_samples_per_second": 66.243, "eval_steps_per_second": 16.561, "step": 66100 }, { "epoch": 882.6666666666666, "eval_loss": 1.752484679222107, "eval_runtime": 0.5701, "eval_samples_per_second": 63.146, "eval_steps_per_second": 15.787, "step": 66200 }, { "epoch": 884.0, "eval_loss": 1.752428650856018, "eval_runtime": 0.5487, "eval_samples_per_second": 65.612, "eval_steps_per_second": 16.403, "step": 66300 }, { "epoch": 885.3333333333334, "eval_loss": 1.7548673152923584, "eval_runtime": 0.5558, "eval_samples_per_second": 64.77, "eval_steps_per_second": 16.192, "step": 66400 }, { "epoch": 886.6666666666666, "grad_norm": 0.1696215718984604, "learning_rate": 5.666666666666667e-07, "loss": 0.9791, "step": 66500 }, { "epoch": 886.6666666666666, "eval_loss": 1.752021312713623, "eval_runtime": 0.5513, "eval_samples_per_second": 65.299, "eval_steps_per_second": 16.325, "step": 66500 }, { "epoch": 888.0, "eval_loss": 1.7501262426376343, "eval_runtime": 0.55, "eval_samples_per_second": 65.449, "eval_steps_per_second": 16.362, "step": 66600 }, { "epoch": 889.3333333333334, "eval_loss": 1.749840497970581, "eval_runtime": 0.5551, "eval_samples_per_second": 64.848, "eval_steps_per_second": 16.212, "step": 66700 }, { "epoch": 890.6666666666666, "eval_loss": 1.7463749647140503, "eval_runtime": 0.5577, "eval_samples_per_second": 64.546, "eval_steps_per_second": 16.136, "step": 66800 }, { "epoch": 892.0, "eval_loss": 1.7526402473449707, "eval_runtime": 0.5602, "eval_samples_per_second": 64.263, "eval_steps_per_second": 16.066, "step": 66900 }, { "epoch": 893.3333333333334, "grad_norm": 3.3557944297790527, "learning_rate": 5.333333333333335e-07, "loss": 0.9866, "step": 67000 }, { "epoch": 893.3333333333334, "eval_loss": 1.7550595998764038, "eval_runtime": 0.5598, "eval_samples_per_second": 64.312, "eval_steps_per_second": 16.078, "step": 67000 }, { "epoch": 894.6666666666666, "eval_loss": 1.7563176155090332, "eval_runtime": 0.5748, "eval_samples_per_second": 62.626, "eval_steps_per_second": 15.657, "step": 67100 }, { "epoch": 896.0, "eval_loss": 1.7577656507492065, "eval_runtime": 0.5703, "eval_samples_per_second": 63.12, "eval_steps_per_second": 15.78, "step": 67200 }, { "epoch": 897.3333333333334, "eval_loss": 1.7577285766601562, "eval_runtime": 0.6251, "eval_samples_per_second": 57.594, "eval_steps_per_second": 14.398, "step": 67300 }, { "epoch": 898.6666666666666, "eval_loss": 1.758493185043335, "eval_runtime": 0.5702, "eval_samples_per_second": 63.137, "eval_steps_per_second": 15.784, "step": 67400 }, { "epoch": 900.0, "grad_norm": 0.002574497601017356, "learning_rate": 5.000000000000001e-07, "loss": 0.9886, "step": 67500 }, { "epoch": 900.0, "eval_loss": 1.7604880332946777, "eval_runtime": 0.5635, "eval_samples_per_second": 63.884, "eval_steps_per_second": 15.971, "step": 67500 }, { "epoch": 901.3333333333334, "eval_loss": 1.7590281963348389, "eval_runtime": 0.5676, "eval_samples_per_second": 63.429, "eval_steps_per_second": 15.857, "step": 67600 }, { "epoch": 902.6666666666666, "eval_loss": 1.7605786323547363, "eval_runtime": 0.5638, "eval_samples_per_second": 63.857, "eval_steps_per_second": 15.964, "step": 67700 }, { "epoch": 904.0, "eval_loss": 1.7571004629135132, "eval_runtime": 0.5653, "eval_samples_per_second": 63.679, "eval_steps_per_second": 15.92, "step": 67800 }, { "epoch": 905.3333333333334, "eval_loss": 1.7539509534835815, "eval_runtime": 0.5568, "eval_samples_per_second": 64.657, "eval_steps_per_second": 16.164, "step": 67900 }, { "epoch": 906.6666666666666, "grad_norm": 0.08656620234251022, "learning_rate": 4.666666666666667e-07, "loss": 0.9781, "step": 68000 }, { "epoch": 906.6666666666666, "eval_loss": 1.7556588649749756, "eval_runtime": 0.5594, "eval_samples_per_second": 64.357, "eval_steps_per_second": 16.089, "step": 68000 }, { "epoch": 908.0, "eval_loss": 1.7576310634613037, "eval_runtime": 0.5771, "eval_samples_per_second": 62.383, "eval_steps_per_second": 15.596, "step": 68100 }, { "epoch": 909.3333333333334, "eval_loss": 1.7636322975158691, "eval_runtime": 0.7285, "eval_samples_per_second": 49.418, "eval_steps_per_second": 12.355, "step": 68200 }, { "epoch": 910.6666666666666, "eval_loss": 1.762420415878296, "eval_runtime": 0.5667, "eval_samples_per_second": 63.522, "eval_steps_per_second": 15.881, "step": 68300 }, { "epoch": 912.0, "eval_loss": 1.7663708925247192, "eval_runtime": 0.5558, "eval_samples_per_second": 64.773, "eval_steps_per_second": 16.193, "step": 68400 }, { "epoch": 913.3333333333334, "grad_norm": 3.1362080574035645, "learning_rate": 4.333333333333334e-07, "loss": 0.9754, "step": 68500 }, { "epoch": 913.3333333333334, "eval_loss": 1.7648608684539795, "eval_runtime": 0.5467, "eval_samples_per_second": 65.849, "eval_steps_per_second": 16.462, "step": 68500 }, { "epoch": 914.6666666666666, "eval_loss": 1.76447331905365, "eval_runtime": 0.5726, "eval_samples_per_second": 62.868, "eval_steps_per_second": 15.717, "step": 68600 }, { "epoch": 916.0, "eval_loss": 1.7645812034606934, "eval_runtime": 0.5729, "eval_samples_per_second": 62.834, "eval_steps_per_second": 15.709, "step": 68700 }, { "epoch": 917.3333333333334, "eval_loss": 1.7644435167312622, "eval_runtime": 0.7369, "eval_samples_per_second": 48.852, "eval_steps_per_second": 12.213, "step": 68800 }, { "epoch": 918.6666666666666, "eval_loss": 1.7620030641555786, "eval_runtime": 0.561, "eval_samples_per_second": 64.17, "eval_steps_per_second": 16.042, "step": 68900 }, { "epoch": 920.0, "grad_norm": 0.002259365050122142, "learning_rate": 4.0000000000000003e-07, "loss": 0.975, "step": 69000 }, { "epoch": 920.0, "eval_loss": 1.7633707523345947, "eval_runtime": 0.6072, "eval_samples_per_second": 59.285, "eval_steps_per_second": 14.821, "step": 69000 }, { "epoch": 921.3333333333334, "eval_loss": 1.760779619216919, "eval_runtime": 0.5716, "eval_samples_per_second": 62.984, "eval_steps_per_second": 15.746, "step": 69100 }, { "epoch": 922.6666666666666, "eval_loss": 1.7606711387634277, "eval_runtime": 0.5511, "eval_samples_per_second": 65.329, "eval_steps_per_second": 16.332, "step": 69200 }, { "epoch": 924.0, "eval_loss": 1.7627135515213013, "eval_runtime": 0.5468, "eval_samples_per_second": 65.832, "eval_steps_per_second": 16.458, "step": 69300 }, { "epoch": 925.3333333333334, "eval_loss": 1.7657824754714966, "eval_runtime": 0.5425, "eval_samples_per_second": 66.353, "eval_steps_per_second": 16.588, "step": 69400 }, { "epoch": 926.6666666666666, "grad_norm": 0.14232970774173737, "learning_rate": 3.666666666666667e-07, "loss": 0.9695, "step": 69500 }, { "epoch": 926.6666666666666, "eval_loss": 1.7679473161697388, "eval_runtime": 0.5516, "eval_samples_per_second": 65.27, "eval_steps_per_second": 16.317, "step": 69500 }, { "epoch": 928.0, "eval_loss": 1.7663129568099976, "eval_runtime": 0.5817, "eval_samples_per_second": 61.886, "eval_steps_per_second": 15.472, "step": 69600 }, { "epoch": 929.3333333333334, "eval_loss": 1.7645525932312012, "eval_runtime": 0.5736, "eval_samples_per_second": 62.763, "eval_steps_per_second": 15.691, "step": 69700 }, { "epoch": 930.6666666666666, "eval_loss": 1.771233081817627, "eval_runtime": 0.5623, "eval_samples_per_second": 64.021, "eval_steps_per_second": 16.005, "step": 69800 }, { "epoch": 932.0, "eval_loss": 1.77271568775177, "eval_runtime": 0.5712, "eval_samples_per_second": 63.026, "eval_steps_per_second": 15.756, "step": 69900 }, { "epoch": 933.3333333333334, "grad_norm": 0.08317013829946518, "learning_rate": 3.3333333333333335e-07, "loss": 0.9687, "step": 70000 }, { "epoch": 933.3333333333334, "eval_loss": 1.7744166851043701, "eval_runtime": 0.5572, "eval_samples_per_second": 64.611, "eval_steps_per_second": 16.153, "step": 70000 }, { "epoch": 934.6666666666666, "eval_loss": 1.772397756576538, "eval_runtime": 0.5696, "eval_samples_per_second": 63.205, "eval_steps_per_second": 15.801, "step": 70100 }, { "epoch": 936.0, "eval_loss": 1.7733337879180908, "eval_runtime": 0.5625, "eval_samples_per_second": 64.001, "eval_steps_per_second": 16.0, "step": 70200 }, { "epoch": 937.3333333333334, "eval_loss": 1.7753205299377441, "eval_runtime": 0.5512, "eval_samples_per_second": 65.308, "eval_steps_per_second": 16.327, "step": 70300 }, { "epoch": 938.6666666666666, "eval_loss": 1.7780596017837524, "eval_runtime": 0.5629, "eval_samples_per_second": 63.954, "eval_steps_per_second": 15.989, "step": 70400 }, { "epoch": 940.0, "grad_norm": 3.0197455883026123, "learning_rate": 3.0000000000000004e-07, "loss": 0.9851, "step": 70500 }, { "epoch": 940.0, "eval_loss": 1.7846887111663818, "eval_runtime": 0.5438, "eval_samples_per_second": 66.203, "eval_steps_per_second": 16.551, "step": 70500 }, { "epoch": 941.3333333333334, "eval_loss": 1.7847132682800293, "eval_runtime": 0.5457, "eval_samples_per_second": 65.965, "eval_steps_per_second": 16.491, "step": 70600 }, { "epoch": 942.6666666666666, "eval_loss": 1.7830935716629028, "eval_runtime": 0.5605, "eval_samples_per_second": 64.231, "eval_steps_per_second": 16.058, "step": 70700 }, { "epoch": 944.0, "eval_loss": 1.7809431552886963, "eval_runtime": 0.5635, "eval_samples_per_second": 63.892, "eval_steps_per_second": 15.973, "step": 70800 }, { "epoch": 945.3333333333334, "eval_loss": 1.7872589826583862, "eval_runtime": 0.5553, "eval_samples_per_second": 64.834, "eval_steps_per_second": 16.209, "step": 70900 }, { "epoch": 946.6666666666666, "grad_norm": 0.06434129178524017, "learning_rate": 2.666666666666667e-07, "loss": 0.9675, "step": 71000 }, { "epoch": 946.6666666666666, "eval_loss": 1.7981126308441162, "eval_runtime": 0.5614, "eval_samples_per_second": 64.128, "eval_steps_per_second": 16.032, "step": 71000 }, { "epoch": 948.0, "eval_loss": 1.7868714332580566, "eval_runtime": 0.5489, "eval_samples_per_second": 65.582, "eval_steps_per_second": 16.396, "step": 71100 }, { "epoch": 949.3333333333334, "eval_loss": 1.7894632816314697, "eval_runtime": 0.5569, "eval_samples_per_second": 64.644, "eval_steps_per_second": 16.161, "step": 71200 }, { "epoch": 950.6666666666666, "eval_loss": 1.7932921648025513, "eval_runtime": 0.5613, "eval_samples_per_second": 64.141, "eval_steps_per_second": 16.035, "step": 71300 }, { "epoch": 952.0, "eval_loss": 1.8011940717697144, "eval_runtime": 0.5478, "eval_samples_per_second": 65.717, "eval_steps_per_second": 16.429, "step": 71400 }, { "epoch": 953.3333333333334, "grad_norm": 0.061037518084049225, "learning_rate": 2.3333333333333336e-07, "loss": 0.9779, "step": 71500 }, { "epoch": 953.3333333333334, "eval_loss": 1.7844412326812744, "eval_runtime": 0.5621, "eval_samples_per_second": 64.046, "eval_steps_per_second": 16.012, "step": 71500 }, { "epoch": 954.6666666666666, "eval_loss": 1.7828458547592163, "eval_runtime": 0.5541, "eval_samples_per_second": 64.968, "eval_steps_per_second": 16.242, "step": 71600 }, { "epoch": 956.0, "eval_loss": 1.7794026136398315, "eval_runtime": 0.5598, "eval_samples_per_second": 64.314, "eval_steps_per_second": 16.079, "step": 71700 }, { "epoch": 957.3333333333334, "eval_loss": 1.7818756103515625, "eval_runtime": 0.6723, "eval_samples_per_second": 53.549, "eval_steps_per_second": 13.387, "step": 71800 }, { "epoch": 958.6666666666666, "eval_loss": 1.7818056344985962, "eval_runtime": 0.5904, "eval_samples_per_second": 60.973, "eval_steps_per_second": 15.243, "step": 71900 }, { "epoch": 960.0, "grad_norm": 0.06045776233077049, "learning_rate": 2.0000000000000002e-07, "loss": 0.9845, "step": 72000 }, { "epoch": 960.0, "eval_loss": 1.7835536003112793, "eval_runtime": 0.5654, "eval_samples_per_second": 63.677, "eval_steps_per_second": 15.919, "step": 72000 }, { "epoch": 961.3333333333334, "eval_loss": 1.7781590223312378, "eval_runtime": 0.5667, "eval_samples_per_second": 63.528, "eval_steps_per_second": 15.882, "step": 72100 }, { "epoch": 962.6666666666666, "eval_loss": 1.7784353494644165, "eval_runtime": 0.5649, "eval_samples_per_second": 63.733, "eval_steps_per_second": 15.933, "step": 72200 }, { "epoch": 964.0, "eval_loss": 1.7777352333068848, "eval_runtime": 0.5561, "eval_samples_per_second": 64.735, "eval_steps_per_second": 16.184, "step": 72300 }, { "epoch": 965.3333333333334, "eval_loss": 1.7766697406768799, "eval_runtime": 0.575, "eval_samples_per_second": 62.604, "eval_steps_per_second": 15.651, "step": 72400 }, { "epoch": 966.6666666666666, "grad_norm": 0.12095582485198975, "learning_rate": 1.6666666666666668e-07, "loss": 0.9718, "step": 72500 }, { "epoch": 966.6666666666666, "eval_loss": 1.7780518531799316, "eval_runtime": 0.5969, "eval_samples_per_second": 60.315, "eval_steps_per_second": 15.079, "step": 72500 }, { "epoch": 968.0, "eval_loss": 1.779961109161377, "eval_runtime": 0.7489, "eval_samples_per_second": 48.073, "eval_steps_per_second": 12.018, "step": 72600 }, { "epoch": 969.3333333333334, "eval_loss": 1.7817274332046509, "eval_runtime": 0.5717, "eval_samples_per_second": 62.97, "eval_steps_per_second": 15.742, "step": 72700 }, { "epoch": 970.6666666666666, "eval_loss": 1.7832821607589722, "eval_runtime": 0.7486, "eval_samples_per_second": 48.088, "eval_steps_per_second": 12.022, "step": 72800 }, { "epoch": 972.0, "eval_loss": 1.783406138420105, "eval_runtime": 0.5663, "eval_samples_per_second": 63.572, "eval_steps_per_second": 15.893, "step": 72900 }, { "epoch": 973.3333333333334, "grad_norm": 0.06391321867704391, "learning_rate": 1.3333333333333336e-07, "loss": 0.9779, "step": 73000 }, { "epoch": 973.3333333333334, "eval_loss": 1.783936619758606, "eval_runtime": 0.5705, "eval_samples_per_second": 63.105, "eval_steps_per_second": 15.776, "step": 73000 }, { "epoch": 974.6666666666666, "eval_loss": 1.7852482795715332, "eval_runtime": 0.5859, "eval_samples_per_second": 61.439, "eval_steps_per_second": 15.36, "step": 73100 }, { "epoch": 976.0, "eval_loss": 1.7861665487289429, "eval_runtime": 0.5858, "eval_samples_per_second": 61.459, "eval_steps_per_second": 15.365, "step": 73200 }, { "epoch": 977.3333333333334, "eval_loss": 1.787322759628296, "eval_runtime": 0.5747, "eval_samples_per_second": 62.644, "eval_steps_per_second": 15.661, "step": 73300 }, { "epoch": 978.6666666666666, "eval_loss": 1.7844524383544922, "eval_runtime": 0.5675, "eval_samples_per_second": 63.439, "eval_steps_per_second": 15.86, "step": 73400 }, { "epoch": 980.0, "grad_norm": 0.1735711395740509, "learning_rate": 1.0000000000000001e-07, "loss": 0.9743, "step": 73500 }, { "epoch": 980.0, "eval_loss": 1.7858521938323975, "eval_runtime": 0.5641, "eval_samples_per_second": 63.82, "eval_steps_per_second": 15.955, "step": 73500 }, { "epoch": 981.3333333333334, "eval_loss": 1.784421682357788, "eval_runtime": 0.5667, "eval_samples_per_second": 63.529, "eval_steps_per_second": 15.882, "step": 73600 }, { "epoch": 982.6666666666666, "eval_loss": 1.7849678993225098, "eval_runtime": 0.5663, "eval_samples_per_second": 63.572, "eval_steps_per_second": 15.893, "step": 73700 }, { "epoch": 984.0, "eval_loss": 1.7846465110778809, "eval_runtime": 0.562, "eval_samples_per_second": 64.062, "eval_steps_per_second": 16.016, "step": 73800 }, { "epoch": 985.3333333333334, "eval_loss": 1.7852948904037476, "eval_runtime": 0.5692, "eval_samples_per_second": 63.245, "eval_steps_per_second": 15.811, "step": 73900 }, { "epoch": 986.6666666666666, "grad_norm": 0.05577833205461502, "learning_rate": 6.666666666666668e-08, "loss": 0.9786, "step": 74000 }, { "epoch": 986.6666666666666, "eval_loss": 1.7871431112289429, "eval_runtime": 0.5918, "eval_samples_per_second": 60.827, "eval_steps_per_second": 15.207, "step": 74000 }, { "epoch": 988.0, "eval_loss": 1.7876148223876953, "eval_runtime": 0.5731, "eval_samples_per_second": 62.818, "eval_steps_per_second": 15.705, "step": 74100 }, { "epoch": 989.3333333333334, "eval_loss": 1.787313461303711, "eval_runtime": 0.5694, "eval_samples_per_second": 63.227, "eval_steps_per_second": 15.807, "step": 74200 }, { "epoch": 990.6666666666666, "eval_loss": 1.788341999053955, "eval_runtime": 0.5615, "eval_samples_per_second": 64.111, "eval_steps_per_second": 16.028, "step": 74300 }, { "epoch": 992.0, "eval_loss": 1.7886956930160522, "eval_runtime": 0.5643, "eval_samples_per_second": 63.795, "eval_steps_per_second": 15.949, "step": 74400 }, { "epoch": 993.3333333333334, "grad_norm": 0.06452155113220215, "learning_rate": 3.333333333333334e-08, "loss": 0.9788, "step": 74500 }, { "epoch": 993.3333333333334, "eval_loss": 1.7889302968978882, "eval_runtime": 0.543, "eval_samples_per_second": 66.294, "eval_steps_per_second": 16.573, "step": 74500 }, { "epoch": 994.6666666666666, "eval_loss": 1.7889537811279297, "eval_runtime": 0.5374, "eval_samples_per_second": 66.984, "eval_steps_per_second": 16.746, "step": 74600 }, { "epoch": 996.0, "eval_loss": 1.7883883714675903, "eval_runtime": 0.5389, "eval_samples_per_second": 66.801, "eval_steps_per_second": 16.7, "step": 74700 }, { "epoch": 997.3333333333334, "eval_loss": 1.7884782552719116, "eval_runtime": 0.547, "eval_samples_per_second": 65.818, "eval_steps_per_second": 16.454, "step": 74800 }, { "epoch": 998.6666666666666, "eval_loss": 1.7885273694992065, "eval_runtime": 0.534, "eval_samples_per_second": 67.42, "eval_steps_per_second": 16.855, "step": 74900 }, { "epoch": 1000.0, "grad_norm": 0.07397235929965973, "learning_rate": 0.0, "loss": 0.9749, "step": 75000 }, { "epoch": 1000.0, "eval_loss": 1.7884608507156372, "eval_runtime": 0.5338, "eval_samples_per_second": 67.445, "eval_steps_per_second": 16.861, "step": 75000 }, { "epoch": 1000.0, "step": 75000, "total_flos": 3.992166672e+16, "train_loss": 0.9881956270345053, "train_runtime": 11653.197, "train_samples_per_second": 25.658, "train_steps_per_second": 6.436 } ], "logging_steps": 500, "max_steps": 75000, "num_input_tokens_seen": 0, "num_train_epochs": 1000, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.992166672e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }