diff --git "a/run-4/checkpoint-620/trainer_state.json" "b/run-4/checkpoint-620/trainer_state.json" new file mode 100644--- /dev/null +++ "b/run-4/checkpoint-620/trainer_state.json" @@ -0,0 +1,4440 @@ +{ + "best_metric": 1.9000149676084739, + "best_model_checkpoint": "./modernBERT-content-regression/run-4/checkpoint-620", + "epoch": 5.0, + "eval_steps": 500, + "global_step": 620, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008064516129032258, + "grad_norm": 312.8161315917969, + "learning_rate": 2.305260772966439e-06, + "loss": 21.3087, + "step": 1 + }, + { + "epoch": 0.016129032258064516, + "grad_norm": 88.59754180908203, + "learning_rate": 2.3015366037047808e-06, + "loss": 1.6705, + "step": 2 + }, + { + "epoch": 0.024193548387096774, + "grad_norm": 525.8513793945312, + "learning_rate": 2.2978124344431226e-06, + "loss": 151.777, + "step": 3 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 207.6021270751953, + "learning_rate": 2.2940882651814644e-06, + "loss": 16.9986, + "step": 4 + }, + { + "epoch": 0.04032258064516129, + "grad_norm": 95.76493072509766, + "learning_rate": 2.2903640959198062e-06, + "loss": 3.2381, + "step": 5 + }, + { + "epoch": 0.04838709677419355, + "grad_norm": 26.680326461791992, + "learning_rate": 2.286639926658148e-06, + "loss": 0.3182, + "step": 6 + }, + { + "epoch": 0.056451612903225805, + "grad_norm": 134.4029083251953, + "learning_rate": 2.28291575739649e-06, + "loss": 7.6237, + "step": 7 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 299.9959716796875, + "learning_rate": 2.2791915881348312e-06, + "loss": 16.5691, + "step": 8 + }, + { + "epoch": 0.07258064516129033, + "grad_norm": 8.101390838623047, + "learning_rate": 2.275467418873173e-06, + "loss": 0.0379, + "step": 9 + }, + { + "epoch": 0.08064516129032258, + "grad_norm": 109.74885559082031, + "learning_rate": 2.271743249611515e-06, + "loss": 3.1775, + "step": 10 + }, + { + "epoch": 0.08870967741935484, + "grad_norm": 204.84671020507812, + "learning_rate": 2.2680190803498567e-06, + "loss": 14.3084, + "step": 11 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 97.9755859375, + "learning_rate": 2.2642949110881985e-06, + "loss": 0.7043, + "step": 12 + }, + { + "epoch": 0.10483870967741936, + "grad_norm": 107.0622787475586, + "learning_rate": 2.2605707418265403e-06, + "loss": 1.1753, + "step": 13 + }, + { + "epoch": 0.11290322580645161, + "grad_norm": 61.44425964355469, + "learning_rate": 2.256846572564882e-06, + "loss": 1.8918, + "step": 14 + }, + { + "epoch": 0.12096774193548387, + "grad_norm": 99.69449615478516, + "learning_rate": 2.253122403303224e-06, + "loss": 0.9187, + "step": 15 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 160.03836059570312, + "learning_rate": 2.2493982340415658e-06, + "loss": 8.8633, + "step": 16 + }, + { + "epoch": 0.13709677419354838, + "grad_norm": 130.26107788085938, + "learning_rate": 2.245674064779907e-06, + "loss": 1.5865, + "step": 17 + }, + { + "epoch": 0.14516129032258066, + "grad_norm": 51.57841491699219, + "learning_rate": 2.241949895518249e-06, + "loss": 1.7055, + "step": 18 + }, + { + "epoch": 0.1532258064516129, + "grad_norm": 119.63735961914062, + "learning_rate": 2.238225726256591e-06, + "loss": 1.5964, + "step": 19 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 218.9719696044922, + "learning_rate": 2.2345015569949326e-06, + "loss": 8.2474, + "step": 20 + }, + { + "epoch": 0.1693548387096774, + "grad_norm": 213.2459716796875, + "learning_rate": 2.2307773877332744e-06, + "loss": 2.9405, + "step": 21 + }, + { + "epoch": 0.1774193548387097, + "grad_norm": 313.0417175292969, + "learning_rate": 2.2270532184716162e-06, + "loss": 4.7883, + "step": 22 + }, + { + "epoch": 0.18548387096774194, + "grad_norm": 178.1278839111328, + "learning_rate": 2.223329049209958e-06, + "loss": 16.9713, + "step": 23 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 340.25732421875, + "learning_rate": 2.2196048799483e-06, + "loss": 5.7798, + "step": 24 + }, + { + "epoch": 0.20161290322580644, + "grad_norm": 145.1334228515625, + "learning_rate": 2.2158807106866417e-06, + "loss": 1.827, + "step": 25 + }, + { + "epoch": 0.20967741935483872, + "grad_norm": 162.4819793701172, + "learning_rate": 2.212156541424983e-06, + "loss": 1.9449, + "step": 26 + }, + { + "epoch": 0.21774193548387097, + "grad_norm": 120.5811538696289, + "learning_rate": 2.208432372163325e-06, + "loss": 3.0724, + "step": 27 + }, + { + "epoch": 0.22580645161290322, + "grad_norm": 184.12374877929688, + "learning_rate": 2.2047082029016667e-06, + "loss": 2.8759, + "step": 28 + }, + { + "epoch": 0.23387096774193547, + "grad_norm": 85.72262573242188, + "learning_rate": 2.200984033640009e-06, + "loss": 1.3223, + "step": 29 + }, + { + "epoch": 0.24193548387096775, + "grad_norm": 45.07551193237305, + "learning_rate": 2.1972598643783508e-06, + "loss": 0.5763, + "step": 30 + }, + { + "epoch": 0.25, + "grad_norm": 17.517133712768555, + "learning_rate": 2.193535695116692e-06, + "loss": 0.9204, + "step": 31 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 67.82421112060547, + "learning_rate": 2.189811525855034e-06, + "loss": 0.9659, + "step": 32 + }, + { + "epoch": 0.2661290322580645, + "grad_norm": 32.993446350097656, + "learning_rate": 2.186087356593376e-06, + "loss": 0.8629, + "step": 33 + }, + { + "epoch": 0.27419354838709675, + "grad_norm": 56.122249603271484, + "learning_rate": 2.1823631873317176e-06, + "loss": 1.5599, + "step": 34 + }, + { + "epoch": 0.28225806451612906, + "grad_norm": 348.0657958984375, + "learning_rate": 2.1786390180700594e-06, + "loss": 24.736, + "step": 35 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 26.41761016845703, + "learning_rate": 2.1749148488084013e-06, + "loss": 0.5115, + "step": 36 + }, + { + "epoch": 0.29838709677419356, + "grad_norm": 109.63630676269531, + "learning_rate": 2.171190679546743e-06, + "loss": 1.4894, + "step": 37 + }, + { + "epoch": 0.3064516129032258, + "grad_norm": 254.22276306152344, + "learning_rate": 2.167466510285085e-06, + "loss": 26.0623, + "step": 38 + }, + { + "epoch": 0.31451612903225806, + "grad_norm": 41.65399932861328, + "learning_rate": 2.1637423410234267e-06, + "loss": 0.3779, + "step": 39 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 450.0879821777344, + "learning_rate": 2.160018171761768e-06, + "loss": 34.1566, + "step": 40 + }, + { + "epoch": 0.33064516129032256, + "grad_norm": 57.8630256652832, + "learning_rate": 2.15629400250011e-06, + "loss": 1.1786, + "step": 41 + }, + { + "epoch": 0.3387096774193548, + "grad_norm": 253.47035217285156, + "learning_rate": 2.1525698332384517e-06, + "loss": 19.6974, + "step": 42 + }, + { + "epoch": 0.3467741935483871, + "grad_norm": 172.5160675048828, + "learning_rate": 2.1488456639767935e-06, + "loss": 27.7034, + "step": 43 + }, + { + "epoch": 0.3548387096774194, + "grad_norm": 25.08224868774414, + "learning_rate": 2.1451214947151354e-06, + "loss": 0.331, + "step": 44 + }, + { + "epoch": 0.3629032258064516, + "grad_norm": 40.45454788208008, + "learning_rate": 2.141397325453477e-06, + "loss": 1.2259, + "step": 45 + }, + { + "epoch": 0.3709677419354839, + "grad_norm": 62.238983154296875, + "learning_rate": 2.137673156191819e-06, + "loss": 2.3635, + "step": 46 + }, + { + "epoch": 0.3790322580645161, + "grad_norm": 40.89179229736328, + "learning_rate": 2.133948986930161e-06, + "loss": 2.3359, + "step": 47 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 143.2526397705078, + "learning_rate": 2.1302248176685026e-06, + "loss": 1.7717, + "step": 48 + }, + { + "epoch": 0.3951612903225806, + "grad_norm": 97.91305541992188, + "learning_rate": 2.1265006484068444e-06, + "loss": 0.9276, + "step": 49 + }, + { + "epoch": 0.4032258064516129, + "grad_norm": 146.67626953125, + "learning_rate": 2.122776479145186e-06, + "loss": 7.5945, + "step": 50 + }, + { + "epoch": 0.4112903225806452, + "grad_norm": 132.352294921875, + "learning_rate": 2.1190523098835277e-06, + "loss": 1.4938, + "step": 51 + }, + { + "epoch": 0.41935483870967744, + "grad_norm": 167.54727172851562, + "learning_rate": 2.1153281406218695e-06, + "loss": 0.3165, + "step": 52 + }, + { + "epoch": 0.4274193548387097, + "grad_norm": 300.7011413574219, + "learning_rate": 2.1116039713602113e-06, + "loss": 13.5344, + "step": 53 + }, + { + "epoch": 0.43548387096774194, + "grad_norm": 63.001609802246094, + "learning_rate": 2.1078798020985535e-06, + "loss": 1.0206, + "step": 54 + }, + { + "epoch": 0.4435483870967742, + "grad_norm": 37.191322326660156, + "learning_rate": 2.104155632836895e-06, + "loss": 4.3386, + "step": 55 + }, + { + "epoch": 0.45161290322580644, + "grad_norm": 42.49811935424805, + "learning_rate": 2.1004314635752367e-06, + "loss": 0.7406, + "step": 56 + }, + { + "epoch": 0.4596774193548387, + "grad_norm": 106.07703399658203, + "learning_rate": 2.0967072943135785e-06, + "loss": 1.2151, + "step": 57 + }, + { + "epoch": 0.46774193548387094, + "grad_norm": 204.26516723632812, + "learning_rate": 2.0929831250519204e-06, + "loss": 7.2585, + "step": 58 + }, + { + "epoch": 0.47580645161290325, + "grad_norm": 70.08997344970703, + "learning_rate": 2.089258955790262e-06, + "loss": 0.737, + "step": 59 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 213.41036987304688, + "learning_rate": 2.085534786528604e-06, + "loss": 14.7889, + "step": 60 + }, + { + "epoch": 0.49193548387096775, + "grad_norm": 98.5826644897461, + "learning_rate": 2.081810617266946e-06, + "loss": 0.8851, + "step": 61 + }, + { + "epoch": 0.5, + "grad_norm": 118.79777526855469, + "learning_rate": 2.0780864480052876e-06, + "loss": 23.6807, + "step": 62 + }, + { + "epoch": 0.5080645161290323, + "grad_norm": 118.90921783447266, + "learning_rate": 2.0743622787436294e-06, + "loss": 1.4873, + "step": 63 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 45.01438903808594, + "learning_rate": 2.070638109481971e-06, + "loss": 2.5075, + "step": 64 + }, + { + "epoch": 0.5241935483870968, + "grad_norm": 143.5907440185547, + "learning_rate": 2.0669139402203127e-06, + "loss": 14.8131, + "step": 65 + }, + { + "epoch": 0.532258064516129, + "grad_norm": 58.62938690185547, + "learning_rate": 2.0631897709586545e-06, + "loss": 1.2206, + "step": 66 + }, + { + "epoch": 0.5403225806451613, + "grad_norm": 83.41098022460938, + "learning_rate": 2.0594656016969963e-06, + "loss": 1.4715, + "step": 67 + }, + { + "epoch": 0.5483870967741935, + "grad_norm": 72.7550048828125, + "learning_rate": 2.055741432435338e-06, + "loss": 0.8449, + "step": 68 + }, + { + "epoch": 0.5564516129032258, + "grad_norm": 99.98751831054688, + "learning_rate": 2.05201726317368e-06, + "loss": 1.5294, + "step": 69 + }, + { + "epoch": 0.5645161290322581, + "grad_norm": 32.452632904052734, + "learning_rate": 2.0482930939120217e-06, + "loss": 0.3351, + "step": 70 + }, + { + "epoch": 0.5725806451612904, + "grad_norm": 87.99835205078125, + "learning_rate": 2.0445689246503636e-06, + "loss": 2.0365, + "step": 71 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 263.8376770019531, + "learning_rate": 2.0408447553887054e-06, + "loss": 15.161, + "step": 72 + }, + { + "epoch": 0.5887096774193549, + "grad_norm": 39.59327697753906, + "learning_rate": 2.0371205861270468e-06, + "loss": 0.6412, + "step": 73 + }, + { + "epoch": 0.5967741935483871, + "grad_norm": 184.743896484375, + "learning_rate": 2.0333964168653886e-06, + "loss": 2.4842, + "step": 74 + }, + { + "epoch": 0.6048387096774194, + "grad_norm": 118.40076446533203, + "learning_rate": 2.0296722476037304e-06, + "loss": 15.6392, + "step": 75 + }, + { + "epoch": 0.6129032258064516, + "grad_norm": 42.980979919433594, + "learning_rate": 2.0259480783420722e-06, + "loss": 0.2714, + "step": 76 + }, + { + "epoch": 0.6209677419354839, + "grad_norm": 78.06953430175781, + "learning_rate": 2.022223909080414e-06, + "loss": 0.5073, + "step": 77 + }, + { + "epoch": 0.6290322580645161, + "grad_norm": 124.31591796875, + "learning_rate": 2.018499739818756e-06, + "loss": 12.7986, + "step": 78 + }, + { + "epoch": 0.6370967741935484, + "grad_norm": 179.42996215820312, + "learning_rate": 2.0147755705570977e-06, + "loss": 22.7291, + "step": 79 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 24.730609893798828, + "learning_rate": 2.0110514012954395e-06, + "loss": 1.038, + "step": 80 + }, + { + "epoch": 0.6532258064516129, + "grad_norm": 120.6009292602539, + "learning_rate": 2.0073272320337813e-06, + "loss": 1.043, + "step": 81 + }, + { + "epoch": 0.6612903225806451, + "grad_norm": 211.92623901367188, + "learning_rate": 2.003603062772123e-06, + "loss": 2.2342, + "step": 82 + }, + { + "epoch": 0.6693548387096774, + "grad_norm": 17.749357223510742, + "learning_rate": 1.9998788935104645e-06, + "loss": 0.2666, + "step": 83 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 23.90791893005371, + "learning_rate": 1.9961547242488063e-06, + "loss": 0.9203, + "step": 84 + }, + { + "epoch": 0.6854838709677419, + "grad_norm": 304.3386535644531, + "learning_rate": 1.9924305549871486e-06, + "loss": 18.5165, + "step": 85 + }, + { + "epoch": 0.6935483870967742, + "grad_norm": 115.4675521850586, + "learning_rate": 1.9887063857254904e-06, + "loss": 14.8705, + "step": 86 + }, + { + "epoch": 0.7016129032258065, + "grad_norm": 121.4536361694336, + "learning_rate": 1.9849822164638318e-06, + "loss": 1.6822, + "step": 87 + }, + { + "epoch": 0.7096774193548387, + "grad_norm": 75.23461151123047, + "learning_rate": 1.9812580472021736e-06, + "loss": 0.75, + "step": 88 + }, + { + "epoch": 0.717741935483871, + "grad_norm": 109.62806701660156, + "learning_rate": 1.9775338779405154e-06, + "loss": 1.6612, + "step": 89 + }, + { + "epoch": 0.7258064516129032, + "grad_norm": 133.42919921875, + "learning_rate": 1.9738097086788572e-06, + "loss": 1.6338, + "step": 90 + }, + { + "epoch": 0.7338709677419355, + "grad_norm": 318.0464172363281, + "learning_rate": 1.970085539417199e-06, + "loss": 19.9659, + "step": 91 + }, + { + "epoch": 0.7419354838709677, + "grad_norm": 143.49618530273438, + "learning_rate": 1.966361370155541e-06, + "loss": 6.968, + "step": 92 + }, + { + "epoch": 0.75, + "grad_norm": 41.44537353515625, + "learning_rate": 1.9626372008938827e-06, + "loss": 0.273, + "step": 93 + }, + { + "epoch": 0.7580645161290323, + "grad_norm": 122.76979064941406, + "learning_rate": 1.9589130316322245e-06, + "loss": 1.6946, + "step": 94 + }, + { + "epoch": 0.7661290322580645, + "grad_norm": 233.6233673095703, + "learning_rate": 1.9551888623705663e-06, + "loss": 9.8987, + "step": 95 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 103.65068817138672, + "learning_rate": 1.951464693108908e-06, + "loss": 1.8381, + "step": 96 + }, + { + "epoch": 0.782258064516129, + "grad_norm": 79.15303802490234, + "learning_rate": 1.9477405238472495e-06, + "loss": 0.8288, + "step": 97 + }, + { + "epoch": 0.7903225806451613, + "grad_norm": 77.49793243408203, + "learning_rate": 1.9440163545855913e-06, + "loss": 0.9571, + "step": 98 + }, + { + "epoch": 0.7983870967741935, + "grad_norm": 89.57393646240234, + "learning_rate": 1.940292185323933e-06, + "loss": 1.1071, + "step": 99 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 274.893798828125, + "learning_rate": 1.936568016062275e-06, + "loss": 19.8978, + "step": 100 + }, + { + "epoch": 0.8145161290322581, + "grad_norm": 44.62432098388672, + "learning_rate": 1.9328438468006168e-06, + "loss": 0.8877, + "step": 101 + }, + { + "epoch": 0.8225806451612904, + "grad_norm": 103.63294219970703, + "learning_rate": 1.9291196775389586e-06, + "loss": 0.6422, + "step": 102 + }, + { + "epoch": 0.8306451612903226, + "grad_norm": 118.71654510498047, + "learning_rate": 1.9253955082773004e-06, + "loss": 11.9281, + "step": 103 + }, + { + "epoch": 0.8387096774193549, + "grad_norm": 22.929773330688477, + "learning_rate": 1.9216713390156422e-06, + "loss": 0.5554, + "step": 104 + }, + { + "epoch": 0.8467741935483871, + "grad_norm": 164.58189392089844, + "learning_rate": 1.917947169753984e-06, + "loss": 1.0632, + "step": 105 + }, + { + "epoch": 0.8548387096774194, + "grad_norm": 131.65518188476562, + "learning_rate": 1.9142230004923254e-06, + "loss": 3.3713, + "step": 106 + }, + { + "epoch": 0.8629032258064516, + "grad_norm": 103.23025512695312, + "learning_rate": 1.9104988312306673e-06, + "loss": 2.2805, + "step": 107 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 43.43564224243164, + "learning_rate": 1.9067746619690093e-06, + "loss": 0.5328, + "step": 108 + }, + { + "epoch": 0.8790322580645161, + "grad_norm": 109.90870666503906, + "learning_rate": 1.903050492707351e-06, + "loss": 1.4934, + "step": 109 + }, + { + "epoch": 0.8870967741935484, + "grad_norm": 12.80819034576416, + "learning_rate": 1.899326323445693e-06, + "loss": 0.8101, + "step": 110 + }, + { + "epoch": 0.8951612903225806, + "grad_norm": 223.2318115234375, + "learning_rate": 1.8956021541840345e-06, + "loss": 4.6677, + "step": 111 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 18.64436912536621, + "learning_rate": 1.8918779849223763e-06, + "loss": 1.0659, + "step": 112 + }, + { + "epoch": 0.9112903225806451, + "grad_norm": 24.009183883666992, + "learning_rate": 1.8881538156607181e-06, + "loss": 0.2875, + "step": 113 + }, + { + "epoch": 0.9193548387096774, + "grad_norm": 21.12612533569336, + "learning_rate": 1.88442964639906e-06, + "loss": 0.7339, + "step": 114 + }, + { + "epoch": 0.9274193548387096, + "grad_norm": 187.74147033691406, + "learning_rate": 1.8807054771374018e-06, + "loss": 6.5702, + "step": 115 + }, + { + "epoch": 0.9354838709677419, + "grad_norm": 62.3062744140625, + "learning_rate": 1.8769813078757434e-06, + "loss": 1.5116, + "step": 116 + }, + { + "epoch": 0.9435483870967742, + "grad_norm": 57.31589126586914, + "learning_rate": 1.8732571386140852e-06, + "loss": 0.5057, + "step": 117 + }, + { + "epoch": 0.9516129032258065, + "grad_norm": 136.85366821289062, + "learning_rate": 1.869532969352427e-06, + "loss": 14.2211, + "step": 118 + }, + { + "epoch": 0.9596774193548387, + "grad_norm": 144.6485137939453, + "learning_rate": 1.8658088000907688e-06, + "loss": 1.5384, + "step": 119 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 25.256959915161133, + "learning_rate": 1.8620846308291104e-06, + "loss": 1.369, + "step": 120 + }, + { + "epoch": 0.9758064516129032, + "grad_norm": 82.7289047241211, + "learning_rate": 1.8583604615674523e-06, + "loss": 0.8961, + "step": 121 + }, + { + "epoch": 0.9838709677419355, + "grad_norm": 14.033979415893555, + "learning_rate": 1.854636292305794e-06, + "loss": 0.4005, + "step": 122 + }, + { + "epoch": 0.9919354838709677, + "grad_norm": 33.84238052368164, + "learning_rate": 1.8509121230441359e-06, + "loss": 0.3, + "step": 123 + }, + { + "epoch": 1.0, + "grad_norm": 44.46285629272461, + "learning_rate": 1.8471879537824777e-06, + "loss": 0.0856, + "step": 124 + }, + { + "epoch": 1.0, + "eval_loss": 3.761341094970703, + "eval_mae": 1.156431794166565, + "eval_mse": 3.761340856552124, + "eval_r2": -0.02978968620300293, + "eval_rmse": 1.9394176591317622, + "eval_runtime": 1.3457, + "eval_samples_per_second": 40.87, + "eval_smape": 46.60126864910126, + "eval_steps_per_second": 10.403, + "step": 124 + }, + { + "epoch": 1.0080645161290323, + "grad_norm": 31.391159057617188, + "learning_rate": 1.8434637845208193e-06, + "loss": 0.669, + "step": 125 + }, + { + "epoch": 1.0161290322580645, + "grad_norm": 68.69688415527344, + "learning_rate": 1.8397396152591611e-06, + "loss": 0.7357, + "step": 126 + }, + { + "epoch": 1.0241935483870968, + "grad_norm": 149.95779418945312, + "learning_rate": 1.8360154459975032e-06, + "loss": 6.2925, + "step": 127 + }, + { + "epoch": 1.032258064516129, + "grad_norm": 29.75308609008789, + "learning_rate": 1.832291276735845e-06, + "loss": 0.5026, + "step": 128 + }, + { + "epoch": 1.0403225806451613, + "grad_norm": 31.049043655395508, + "learning_rate": 1.8285671074741868e-06, + "loss": 0.8776, + "step": 129 + }, + { + "epoch": 1.0483870967741935, + "grad_norm": 50.398834228515625, + "learning_rate": 1.8248429382125284e-06, + "loss": 2.3401, + "step": 130 + }, + { + "epoch": 1.0564516129032258, + "grad_norm": 28.888776779174805, + "learning_rate": 1.8211187689508702e-06, + "loss": 0.1173, + "step": 131 + }, + { + "epoch": 1.064516129032258, + "grad_norm": 161.4318389892578, + "learning_rate": 1.817394599689212e-06, + "loss": 5.2758, + "step": 132 + }, + { + "epoch": 1.0725806451612903, + "grad_norm": 61.20700454711914, + "learning_rate": 1.8136704304275538e-06, + "loss": 0.5886, + "step": 133 + }, + { + "epoch": 1.0806451612903225, + "grad_norm": 61.833534240722656, + "learning_rate": 1.8099462611658954e-06, + "loss": 0.5642, + "step": 134 + }, + { + "epoch": 1.0887096774193548, + "grad_norm": 181.53607177734375, + "learning_rate": 1.8062220919042373e-06, + "loss": 7.6422, + "step": 135 + }, + { + "epoch": 1.096774193548387, + "grad_norm": 244.12063598632812, + "learning_rate": 1.802497922642579e-06, + "loss": 16.4538, + "step": 136 + }, + { + "epoch": 1.1048387096774193, + "grad_norm": 203.72213745117188, + "learning_rate": 1.798773753380921e-06, + "loss": 10.7823, + "step": 137 + }, + { + "epoch": 1.1129032258064515, + "grad_norm": 12.267001152038574, + "learning_rate": 1.7950495841192627e-06, + "loss": 0.3536, + "step": 138 + }, + { + "epoch": 1.120967741935484, + "grad_norm": 407.8166809082031, + "learning_rate": 1.7913254148576043e-06, + "loss": 45.116, + "step": 139 + }, + { + "epoch": 1.129032258064516, + "grad_norm": 27.063257217407227, + "learning_rate": 1.7876012455959461e-06, + "loss": 0.077, + "step": 140 + }, + { + "epoch": 1.1370967741935485, + "grad_norm": 44.98855972290039, + "learning_rate": 1.783877076334288e-06, + "loss": 1.1241, + "step": 141 + }, + { + "epoch": 1.1451612903225807, + "grad_norm": 40.457061767578125, + "learning_rate": 1.7801529070726298e-06, + "loss": 0.9006, + "step": 142 + }, + { + "epoch": 1.153225806451613, + "grad_norm": 34.76171112060547, + "learning_rate": 1.7764287378109716e-06, + "loss": 0.2299, + "step": 143 + }, + { + "epoch": 1.1612903225806452, + "grad_norm": 28.188255310058594, + "learning_rate": 1.7727045685493132e-06, + "loss": 0.2326, + "step": 144 + }, + { + "epoch": 1.1693548387096775, + "grad_norm": 37.98823547363281, + "learning_rate": 1.768980399287655e-06, + "loss": 0.5014, + "step": 145 + }, + { + "epoch": 1.1774193548387097, + "grad_norm": 40.45034408569336, + "learning_rate": 1.7652562300259968e-06, + "loss": 2.0889, + "step": 146 + }, + { + "epoch": 1.185483870967742, + "grad_norm": 44.3226432800293, + "learning_rate": 1.7615320607643386e-06, + "loss": 0.4977, + "step": 147 + }, + { + "epoch": 1.1935483870967742, + "grad_norm": 29.797224044799805, + "learning_rate": 1.7578078915026802e-06, + "loss": 1.1666, + "step": 148 + }, + { + "epoch": 1.2016129032258065, + "grad_norm": 291.9981689453125, + "learning_rate": 1.754083722241022e-06, + "loss": 17.1074, + "step": 149 + }, + { + "epoch": 1.2096774193548387, + "grad_norm": 344.50335693359375, + "learning_rate": 1.7503595529793639e-06, + "loss": 35.2006, + "step": 150 + }, + { + "epoch": 1.217741935483871, + "grad_norm": 186.88414001464844, + "learning_rate": 1.7466353837177057e-06, + "loss": 16.7039, + "step": 151 + }, + { + "epoch": 1.2258064516129032, + "grad_norm": 192.2663116455078, + "learning_rate": 1.7429112144560475e-06, + "loss": 1.0027, + "step": 152 + }, + { + "epoch": 1.2338709677419355, + "grad_norm": 125.88935089111328, + "learning_rate": 1.7391870451943891e-06, + "loss": 8.6522, + "step": 153 + }, + { + "epoch": 1.2419354838709677, + "grad_norm": 282.10894775390625, + "learning_rate": 1.735462875932731e-06, + "loss": 11.9202, + "step": 154 + }, + { + "epoch": 1.25, + "grad_norm": 87.92141723632812, + "learning_rate": 1.7317387066710727e-06, + "loss": 0.8521, + "step": 155 + }, + { + "epoch": 1.2580645161290323, + "grad_norm": 123.09964752197266, + "learning_rate": 1.7280145374094148e-06, + "loss": 1.2827, + "step": 156 + }, + { + "epoch": 1.2661290322580645, + "grad_norm": 137.37051391601562, + "learning_rate": 1.7242903681477566e-06, + "loss": 2.6597, + "step": 157 + }, + { + "epoch": 1.2741935483870968, + "grad_norm": 298.48602294921875, + "learning_rate": 1.7205661988860982e-06, + "loss": 12.6311, + "step": 158 + }, + { + "epoch": 1.282258064516129, + "grad_norm": 1922.1727294921875, + "learning_rate": 1.71684202962444e-06, + "loss": 22.0268, + "step": 159 + }, + { + "epoch": 1.2903225806451613, + "grad_norm": 130.57371520996094, + "learning_rate": 1.7131178603627818e-06, + "loss": 9.7016, + "step": 160 + }, + { + "epoch": 1.2983870967741935, + "grad_norm": 325.2424011230469, + "learning_rate": 1.7093936911011236e-06, + "loss": 37.0139, + "step": 161 + }, + { + "epoch": 1.3064516129032258, + "grad_norm": 180.130615234375, + "learning_rate": 1.7056695218394655e-06, + "loss": 1.6304, + "step": 162 + }, + { + "epoch": 1.314516129032258, + "grad_norm": 190.7817840576172, + "learning_rate": 1.701945352577807e-06, + "loss": 1.9045, + "step": 163 + }, + { + "epoch": 1.3225806451612903, + "grad_norm": 175.99176025390625, + "learning_rate": 1.6982211833161489e-06, + "loss": 2.5629, + "step": 164 + }, + { + "epoch": 1.3306451612903225, + "grad_norm": 416.7662353515625, + "learning_rate": 1.6944970140544907e-06, + "loss": 5.4869, + "step": 165 + }, + { + "epoch": 1.3387096774193548, + "grad_norm": 271.3612060546875, + "learning_rate": 1.6907728447928325e-06, + "loss": 3.779, + "step": 166 + }, + { + "epoch": 1.346774193548387, + "grad_norm": 48.5273323059082, + "learning_rate": 1.6870486755311741e-06, + "loss": 1.5463, + "step": 167 + }, + { + "epoch": 1.3548387096774195, + "grad_norm": 139.34823608398438, + "learning_rate": 1.683324506269516e-06, + "loss": 1.3512, + "step": 168 + }, + { + "epoch": 1.3629032258064515, + "grad_norm": 262.9454650878906, + "learning_rate": 1.6796003370078577e-06, + "loss": 4.0148, + "step": 169 + }, + { + "epoch": 1.370967741935484, + "grad_norm": 1780.4842529296875, + "learning_rate": 1.6758761677461996e-06, + "loss": 125.1284, + "step": 170 + }, + { + "epoch": 1.379032258064516, + "grad_norm": 82.82109069824219, + "learning_rate": 1.6721519984845414e-06, + "loss": 1.0954, + "step": 171 + }, + { + "epoch": 1.3870967741935485, + "grad_norm": 317.63446044921875, + "learning_rate": 1.668427829222883e-06, + "loss": 5.8565, + "step": 172 + }, + { + "epoch": 1.3951612903225805, + "grad_norm": 186.61788940429688, + "learning_rate": 1.6647036599612248e-06, + "loss": 2.093, + "step": 173 + }, + { + "epoch": 1.403225806451613, + "grad_norm": 192.48291015625, + "learning_rate": 1.6609794906995666e-06, + "loss": 8.7957, + "step": 174 + }, + { + "epoch": 1.4112903225806452, + "grad_norm": 83.75372314453125, + "learning_rate": 1.6572553214379084e-06, + "loss": 0.6524, + "step": 175 + }, + { + "epoch": 1.4193548387096775, + "grad_norm": 387.80767822265625, + "learning_rate": 1.6535311521762503e-06, + "loss": 14.9674, + "step": 176 + }, + { + "epoch": 1.4274193548387097, + "grad_norm": 83.91477966308594, + "learning_rate": 1.6498069829145919e-06, + "loss": 0.8564, + "step": 177 + }, + { + "epoch": 1.435483870967742, + "grad_norm": 138.35964965820312, + "learning_rate": 1.6460828136529337e-06, + "loss": 1.4829, + "step": 178 + }, + { + "epoch": 1.4435483870967742, + "grad_norm": 137.02264404296875, + "learning_rate": 1.6423586443912755e-06, + "loss": 2.6322, + "step": 179 + }, + { + "epoch": 1.4516129032258065, + "grad_norm": 164.8358154296875, + "learning_rate": 1.6386344751296173e-06, + "loss": 1.4194, + "step": 180 + }, + { + "epoch": 1.4596774193548387, + "grad_norm": 109.77494812011719, + "learning_rate": 1.634910305867959e-06, + "loss": 0.9658, + "step": 181 + }, + { + "epoch": 1.467741935483871, + "grad_norm": 44.13063430786133, + "learning_rate": 1.6311861366063007e-06, + "loss": 0.7388, + "step": 182 + }, + { + "epoch": 1.4758064516129032, + "grad_norm": 88.6048355102539, + "learning_rate": 1.6274619673446425e-06, + "loss": 1.3648, + "step": 183 + }, + { + "epoch": 1.4838709677419355, + "grad_norm": 123.55364990234375, + "learning_rate": 1.6237377980829846e-06, + "loss": 3.6786, + "step": 184 + }, + { + "epoch": 1.4919354838709677, + "grad_norm": 113.06607818603516, + "learning_rate": 1.6200136288213264e-06, + "loss": 1.6744, + "step": 185 + }, + { + "epoch": 1.5, + "grad_norm": 32.589027404785156, + "learning_rate": 1.616289459559668e-06, + "loss": 0.4416, + "step": 186 + }, + { + "epoch": 1.5080645161290323, + "grad_norm": 161.1331329345703, + "learning_rate": 1.6125652902980098e-06, + "loss": 4.6922, + "step": 187 + }, + { + "epoch": 1.5161290322580645, + "grad_norm": 59.624202728271484, + "learning_rate": 1.6088411210363516e-06, + "loss": 1.7379, + "step": 188 + }, + { + "epoch": 1.5241935483870968, + "grad_norm": 125.68768310546875, + "learning_rate": 1.6051169517746934e-06, + "loss": 2.1186, + "step": 189 + }, + { + "epoch": 1.532258064516129, + "grad_norm": 47.66250991821289, + "learning_rate": 1.6013927825130353e-06, + "loss": 0.5983, + "step": 190 + }, + { + "epoch": 1.5403225806451613, + "grad_norm": 49.23503112792969, + "learning_rate": 1.5976686132513769e-06, + "loss": 0.6384, + "step": 191 + }, + { + "epoch": 1.5483870967741935, + "grad_norm": 30.719045639038086, + "learning_rate": 1.5939444439897187e-06, + "loss": 0.4656, + "step": 192 + }, + { + "epoch": 1.5564516129032258, + "grad_norm": 42.6551513671875, + "learning_rate": 1.5902202747280605e-06, + "loss": 0.8564, + "step": 193 + }, + { + "epoch": 1.564516129032258, + "grad_norm": 50.98031234741211, + "learning_rate": 1.5864961054664023e-06, + "loss": 0.6173, + "step": 194 + }, + { + "epoch": 1.5725806451612905, + "grad_norm": 45.17729568481445, + "learning_rate": 1.582771936204744e-06, + "loss": 0.368, + "step": 195 + }, + { + "epoch": 1.5806451612903225, + "grad_norm": 266.4801330566406, + "learning_rate": 1.5790477669430857e-06, + "loss": 7.7041, + "step": 196 + }, + { + "epoch": 1.588709677419355, + "grad_norm": 53.33144760131836, + "learning_rate": 1.5753235976814275e-06, + "loss": 1.2489, + "step": 197 + }, + { + "epoch": 1.596774193548387, + "grad_norm": 13.414701461791992, + "learning_rate": 1.5715994284197694e-06, + "loss": 0.5211, + "step": 198 + }, + { + "epoch": 1.6048387096774195, + "grad_norm": 29.101728439331055, + "learning_rate": 1.5678752591581112e-06, + "loss": 0.1087, + "step": 199 + }, + { + "epoch": 1.6129032258064515, + "grad_norm": 103.00382995605469, + "learning_rate": 1.5641510898964528e-06, + "loss": 0.6299, + "step": 200 + }, + { + "epoch": 1.620967741935484, + "grad_norm": 21.447538375854492, + "learning_rate": 1.5604269206347946e-06, + "loss": 0.1628, + "step": 201 + }, + { + "epoch": 1.629032258064516, + "grad_norm": 69.59345245361328, + "learning_rate": 1.5567027513731364e-06, + "loss": 1.4756, + "step": 202 + }, + { + "epoch": 1.6370967741935485, + "grad_norm": 25.250011444091797, + "learning_rate": 1.5529785821114782e-06, + "loss": 0.1545, + "step": 203 + }, + { + "epoch": 1.6451612903225805, + "grad_norm": 60.4410400390625, + "learning_rate": 1.54925441284982e-06, + "loss": 0.5283, + "step": 204 + }, + { + "epoch": 1.653225806451613, + "grad_norm": 234.9252471923828, + "learning_rate": 1.5455302435881617e-06, + "loss": 16.5494, + "step": 205 + }, + { + "epoch": 1.661290322580645, + "grad_norm": 79.00119018554688, + "learning_rate": 1.5418060743265035e-06, + "loss": 0.3539, + "step": 206 + }, + { + "epoch": 1.6693548387096775, + "grad_norm": 37.666534423828125, + "learning_rate": 1.5380819050648453e-06, + "loss": 0.5521, + "step": 207 + }, + { + "epoch": 1.6774193548387095, + "grad_norm": 32.151283264160156, + "learning_rate": 1.5343577358031871e-06, + "loss": 0.3343, + "step": 208 + }, + { + "epoch": 1.685483870967742, + "grad_norm": 23.681564331054688, + "learning_rate": 1.5306335665415291e-06, + "loss": 0.4023, + "step": 209 + }, + { + "epoch": 1.6935483870967742, + "grad_norm": 51.292144775390625, + "learning_rate": 1.5269093972798705e-06, + "loss": 0.3559, + "step": 210 + }, + { + "epoch": 1.7016129032258065, + "grad_norm": 126.73843383789062, + "learning_rate": 1.5231852280182123e-06, + "loss": 2.4061, + "step": 211 + }, + { + "epoch": 1.7096774193548387, + "grad_norm": 57.402000427246094, + "learning_rate": 1.5194610587565544e-06, + "loss": 1.1266, + "step": 212 + }, + { + "epoch": 1.717741935483871, + "grad_norm": 44.26993942260742, + "learning_rate": 1.5157368894948962e-06, + "loss": 0.1871, + "step": 213 + }, + { + "epoch": 1.7258064516129032, + "grad_norm": 331.17962646484375, + "learning_rate": 1.5120127202332378e-06, + "loss": 24.3463, + "step": 214 + }, + { + "epoch": 1.7338709677419355, + "grad_norm": 202.72023010253906, + "learning_rate": 1.5082885509715796e-06, + "loss": 16.1255, + "step": 215 + }, + { + "epoch": 1.7419354838709677, + "grad_norm": 140.6495819091797, + "learning_rate": 1.5045643817099214e-06, + "loss": 1.2096, + "step": 216 + }, + { + "epoch": 1.75, + "grad_norm": 305.29217529296875, + "learning_rate": 1.5008402124482632e-06, + "loss": 6.5256, + "step": 217 + }, + { + "epoch": 1.7580645161290323, + "grad_norm": 100.94656372070312, + "learning_rate": 1.497116043186605e-06, + "loss": 0.1109, + "step": 218 + }, + { + "epoch": 1.7661290322580645, + "grad_norm": 426.759521484375, + "learning_rate": 1.4933918739249467e-06, + "loss": 12.6703, + "step": 219 + }, + { + "epoch": 1.7741935483870968, + "grad_norm": 24.68358039855957, + "learning_rate": 1.4896677046632885e-06, + "loss": 0.1721, + "step": 220 + }, + { + "epoch": 1.782258064516129, + "grad_norm": 49.39876174926758, + "learning_rate": 1.4859435354016303e-06, + "loss": 0.3496, + "step": 221 + }, + { + "epoch": 1.7903225806451613, + "grad_norm": 294.5503845214844, + "learning_rate": 1.4822193661399721e-06, + "loss": 0.629, + "step": 222 + }, + { + "epoch": 1.7983870967741935, + "grad_norm": 7.117827415466309, + "learning_rate": 1.478495196878314e-06, + "loss": 0.0899, + "step": 223 + }, + { + "epoch": 1.8064516129032258, + "grad_norm": 274.92755126953125, + "learning_rate": 1.4747710276166555e-06, + "loss": 17.6147, + "step": 224 + }, + { + "epoch": 1.814516129032258, + "grad_norm": 445.4465026855469, + "learning_rate": 1.4710468583549973e-06, + "loss": 5.6809, + "step": 225 + }, + { + "epoch": 1.8225806451612905, + "grad_norm": 42.991477966308594, + "learning_rate": 1.4673226890933392e-06, + "loss": 1.7588, + "step": 226 + }, + { + "epoch": 1.8306451612903225, + "grad_norm": 113.24928283691406, + "learning_rate": 1.463598519831681e-06, + "loss": 0.8243, + "step": 227 + }, + { + "epoch": 1.838709677419355, + "grad_norm": 121.32020568847656, + "learning_rate": 1.4598743505700226e-06, + "loss": 1.5972, + "step": 228 + }, + { + "epoch": 1.846774193548387, + "grad_norm": 52.4281005859375, + "learning_rate": 1.4561501813083644e-06, + "loss": 0.1293, + "step": 229 + }, + { + "epoch": 1.8548387096774195, + "grad_norm": 232.2115936279297, + "learning_rate": 1.4524260120467062e-06, + "loss": 6.8061, + "step": 230 + }, + { + "epoch": 1.8629032258064515, + "grad_norm": 178.65858459472656, + "learning_rate": 1.448701842785048e-06, + "loss": 1.2484, + "step": 231 + }, + { + "epoch": 1.870967741935484, + "grad_norm": 20.669795989990234, + "learning_rate": 1.4449776735233899e-06, + "loss": 0.6529, + "step": 232 + }, + { + "epoch": 1.879032258064516, + "grad_norm": 65.12823486328125, + "learning_rate": 1.4412535042617315e-06, + "loss": 0.7871, + "step": 233 + }, + { + "epoch": 1.8870967741935485, + "grad_norm": 19.540246963500977, + "learning_rate": 1.4375293350000733e-06, + "loss": 0.2503, + "step": 234 + }, + { + "epoch": 1.8951612903225805, + "grad_norm": 37.89284133911133, + "learning_rate": 1.433805165738415e-06, + "loss": 0.378, + "step": 235 + }, + { + "epoch": 1.903225806451613, + "grad_norm": 282.1019592285156, + "learning_rate": 1.430080996476757e-06, + "loss": 8.7697, + "step": 236 + }, + { + "epoch": 1.911290322580645, + "grad_norm": 80.56645965576172, + "learning_rate": 1.426356827215099e-06, + "loss": 6.0227, + "step": 237 + }, + { + "epoch": 1.9193548387096775, + "grad_norm": 38.571327209472656, + "learning_rate": 1.4226326579534403e-06, + "loss": 0.6088, + "step": 238 + }, + { + "epoch": 1.9274193548387095, + "grad_norm": 59.85017395019531, + "learning_rate": 1.4189084886917821e-06, + "loss": 0.6164, + "step": 239 + }, + { + "epoch": 1.935483870967742, + "grad_norm": 112.14440155029297, + "learning_rate": 1.4151843194301242e-06, + "loss": 1.8763, + "step": 240 + }, + { + "epoch": 1.9435483870967742, + "grad_norm": 99.5566177368164, + "learning_rate": 1.411460150168466e-06, + "loss": 0.8691, + "step": 241 + }, + { + "epoch": 1.9516129032258065, + "grad_norm": 49.559993743896484, + "learning_rate": 1.4077359809068074e-06, + "loss": 1.2076, + "step": 242 + }, + { + "epoch": 1.9596774193548387, + "grad_norm": 172.5569305419922, + "learning_rate": 1.4040118116451494e-06, + "loss": 1.7824, + "step": 243 + }, + { + "epoch": 1.967741935483871, + "grad_norm": 334.1643981933594, + "learning_rate": 1.4002876423834912e-06, + "loss": 4.1051, + "step": 244 + }, + { + "epoch": 1.9758064516129032, + "grad_norm": 71.8653564453125, + "learning_rate": 1.396563473121833e-06, + "loss": 0.7466, + "step": 245 + }, + { + "epoch": 1.9838709677419355, + "grad_norm": 295.7119445800781, + "learning_rate": 1.3928393038601749e-06, + "loss": 4.5561, + "step": 246 + }, + { + "epoch": 1.9919354838709677, + "grad_norm": 332.00482177734375, + "learning_rate": 1.3891151345985165e-06, + "loss": 1.4417, + "step": 247 + }, + { + "epoch": 2.0, + "grad_norm": 193.66639709472656, + "learning_rate": 1.3853909653368583e-06, + "loss": 2.9134, + "step": 248 + }, + { + "epoch": 2.0, + "eval_loss": 3.7568323612213135, + "eval_mae": 1.2384544610977173, + "eval_mse": 3.756831407546997, + "eval_r2": -0.02855515480041504, + "eval_rmse": 1.9382547323680128, + "eval_runtime": 1.4194, + "eval_samples_per_second": 38.75, + "eval_smape": 49.87496733665466, + "eval_steps_per_second": 9.864, + "step": 248 + }, + { + "epoch": 2.0080645161290325, + "grad_norm": 177.89308166503906, + "learning_rate": 1.3816667960752e-06, + "loss": 1.4892, + "step": 249 + }, + { + "epoch": 2.0161290322580645, + "grad_norm": 119.0855941772461, + "learning_rate": 1.377942626813542e-06, + "loss": 1.4447, + "step": 250 + }, + { + "epoch": 2.024193548387097, + "grad_norm": 174.97743225097656, + "learning_rate": 1.3742184575518837e-06, + "loss": 13.8914, + "step": 251 + }, + { + "epoch": 2.032258064516129, + "grad_norm": 231.81077575683594, + "learning_rate": 1.3704942882902253e-06, + "loss": 4.5327, + "step": 252 + }, + { + "epoch": 2.0403225806451615, + "grad_norm": 81.22002410888672, + "learning_rate": 1.3667701190285671e-06, + "loss": 0.3658, + "step": 253 + }, + { + "epoch": 2.0483870967741935, + "grad_norm": 226.5207061767578, + "learning_rate": 1.363045949766909e-06, + "loss": 21.8189, + "step": 254 + }, + { + "epoch": 2.056451612903226, + "grad_norm": 20.884220123291016, + "learning_rate": 1.3593217805052508e-06, + "loss": 0.1301, + "step": 255 + }, + { + "epoch": 2.064516129032258, + "grad_norm": 371.02484130859375, + "learning_rate": 1.3555976112435924e-06, + "loss": 21.3565, + "step": 256 + }, + { + "epoch": 2.0725806451612905, + "grad_norm": 92.34310150146484, + "learning_rate": 1.3518734419819342e-06, + "loss": 0.707, + "step": 257 + }, + { + "epoch": 2.0806451612903225, + "grad_norm": 173.9854736328125, + "learning_rate": 1.348149272720276e-06, + "loss": 1.7342, + "step": 258 + }, + { + "epoch": 2.088709677419355, + "grad_norm": 202.5366668701172, + "learning_rate": 1.3444251034586178e-06, + "loss": 25.2022, + "step": 259 + }, + { + "epoch": 2.096774193548387, + "grad_norm": 33.74102783203125, + "learning_rate": 1.3407009341969597e-06, + "loss": 0.4951, + "step": 260 + }, + { + "epoch": 2.1048387096774195, + "grad_norm": 548.375244140625, + "learning_rate": 1.3369767649353013e-06, + "loss": 23.6786, + "step": 261 + }, + { + "epoch": 2.1129032258064515, + "grad_norm": 203.58383178710938, + "learning_rate": 1.333252595673643e-06, + "loss": 2.1325, + "step": 262 + }, + { + "epoch": 2.120967741935484, + "grad_norm": 64.47084045410156, + "learning_rate": 1.3295284264119849e-06, + "loss": 0.5467, + "step": 263 + }, + { + "epoch": 2.129032258064516, + "grad_norm": 30.41888427734375, + "learning_rate": 1.3258042571503267e-06, + "loss": 0.6327, + "step": 264 + }, + { + "epoch": 2.1370967741935485, + "grad_norm": 217.5379180908203, + "learning_rate": 1.3220800878886685e-06, + "loss": 1.7505, + "step": 265 + }, + { + "epoch": 2.1451612903225805, + "grad_norm": 101.2957534790039, + "learning_rate": 1.3183559186270101e-06, + "loss": 2.8499, + "step": 266 + }, + { + "epoch": 2.153225806451613, + "grad_norm": 91.45832061767578, + "learning_rate": 1.314631749365352e-06, + "loss": 0.3068, + "step": 267 + }, + { + "epoch": 2.161290322580645, + "grad_norm": 99.60401153564453, + "learning_rate": 1.310907580103694e-06, + "loss": 1.2989, + "step": 268 + }, + { + "epoch": 2.1693548387096775, + "grad_norm": 112.3869857788086, + "learning_rate": 1.3071834108420358e-06, + "loss": 2.3015, + "step": 269 + }, + { + "epoch": 2.1774193548387095, + "grad_norm": 89.12266540527344, + "learning_rate": 1.3034592415803776e-06, + "loss": 1.0322, + "step": 270 + }, + { + "epoch": 2.185483870967742, + "grad_norm": 161.4855194091797, + "learning_rate": 1.2997350723187192e-06, + "loss": 11.7275, + "step": 271 + }, + { + "epoch": 2.193548387096774, + "grad_norm": 313.5152893066406, + "learning_rate": 1.296010903057061e-06, + "loss": 16.6577, + "step": 272 + }, + { + "epoch": 2.2016129032258065, + "grad_norm": 139.5433807373047, + "learning_rate": 1.2922867337954028e-06, + "loss": 14.2802, + "step": 273 + }, + { + "epoch": 2.2096774193548385, + "grad_norm": 81.8010482788086, + "learning_rate": 1.2885625645337447e-06, + "loss": 1.0866, + "step": 274 + }, + { + "epoch": 2.217741935483871, + "grad_norm": 36.10691833496094, + "learning_rate": 1.2848383952720863e-06, + "loss": 0.2724, + "step": 275 + }, + { + "epoch": 2.225806451612903, + "grad_norm": 44.03886032104492, + "learning_rate": 1.281114226010428e-06, + "loss": 0.1664, + "step": 276 + }, + { + "epoch": 2.2338709677419355, + "grad_norm": 62.077606201171875, + "learning_rate": 1.27739005674877e-06, + "loss": 0.3426, + "step": 277 + }, + { + "epoch": 2.241935483870968, + "grad_norm": 70.45355987548828, + "learning_rate": 1.2736658874871117e-06, + "loss": 0.9209, + "step": 278 + }, + { + "epoch": 2.25, + "grad_norm": 61.781490325927734, + "learning_rate": 1.2699417182254535e-06, + "loss": 0.7348, + "step": 279 + }, + { + "epoch": 2.258064516129032, + "grad_norm": 209.58901977539062, + "learning_rate": 1.2662175489637951e-06, + "loss": 12.0011, + "step": 280 + }, + { + "epoch": 2.2661290322580645, + "grad_norm": 87.33316802978516, + "learning_rate": 1.262493379702137e-06, + "loss": 1.418, + "step": 281 + }, + { + "epoch": 2.274193548387097, + "grad_norm": 493.7389221191406, + "learning_rate": 1.2587692104404788e-06, + "loss": 14.2359, + "step": 282 + }, + { + "epoch": 2.282258064516129, + "grad_norm": 861.2841186523438, + "learning_rate": 1.2550450411788206e-06, + "loss": 3.9053, + "step": 283 + }, + { + "epoch": 2.2903225806451615, + "grad_norm": 558.491943359375, + "learning_rate": 1.2513208719171624e-06, + "loss": 6.4746, + "step": 284 + }, + { + "epoch": 2.2983870967741935, + "grad_norm": 89.52096557617188, + "learning_rate": 1.247596702655504e-06, + "loss": 0.7692, + "step": 285 + }, + { + "epoch": 2.306451612903226, + "grad_norm": 158.73118591308594, + "learning_rate": 1.2438725333938458e-06, + "loss": 1.7061, + "step": 286 + }, + { + "epoch": 2.314516129032258, + "grad_norm": 93.90080261230469, + "learning_rate": 1.2401483641321876e-06, + "loss": 0.8225, + "step": 287 + }, + { + "epoch": 2.3225806451612905, + "grad_norm": 27.100187301635742, + "learning_rate": 1.2364241948705295e-06, + "loss": 0.425, + "step": 288 + }, + { + "epoch": 2.3306451612903225, + "grad_norm": 123.69001770019531, + "learning_rate": 1.232700025608871e-06, + "loss": 1.7102, + "step": 289 + }, + { + "epoch": 2.338709677419355, + "grad_norm": 128.8670196533203, + "learning_rate": 1.2289758563472129e-06, + "loss": 12.3249, + "step": 290 + }, + { + "epoch": 2.346774193548387, + "grad_norm": 194.0598602294922, + "learning_rate": 1.2252516870855547e-06, + "loss": 0.415, + "step": 291 + }, + { + "epoch": 2.3548387096774195, + "grad_norm": 960.8411254882812, + "learning_rate": 1.2215275178238965e-06, + "loss": 5.3627, + "step": 292 + }, + { + "epoch": 2.3629032258064515, + "grad_norm": 50.68828201293945, + "learning_rate": 1.2178033485622383e-06, + "loss": 0.2609, + "step": 293 + }, + { + "epoch": 2.370967741935484, + "grad_norm": 1389.3763427734375, + "learning_rate": 1.21407917930058e-06, + "loss": 12.0135, + "step": 294 + }, + { + "epoch": 2.379032258064516, + "grad_norm": 157.8043212890625, + "learning_rate": 1.2103550100389217e-06, + "loss": 16.9635, + "step": 295 + }, + { + "epoch": 2.3870967741935485, + "grad_norm": 160.09109497070312, + "learning_rate": 1.2066308407772638e-06, + "loss": 1.2352, + "step": 296 + }, + { + "epoch": 2.3951612903225805, + "grad_norm": 27.7835750579834, + "learning_rate": 1.2029066715156056e-06, + "loss": 0.2556, + "step": 297 + }, + { + "epoch": 2.403225806451613, + "grad_norm": 177.45115661621094, + "learning_rate": 1.1991825022539474e-06, + "loss": 2.1015, + "step": 298 + }, + { + "epoch": 2.411290322580645, + "grad_norm": 44.05683517456055, + "learning_rate": 1.195458332992289e-06, + "loss": 0.976, + "step": 299 + }, + { + "epoch": 2.4193548387096775, + "grad_norm": 273.5810852050781, + "learning_rate": 1.1917341637306308e-06, + "loss": 2.6291, + "step": 300 + }, + { + "epoch": 2.4274193548387095, + "grad_norm": 151.48826599121094, + "learning_rate": 1.1880099944689726e-06, + "loss": 1.2252, + "step": 301 + }, + { + "epoch": 2.435483870967742, + "grad_norm": 101.94556427001953, + "learning_rate": 1.1842858252073145e-06, + "loss": 0.1799, + "step": 302 + }, + { + "epoch": 2.443548387096774, + "grad_norm": 254.22804260253906, + "learning_rate": 1.180561655945656e-06, + "loss": 6.3368, + "step": 303 + }, + { + "epoch": 2.4516129032258065, + "grad_norm": 137.3714141845703, + "learning_rate": 1.1768374866839979e-06, + "loss": 0.7803, + "step": 304 + }, + { + "epoch": 2.4596774193548385, + "grad_norm": 140.2755584716797, + "learning_rate": 1.1731133174223397e-06, + "loss": 14.2433, + "step": 305 + }, + { + "epoch": 2.467741935483871, + "grad_norm": 43.264320373535156, + "learning_rate": 1.1693891481606815e-06, + "loss": 0.3709, + "step": 306 + }, + { + "epoch": 2.475806451612903, + "grad_norm": 185.11683654785156, + "learning_rate": 1.1656649788990233e-06, + "loss": 1.7764, + "step": 307 + }, + { + "epoch": 2.4838709677419355, + "grad_norm": 303.35791015625, + "learning_rate": 1.161940809637365e-06, + "loss": 11.4762, + "step": 308 + }, + { + "epoch": 2.491935483870968, + "grad_norm": 91.71321868896484, + "learning_rate": 1.1582166403757067e-06, + "loss": 0.6848, + "step": 309 + }, + { + "epoch": 2.5, + "grad_norm": 12.553592681884766, + "learning_rate": 1.1544924711140486e-06, + "loss": 0.3027, + "step": 310 + }, + { + "epoch": 2.508064516129032, + "grad_norm": 66.78060150146484, + "learning_rate": 1.1507683018523904e-06, + "loss": 0.7009, + "step": 311 + }, + { + "epoch": 2.5161290322580645, + "grad_norm": 31.330020904541016, + "learning_rate": 1.1470441325907322e-06, + "loss": 0.2209, + "step": 312 + }, + { + "epoch": 2.524193548387097, + "grad_norm": 42.87683868408203, + "learning_rate": 1.143319963329074e-06, + "loss": 0.313, + "step": 313 + }, + { + "epoch": 2.532258064516129, + "grad_norm": 19.70355224609375, + "learning_rate": 1.1395957940674156e-06, + "loss": 0.5462, + "step": 314 + }, + { + "epoch": 2.540322580645161, + "grad_norm": 488.4886474609375, + "learning_rate": 1.1358716248057574e-06, + "loss": 8.1183, + "step": 315 + }, + { + "epoch": 2.5483870967741935, + "grad_norm": 14.261226654052734, + "learning_rate": 1.1321474555440993e-06, + "loss": 0.2522, + "step": 316 + }, + { + "epoch": 2.556451612903226, + "grad_norm": 31.870094299316406, + "learning_rate": 1.128423286282441e-06, + "loss": 1.1169, + "step": 317 + }, + { + "epoch": 2.564516129032258, + "grad_norm": 31.234668731689453, + "learning_rate": 1.1246991170207829e-06, + "loss": 0.3918, + "step": 318 + }, + { + "epoch": 2.5725806451612905, + "grad_norm": 17.941022872924805, + "learning_rate": 1.1209749477591245e-06, + "loss": 0.8012, + "step": 319 + }, + { + "epoch": 2.5806451612903225, + "grad_norm": 54.857208251953125, + "learning_rate": 1.1172507784974663e-06, + "loss": 1.1496, + "step": 320 + }, + { + "epoch": 2.588709677419355, + "grad_norm": 332.4779052734375, + "learning_rate": 1.1135266092358081e-06, + "loss": 8.3327, + "step": 321 + }, + { + "epoch": 2.596774193548387, + "grad_norm": 35.55912399291992, + "learning_rate": 1.10980243997415e-06, + "loss": 0.4927, + "step": 322 + }, + { + "epoch": 2.6048387096774195, + "grad_norm": 373.67242431640625, + "learning_rate": 1.1060782707124915e-06, + "loss": 18.8225, + "step": 323 + }, + { + "epoch": 2.6129032258064515, + "grad_norm": 25.760787963867188, + "learning_rate": 1.1023541014508334e-06, + "loss": 0.3753, + "step": 324 + }, + { + "epoch": 2.620967741935484, + "grad_norm": 58.30063247680664, + "learning_rate": 1.0986299321891754e-06, + "loss": 0.7956, + "step": 325 + }, + { + "epoch": 2.629032258064516, + "grad_norm": 31.588315963745117, + "learning_rate": 1.094905762927517e-06, + "loss": 0.6544, + "step": 326 + }, + { + "epoch": 2.6370967741935485, + "grad_norm": 79.70700073242188, + "learning_rate": 1.0911815936658588e-06, + "loss": 1.4362, + "step": 327 + }, + { + "epoch": 2.6451612903225805, + "grad_norm": 57.3786506652832, + "learning_rate": 1.0874574244042006e-06, + "loss": 0.2611, + "step": 328 + }, + { + "epoch": 2.653225806451613, + "grad_norm": 661.8717041015625, + "learning_rate": 1.0837332551425424e-06, + "loss": 137.7964, + "step": 329 + }, + { + "epoch": 2.661290322580645, + "grad_norm": 236.0816192626953, + "learning_rate": 1.080009085880884e-06, + "loss": 6.9625, + "step": 330 + }, + { + "epoch": 2.6693548387096775, + "grad_norm": 72.58594512939453, + "learning_rate": 1.0762849166192259e-06, + "loss": 0.8561, + "step": 331 + }, + { + "epoch": 2.6774193548387095, + "grad_norm": 65.97917175292969, + "learning_rate": 1.0725607473575677e-06, + "loss": 1.7101, + "step": 332 + }, + { + "epoch": 2.685483870967742, + "grad_norm": 55.23917770385742, + "learning_rate": 1.0688365780959095e-06, + "loss": 0.5448, + "step": 333 + }, + { + "epoch": 2.693548387096774, + "grad_norm": 501.1816711425781, + "learning_rate": 1.0651124088342513e-06, + "loss": 13.16, + "step": 334 + }, + { + "epoch": 2.7016129032258065, + "grad_norm": 55.06072235107422, + "learning_rate": 1.061388239572593e-06, + "loss": 0.9885, + "step": 335 + }, + { + "epoch": 2.709677419354839, + "grad_norm": 98.7667236328125, + "learning_rate": 1.0576640703109347e-06, + "loss": 1.4739, + "step": 336 + }, + { + "epoch": 2.717741935483871, + "grad_norm": 77.85026550292969, + "learning_rate": 1.0539399010492768e-06, + "loss": 0.9057, + "step": 337 + }, + { + "epoch": 2.725806451612903, + "grad_norm": 95.394287109375, + "learning_rate": 1.0502157317876184e-06, + "loss": 0.7198, + "step": 338 + }, + { + "epoch": 2.7338709677419355, + "grad_norm": 237.02577209472656, + "learning_rate": 1.0464915625259602e-06, + "loss": 13.3706, + "step": 339 + }, + { + "epoch": 2.741935483870968, + "grad_norm": 127.62625122070312, + "learning_rate": 1.042767393264302e-06, + "loss": 0.8813, + "step": 340 + }, + { + "epoch": 2.75, + "grad_norm": 73.3454360961914, + "learning_rate": 1.0390432240026438e-06, + "loss": 0.5758, + "step": 341 + }, + { + "epoch": 2.758064516129032, + "grad_norm": 109.3171615600586, + "learning_rate": 1.0353190547409854e-06, + "loss": 9.7732, + "step": 342 + }, + { + "epoch": 2.7661290322580645, + "grad_norm": 163.61447143554688, + "learning_rate": 1.0315948854793272e-06, + "loss": 0.6441, + "step": 343 + }, + { + "epoch": 2.774193548387097, + "grad_norm": 36.76464080810547, + "learning_rate": 1.027870716217669e-06, + "loss": 0.5595, + "step": 344 + }, + { + "epoch": 2.782258064516129, + "grad_norm": 206.4422607421875, + "learning_rate": 1.0241465469560109e-06, + "loss": 3.4251, + "step": 345 + }, + { + "epoch": 2.790322580645161, + "grad_norm": 46.03797912597656, + "learning_rate": 1.0204223776943527e-06, + "loss": 0.1912, + "step": 346 + }, + { + "epoch": 2.7983870967741935, + "grad_norm": 196.0258026123047, + "learning_rate": 1.0166982084326943e-06, + "loss": 1.2507, + "step": 347 + }, + { + "epoch": 2.806451612903226, + "grad_norm": 171.45388793945312, + "learning_rate": 1.0129740391710361e-06, + "loss": 0.9022, + "step": 348 + }, + { + "epoch": 2.814516129032258, + "grad_norm": 128.89340209960938, + "learning_rate": 1.009249869909378e-06, + "loss": 5.8515, + "step": 349 + }, + { + "epoch": 2.8225806451612905, + "grad_norm": 44.62214660644531, + "learning_rate": 1.0055257006477197e-06, + "loss": 0.6465, + "step": 350 + }, + { + "epoch": 2.8306451612903225, + "grad_norm": 33.63302230834961, + "learning_rate": 1.0018015313860616e-06, + "loss": 0.1617, + "step": 351 + }, + { + "epoch": 2.838709677419355, + "grad_norm": 126.68191528320312, + "learning_rate": 9.980773621244032e-07, + "loss": 4.3174, + "step": 352 + }, + { + "epoch": 2.846774193548387, + "grad_norm": 18.371591567993164, + "learning_rate": 9.943531928627452e-07, + "loss": 0.4257, + "step": 353 + }, + { + "epoch": 2.8548387096774195, + "grad_norm": 324.5527648925781, + "learning_rate": 9.906290236010868e-07, + "loss": 7.0345, + "step": 354 + }, + { + "epoch": 2.8629032258064515, + "grad_norm": 295.57806396484375, + "learning_rate": 9.869048543394286e-07, + "loss": 4.2767, + "step": 355 + }, + { + "epoch": 2.870967741935484, + "grad_norm": 21.32406997680664, + "learning_rate": 9.831806850777704e-07, + "loss": 0.2288, + "step": 356 + }, + { + "epoch": 2.879032258064516, + "grad_norm": 163.4864959716797, + "learning_rate": 9.794565158161122e-07, + "loss": 19.382, + "step": 357 + }, + { + "epoch": 2.8870967741935485, + "grad_norm": 81.98368835449219, + "learning_rate": 9.75732346554454e-07, + "loss": 0.3589, + "step": 358 + }, + { + "epoch": 2.8951612903225805, + "grad_norm": 80.34703063964844, + "learning_rate": 9.720081772927957e-07, + "loss": 1.181, + "step": 359 + }, + { + "epoch": 2.903225806451613, + "grad_norm": 37.47127914428711, + "learning_rate": 9.682840080311375e-07, + "loss": 0.9547, + "step": 360 + }, + { + "epoch": 2.911290322580645, + "grad_norm": 212.42002868652344, + "learning_rate": 9.645598387694793e-07, + "loss": 17.7606, + "step": 361 + }, + { + "epoch": 2.9193548387096775, + "grad_norm": 25.195838928222656, + "learning_rate": 9.608356695078211e-07, + "loss": 0.3995, + "step": 362 + }, + { + "epoch": 2.9274193548387095, + "grad_norm": 39.5676383972168, + "learning_rate": 9.571115002461627e-07, + "loss": 0.3185, + "step": 363 + }, + { + "epoch": 2.935483870967742, + "grad_norm": 96.67916870117188, + "learning_rate": 9.533873309845046e-07, + "loss": 0.4761, + "step": 364 + }, + { + "epoch": 2.943548387096774, + "grad_norm": 67.7193374633789, + "learning_rate": 9.496631617228465e-07, + "loss": 0.6273, + "step": 365 + }, + { + "epoch": 2.9516129032258065, + "grad_norm": 78.17704010009766, + "learning_rate": 9.459389924611882e-07, + "loss": 0.9327, + "step": 366 + }, + { + "epoch": 2.959677419354839, + "grad_norm": 94.27801513671875, + "learning_rate": 9.4221482319953e-07, + "loss": 0.8413, + "step": 367 + }, + { + "epoch": 2.967741935483871, + "grad_norm": 195.948486328125, + "learning_rate": 9.384906539378717e-07, + "loss": 1.9969, + "step": 368 + }, + { + "epoch": 2.975806451612903, + "grad_norm": 78.65729522705078, + "learning_rate": 9.347664846762135e-07, + "loss": 0.8227, + "step": 369 + }, + { + "epoch": 2.9838709677419355, + "grad_norm": 51.40840530395508, + "learning_rate": 9.310423154145552e-07, + "loss": 0.2983, + "step": 370 + }, + { + "epoch": 2.991935483870968, + "grad_norm": 107.39090728759766, + "learning_rate": 9.27318146152897e-07, + "loss": 2.3893, + "step": 371 + }, + { + "epoch": 3.0, + "grad_norm": 592.5924072265625, + "learning_rate": 9.235939768912389e-07, + "loss": 32.2766, + "step": 372 + }, + { + "epoch": 3.0, + "eval_loss": 3.6544721126556396, + "eval_mae": 1.135090947151184, + "eval_mse": 3.654472827911377, + "eval_r2": -0.0005310773849487305, + "eval_rmse": 1.9116675516185802, + "eval_runtime": 1.4093, + "eval_samples_per_second": 39.026, + "eval_smape": 48.73234033584595, + "eval_steps_per_second": 9.934, + "step": 372 + }, + { + "epoch": 3.0080645161290325, + "grad_norm": 167.24183654785156, + "learning_rate": 9.198698076295806e-07, + "loss": 2.4298, + "step": 373 + }, + { + "epoch": 3.0161290322580645, + "grad_norm": 346.9148254394531, + "learning_rate": 9.161456383679225e-07, + "loss": 6.5046, + "step": 374 + }, + { + "epoch": 3.024193548387097, + "grad_norm": 42.78517532348633, + "learning_rate": 9.124214691062642e-07, + "loss": 0.3438, + "step": 375 + }, + { + "epoch": 3.032258064516129, + "grad_norm": 33.160282135009766, + "learning_rate": 9.08697299844606e-07, + "loss": 1.2097, + "step": 376 + }, + { + "epoch": 3.0403225806451615, + "grad_norm": 38.76713562011719, + "learning_rate": 9.049731305829477e-07, + "loss": 0.3487, + "step": 377 + }, + { + "epoch": 3.0483870967741935, + "grad_norm": 167.7181396484375, + "learning_rate": 9.012489613212895e-07, + "loss": 2.2915, + "step": 378 + }, + { + "epoch": 3.056451612903226, + "grad_norm": 91.34284973144531, + "learning_rate": 8.975247920596314e-07, + "loss": 1.243, + "step": 379 + }, + { + "epoch": 3.064516129032258, + "grad_norm": 86.71770477294922, + "learning_rate": 8.938006227979731e-07, + "loss": 1.3177, + "step": 380 + }, + { + "epoch": 3.0725806451612905, + "grad_norm": 674.9639892578125, + "learning_rate": 8.900764535363149e-07, + "loss": 16.8542, + "step": 381 + }, + { + "epoch": 3.0806451612903225, + "grad_norm": 88.91838836669922, + "learning_rate": 8.863522842746566e-07, + "loss": 2.1356, + "step": 382 + }, + { + "epoch": 3.088709677419355, + "grad_norm": 63.240787506103516, + "learning_rate": 8.826281150129984e-07, + "loss": 0.7736, + "step": 383 + }, + { + "epoch": 3.096774193548387, + "grad_norm": 82.86074829101562, + "learning_rate": 8.789039457513401e-07, + "loss": 2.2132, + "step": 384 + }, + { + "epoch": 3.1048387096774195, + "grad_norm": 34.56795883178711, + "learning_rate": 8.751797764896819e-07, + "loss": 1.0128, + "step": 385 + }, + { + "epoch": 3.1129032258064515, + "grad_norm": 24.039697647094727, + "learning_rate": 8.714556072280238e-07, + "loss": 0.4341, + "step": 386 + }, + { + "epoch": 3.120967741935484, + "grad_norm": 96.96089935302734, + "learning_rate": 8.677314379663655e-07, + "loss": 1.5847, + "step": 387 + }, + { + "epoch": 3.129032258064516, + "grad_norm": 91.98773956298828, + "learning_rate": 8.640072687047074e-07, + "loss": 0.9096, + "step": 388 + }, + { + "epoch": 3.1370967741935485, + "grad_norm": 66.54564666748047, + "learning_rate": 8.602830994430491e-07, + "loss": 0.3645, + "step": 389 + }, + { + "epoch": 3.1451612903225805, + "grad_norm": 64.6353759765625, + "learning_rate": 8.565589301813909e-07, + "loss": 0.5853, + "step": 390 + }, + { + "epoch": 3.153225806451613, + "grad_norm": 445.8139343261719, + "learning_rate": 8.528347609197327e-07, + "loss": 14.5169, + "step": 391 + }, + { + "epoch": 3.161290322580645, + "grad_norm": 43.0020637512207, + "learning_rate": 8.491105916580744e-07, + "loss": 0.8973, + "step": 392 + }, + { + "epoch": 3.1693548387096775, + "grad_norm": 74.25178527832031, + "learning_rate": 8.453864223964163e-07, + "loss": 0.2461, + "step": 393 + }, + { + "epoch": 3.1774193548387095, + "grad_norm": 24.665449142456055, + "learning_rate": 8.41662253134758e-07, + "loss": 0.4055, + "step": 394 + }, + { + "epoch": 3.185483870967742, + "grad_norm": 66.4891357421875, + "learning_rate": 8.379380838730998e-07, + "loss": 0.612, + "step": 395 + }, + { + "epoch": 3.193548387096774, + "grad_norm": 55.56448745727539, + "learning_rate": 8.342139146114415e-07, + "loss": 0.2977, + "step": 396 + }, + { + "epoch": 3.2016129032258065, + "grad_norm": 40.89321517944336, + "learning_rate": 8.304897453497833e-07, + "loss": 0.7316, + "step": 397 + }, + { + "epoch": 3.2096774193548385, + "grad_norm": 72.87995910644531, + "learning_rate": 8.267655760881251e-07, + "loss": 1.1076, + "step": 398 + }, + { + "epoch": 3.217741935483871, + "grad_norm": 199.8931427001953, + "learning_rate": 8.230414068264668e-07, + "loss": 22.663, + "step": 399 + }, + { + "epoch": 3.225806451612903, + "grad_norm": 380.7558898925781, + "learning_rate": 8.193172375648087e-07, + "loss": 12.9819, + "step": 400 + }, + { + "epoch": 3.2338709677419355, + "grad_norm": 94.97148895263672, + "learning_rate": 8.155930683031504e-07, + "loss": 2.038, + "step": 401 + }, + { + "epoch": 3.241935483870968, + "grad_norm": 770.9739990234375, + "learning_rate": 8.118688990414923e-07, + "loss": 6.0835, + "step": 402 + }, + { + "epoch": 3.25, + "grad_norm": 276.0569152832031, + "learning_rate": 8.08144729779834e-07, + "loss": 5.7228, + "step": 403 + }, + { + "epoch": 3.258064516129032, + "grad_norm": 64.70281219482422, + "learning_rate": 8.044205605181758e-07, + "loss": 0.4039, + "step": 404 + }, + { + "epoch": 3.2661290322580645, + "grad_norm": 244.07774353027344, + "learning_rate": 8.006963912565176e-07, + "loss": 0.4348, + "step": 405 + }, + { + "epoch": 3.274193548387097, + "grad_norm": 516.8790283203125, + "learning_rate": 7.969722219948593e-07, + "loss": 20.4644, + "step": 406 + }, + { + "epoch": 3.282258064516129, + "grad_norm": 431.8797912597656, + "learning_rate": 7.932480527332012e-07, + "loss": 5.4049, + "step": 407 + }, + { + "epoch": 3.2903225806451615, + "grad_norm": 92.80670928955078, + "learning_rate": 7.895238834715429e-07, + "loss": 8.2338, + "step": 408 + }, + { + "epoch": 3.2983870967741935, + "grad_norm": 208.46914672851562, + "learning_rate": 7.857997142098847e-07, + "loss": 2.0624, + "step": 409 + }, + { + "epoch": 3.306451612903226, + "grad_norm": 66.8282241821289, + "learning_rate": 7.820755449482264e-07, + "loss": 1.6893, + "step": 410 + }, + { + "epoch": 3.314516129032258, + "grad_norm": 58.88878631591797, + "learning_rate": 7.783513756865682e-07, + "loss": 0.5589, + "step": 411 + }, + { + "epoch": 3.3225806451612905, + "grad_norm": 72.84752655029297, + "learning_rate": 7.7462720642491e-07, + "loss": 2.3565, + "step": 412 + }, + { + "epoch": 3.3306451612903225, + "grad_norm": 57.83047866821289, + "learning_rate": 7.709030371632517e-07, + "loss": 0.3691, + "step": 413 + }, + { + "epoch": 3.338709677419355, + "grad_norm": 38.37238311767578, + "learning_rate": 7.671788679015936e-07, + "loss": 0.2926, + "step": 414 + }, + { + "epoch": 3.346774193548387, + "grad_norm": 19.532394409179688, + "learning_rate": 7.634546986399353e-07, + "loss": 0.6592, + "step": 415 + }, + { + "epoch": 3.3548387096774195, + "grad_norm": 97.23928833007812, + "learning_rate": 7.597305293782772e-07, + "loss": 0.271, + "step": 416 + }, + { + "epoch": 3.3629032258064515, + "grad_norm": 107.05040740966797, + "learning_rate": 7.560063601166189e-07, + "loss": 0.3408, + "step": 417 + }, + { + "epoch": 3.370967741935484, + "grad_norm": 38.197364807128906, + "learning_rate": 7.522821908549607e-07, + "loss": 1.5913, + "step": 418 + }, + { + "epoch": 3.379032258064516, + "grad_norm": 74.39269256591797, + "learning_rate": 7.485580215933025e-07, + "loss": 0.534, + "step": 419 + }, + { + "epoch": 3.3870967741935485, + "grad_norm": 361.89697265625, + "learning_rate": 7.448338523316442e-07, + "loss": 11.104, + "step": 420 + }, + { + "epoch": 3.3951612903225805, + "grad_norm": 30.99224090576172, + "learning_rate": 7.411096830699861e-07, + "loss": 0.5005, + "step": 421 + }, + { + "epoch": 3.403225806451613, + "grad_norm": 187.10740661621094, + "learning_rate": 7.373855138083278e-07, + "loss": 1.9135, + "step": 422 + }, + { + "epoch": 3.411290322580645, + "grad_norm": 418.1996154785156, + "learning_rate": 7.336613445466696e-07, + "loss": 14.1697, + "step": 423 + }, + { + "epoch": 3.4193548387096775, + "grad_norm": 331.3507080078125, + "learning_rate": 7.299371752850113e-07, + "loss": 18.4025, + "step": 424 + }, + { + "epoch": 3.4274193548387095, + "grad_norm": 96.46886444091797, + "learning_rate": 7.262130060233531e-07, + "loss": 0.506, + "step": 425 + }, + { + "epoch": 3.435483870967742, + "grad_norm": 93.3166732788086, + "learning_rate": 7.224888367616949e-07, + "loss": 1.3464, + "step": 426 + }, + { + "epoch": 3.443548387096774, + "grad_norm": 60.4049072265625, + "learning_rate": 7.187646675000366e-07, + "loss": 0.2097, + "step": 427 + }, + { + "epoch": 3.4516129032258065, + "grad_norm": 43.35322570800781, + "learning_rate": 7.150404982383785e-07, + "loss": 0.4467, + "step": 428 + }, + { + "epoch": 3.4596774193548385, + "grad_norm": 45.652435302734375, + "learning_rate": 7.113163289767202e-07, + "loss": 0.4335, + "step": 429 + }, + { + "epoch": 3.467741935483871, + "grad_norm": 80.38420867919922, + "learning_rate": 7.075921597150621e-07, + "loss": 0.4553, + "step": 430 + }, + { + "epoch": 3.475806451612903, + "grad_norm": 332.3706970214844, + "learning_rate": 7.038679904534037e-07, + "loss": 22.9866, + "step": 431 + }, + { + "epoch": 3.4838709677419355, + "grad_norm": 263.8254699707031, + "learning_rate": 7.001438211917456e-07, + "loss": 5.5233, + "step": 432 + }, + { + "epoch": 3.491935483870968, + "grad_norm": 73.21070098876953, + "learning_rate": 6.964196519300874e-07, + "loss": 0.7063, + "step": 433 + }, + { + "epoch": 3.5, + "grad_norm": 169.56521606445312, + "learning_rate": 6.926954826684291e-07, + "loss": 1.6993, + "step": 434 + }, + { + "epoch": 3.508064516129032, + "grad_norm": 29.43960189819336, + "learning_rate": 6.88971313406771e-07, + "loss": 0.5154, + "step": 435 + }, + { + "epoch": 3.5161290322580645, + "grad_norm": 29.282577514648438, + "learning_rate": 6.852471441451127e-07, + "loss": 0.3941, + "step": 436 + }, + { + "epoch": 3.524193548387097, + "grad_norm": 24.749528884887695, + "learning_rate": 6.815229748834545e-07, + "loss": 0.0609, + "step": 437 + }, + { + "epoch": 3.532258064516129, + "grad_norm": 268.2690734863281, + "learning_rate": 6.777988056217962e-07, + "loss": 2.3492, + "step": 438 + }, + { + "epoch": 3.540322580645161, + "grad_norm": 61.73430252075195, + "learning_rate": 6.74074636360138e-07, + "loss": 0.468, + "step": 439 + }, + { + "epoch": 3.5483870967741935, + "grad_norm": 148.60581970214844, + "learning_rate": 6.703504670984798e-07, + "loss": 4.6193, + "step": 440 + }, + { + "epoch": 3.556451612903226, + "grad_norm": 135.78884887695312, + "learning_rate": 6.666262978368215e-07, + "loss": 1.1676, + "step": 441 + }, + { + "epoch": 3.564516129032258, + "grad_norm": 49.250648498535156, + "learning_rate": 6.629021285751634e-07, + "loss": 0.2826, + "step": 442 + }, + { + "epoch": 3.5725806451612905, + "grad_norm": 304.4104919433594, + "learning_rate": 6.591779593135051e-07, + "loss": 5.2831, + "step": 443 + }, + { + "epoch": 3.5806451612903225, + "grad_norm": 46.258243560791016, + "learning_rate": 6.55453790051847e-07, + "loss": 0.741, + "step": 444 + }, + { + "epoch": 3.588709677419355, + "grad_norm": 24.82040786743164, + "learning_rate": 6.517296207901888e-07, + "loss": 0.9014, + "step": 445 + }, + { + "epoch": 3.596774193548387, + "grad_norm": 308.60443115234375, + "learning_rate": 6.480054515285305e-07, + "loss": 19.5394, + "step": 446 + }, + { + "epoch": 3.6048387096774195, + "grad_norm": 117.33186340332031, + "learning_rate": 6.442812822668723e-07, + "loss": 10.2945, + "step": 447 + }, + { + "epoch": 3.6129032258064515, + "grad_norm": 21.672733306884766, + "learning_rate": 6.40557113005214e-07, + "loss": 0.0768, + "step": 448 + }, + { + "epoch": 3.620967741935484, + "grad_norm": 68.24752807617188, + "learning_rate": 6.368329437435559e-07, + "loss": 0.3628, + "step": 449 + }, + { + "epoch": 3.629032258064516, + "grad_norm": 49.00190734863281, + "learning_rate": 6.331087744818976e-07, + "loss": 1.1004, + "step": 450 + }, + { + "epoch": 3.6370967741935485, + "grad_norm": 5.552606582641602, + "learning_rate": 6.293846052202394e-07, + "loss": 0.0698, + "step": 451 + }, + { + "epoch": 3.6451612903225805, + "grad_norm": 28.109525680541992, + "learning_rate": 6.256604359585812e-07, + "loss": 0.0813, + "step": 452 + }, + { + "epoch": 3.653225806451613, + "grad_norm": 12.806706428527832, + "learning_rate": 6.219362666969229e-07, + "loss": 0.2015, + "step": 453 + }, + { + "epoch": 3.661290322580645, + "grad_norm": 320.7244873046875, + "learning_rate": 6.182120974352647e-07, + "loss": 13.654, + "step": 454 + }, + { + "epoch": 3.6693548387096775, + "grad_norm": 267.62042236328125, + "learning_rate": 6.144879281736064e-07, + "loss": 21.5447, + "step": 455 + }, + { + "epoch": 3.6774193548387095, + "grad_norm": 177.5928192138672, + "learning_rate": 6.107637589119483e-07, + "loss": 1.9395, + "step": 456 + }, + { + "epoch": 3.685483870967742, + "grad_norm": 64.51294708251953, + "learning_rate": 6.0703958965029e-07, + "loss": 0.2744, + "step": 457 + }, + { + "epoch": 3.693548387096774, + "grad_norm": 225.3753662109375, + "learning_rate": 6.033154203886319e-07, + "loss": 11.8973, + "step": 458 + }, + { + "epoch": 3.7016129032258065, + "grad_norm": 138.8841094970703, + "learning_rate": 5.995912511269737e-07, + "loss": 0.8054, + "step": 459 + }, + { + "epoch": 3.709677419354839, + "grad_norm": 194.67889404296875, + "learning_rate": 5.958670818653154e-07, + "loss": 1.5434, + "step": 460 + }, + { + "epoch": 3.717741935483871, + "grad_norm": 84.84955596923828, + "learning_rate": 5.921429126036572e-07, + "loss": 0.7915, + "step": 461 + }, + { + "epoch": 3.725806451612903, + "grad_norm": 130.38189697265625, + "learning_rate": 5.884187433419989e-07, + "loss": 2.1479, + "step": 462 + }, + { + "epoch": 3.7338709677419355, + "grad_norm": 213.24395751953125, + "learning_rate": 5.846945740803408e-07, + "loss": 3.5013, + "step": 463 + }, + { + "epoch": 3.741935483870968, + "grad_norm": 207.3053436279297, + "learning_rate": 5.809704048186825e-07, + "loss": 17.0066, + "step": 464 + }, + { + "epoch": 3.75, + "grad_norm": 41.873130798339844, + "learning_rate": 5.772462355570243e-07, + "loss": 0.6512, + "step": 465 + }, + { + "epoch": 3.758064516129032, + "grad_norm": 1999.728515625, + "learning_rate": 5.735220662953661e-07, + "loss": 134.0836, + "step": 466 + }, + { + "epoch": 3.7661290322580645, + "grad_norm": 66.1317367553711, + "learning_rate": 5.697978970337078e-07, + "loss": 0.7145, + "step": 467 + }, + { + "epoch": 3.774193548387097, + "grad_norm": 211.88003540039062, + "learning_rate": 5.660737277720496e-07, + "loss": 14.5329, + "step": 468 + }, + { + "epoch": 3.782258064516129, + "grad_norm": 81.95831298828125, + "learning_rate": 5.623495585103914e-07, + "loss": 0.6901, + "step": 469 + }, + { + "epoch": 3.790322580645161, + "grad_norm": 54.79182815551758, + "learning_rate": 5.586253892487332e-07, + "loss": 0.6836, + "step": 470 + }, + { + "epoch": 3.7983870967741935, + "grad_norm": 65.97078704833984, + "learning_rate": 5.54901219987075e-07, + "loss": 5.0991, + "step": 471 + }, + { + "epoch": 3.806451612903226, + "grad_norm": 85.9794921875, + "learning_rate": 5.511770507254167e-07, + "loss": 0.2896, + "step": 472 + }, + { + "epoch": 3.814516129032258, + "grad_norm": 71.18566131591797, + "learning_rate": 5.474528814637585e-07, + "loss": 0.4102, + "step": 473 + }, + { + "epoch": 3.8225806451612905, + "grad_norm": 62.09203338623047, + "learning_rate": 5.437287122021003e-07, + "loss": 0.3776, + "step": 474 + }, + { + "epoch": 3.8306451612903225, + "grad_norm": 128.81930541992188, + "learning_rate": 5.40004542940442e-07, + "loss": 1.9145, + "step": 475 + }, + { + "epoch": 3.838709677419355, + "grad_norm": 312.0795593261719, + "learning_rate": 5.362803736787838e-07, + "loss": 2.3146, + "step": 476 + }, + { + "epoch": 3.846774193548387, + "grad_norm": 265.22119140625, + "learning_rate": 5.325562044171257e-07, + "loss": 4.3763, + "step": 477 + }, + { + "epoch": 3.8548387096774195, + "grad_norm": 207.3704376220703, + "learning_rate": 5.288320351554674e-07, + "loss": 20.7729, + "step": 478 + }, + { + "epoch": 3.8629032258064515, + "grad_norm": 418.6147766113281, + "learning_rate": 5.251078658938092e-07, + "loss": 0.3353, + "step": 479 + }, + { + "epoch": 3.870967741935484, + "grad_norm": 69.67323303222656, + "learning_rate": 5.21383696632151e-07, + "loss": 1.5328, + "step": 480 + }, + { + "epoch": 3.879032258064516, + "grad_norm": 18.951583862304688, + "learning_rate": 5.176595273704927e-07, + "loss": 0.1113, + "step": 481 + }, + { + "epoch": 3.8870967741935485, + "grad_norm": 271.2349853515625, + "learning_rate": 5.139353581088345e-07, + "loss": 2.2297, + "step": 482 + }, + { + "epoch": 3.8951612903225805, + "grad_norm": 78.0976791381836, + "learning_rate": 5.102111888471763e-07, + "loss": 0.9436, + "step": 483 + }, + { + "epoch": 3.903225806451613, + "grad_norm": 124.9957046508789, + "learning_rate": 5.064870195855181e-07, + "loss": 11.5244, + "step": 484 + }, + { + "epoch": 3.911290322580645, + "grad_norm": 119.20372009277344, + "learning_rate": 5.027628503238599e-07, + "loss": 1.2656, + "step": 485 + }, + { + "epoch": 3.9193548387096775, + "grad_norm": 111.48886108398438, + "learning_rate": 4.990386810622016e-07, + "loss": 0.9424, + "step": 486 + }, + { + "epoch": 3.9274193548387095, + "grad_norm": 44.763328552246094, + "learning_rate": 4.953145118005434e-07, + "loss": 0.406, + "step": 487 + }, + { + "epoch": 3.935483870967742, + "grad_norm": 33.71810531616211, + "learning_rate": 4.915903425388852e-07, + "loss": 0.7287, + "step": 488 + }, + { + "epoch": 3.943548387096774, + "grad_norm": 314.32232666015625, + "learning_rate": 4.87866173277227e-07, + "loss": 14.9761, + "step": 489 + }, + { + "epoch": 3.9516129032258065, + "grad_norm": 161.79278564453125, + "learning_rate": 4.841420040155687e-07, + "loss": 1.6247, + "step": 490 + }, + { + "epoch": 3.959677419354839, + "grad_norm": 19.090133666992188, + "learning_rate": 4.804178347539106e-07, + "loss": 0.5247, + "step": 491 + }, + { + "epoch": 3.967741935483871, + "grad_norm": 227.18374633789062, + "learning_rate": 4.766936654922523e-07, + "loss": 1.9452, + "step": 492 + }, + { + "epoch": 3.975806451612903, + "grad_norm": 42.912105560302734, + "learning_rate": 4.729694962305941e-07, + "loss": 0.6671, + "step": 493 + }, + { + "epoch": 3.9838709677419355, + "grad_norm": 229.20773315429688, + "learning_rate": 4.6924532696893585e-07, + "loss": 23.1584, + "step": 494 + }, + { + "epoch": 3.991935483870968, + "grad_norm": 16.33729362487793, + "learning_rate": 4.655211577072776e-07, + "loss": 0.6326, + "step": 495 + }, + { + "epoch": 4.0, + "grad_norm": 31.148290634155273, + "learning_rate": 4.6179698844561943e-07, + "loss": 0.083, + "step": 496 + }, + { + "epoch": 4.0, + "eval_loss": 3.6658711433410645, + "eval_mae": 1.1627670526504517, + "eval_mse": 3.6658709049224854, + "eval_r2": -0.0036516189575195312, + "eval_rmse": 1.9146464177289981, + "eval_runtime": 1.3825, + "eval_samples_per_second": 39.783, + "eval_smape": 49.43971037864685, + "eval_steps_per_second": 10.127, + "step": 496 + }, + { + "epoch": 4.008064516129032, + "grad_norm": 40.192630767822266, + "learning_rate": 4.5807281918396124e-07, + "loss": 0.4606, + "step": 497 + }, + { + "epoch": 4.016129032258065, + "grad_norm": 34.81431579589844, + "learning_rate": 4.54348649922303e-07, + "loss": 0.6634, + "step": 498 + }, + { + "epoch": 4.024193548387097, + "grad_norm": 44.28828048706055, + "learning_rate": 4.5062448066064477e-07, + "loss": 0.4906, + "step": 499 + }, + { + "epoch": 4.032258064516129, + "grad_norm": 144.46249389648438, + "learning_rate": 4.4690031139898653e-07, + "loss": 1.7835, + "step": 500 + }, + { + "epoch": 4.040322580645161, + "grad_norm": 46.331146240234375, + "learning_rate": 4.431761421373283e-07, + "loss": 1.3651, + "step": 501 + }, + { + "epoch": 4.048387096774194, + "grad_norm": 217.1621856689453, + "learning_rate": 4.3945197287567006e-07, + "loss": 2.7421, + "step": 502 + }, + { + "epoch": 4.056451612903226, + "grad_norm": 90.83784484863281, + "learning_rate": 4.357278036140119e-07, + "loss": 0.5116, + "step": 503 + }, + { + "epoch": 4.064516129032258, + "grad_norm": 176.08767700195312, + "learning_rate": 4.320036343523537e-07, + "loss": 1.0464, + "step": 504 + }, + { + "epoch": 4.07258064516129, + "grad_norm": 27.863637924194336, + "learning_rate": 4.2827946509069546e-07, + "loss": 0.4235, + "step": 505 + }, + { + "epoch": 4.080645161290323, + "grad_norm": 33.19697189331055, + "learning_rate": 4.245552958290372e-07, + "loss": 0.261, + "step": 506 + }, + { + "epoch": 4.088709677419355, + "grad_norm": 124.42127227783203, + "learning_rate": 4.20831126567379e-07, + "loss": 9.2822, + "step": 507 + }, + { + "epoch": 4.096774193548387, + "grad_norm": 21.310752868652344, + "learning_rate": 4.1710695730572075e-07, + "loss": 0.0854, + "step": 508 + }, + { + "epoch": 4.104838709677419, + "grad_norm": 163.42176818847656, + "learning_rate": 4.1338278804406256e-07, + "loss": 0.4038, + "step": 509 + }, + { + "epoch": 4.112903225806452, + "grad_norm": 68.20549774169922, + "learning_rate": 4.0965861878240433e-07, + "loss": 0.4068, + "step": 510 + }, + { + "epoch": 4.120967741935484, + "grad_norm": 96.01313781738281, + "learning_rate": 4.0593444952074614e-07, + "loss": 1.2494, + "step": 511 + }, + { + "epoch": 4.129032258064516, + "grad_norm": 153.09539794921875, + "learning_rate": 4.022102802590879e-07, + "loss": 0.2314, + "step": 512 + }, + { + "epoch": 4.137096774193548, + "grad_norm": 231.76513671875, + "learning_rate": 3.9848611099742967e-07, + "loss": 12.2955, + "step": 513 + }, + { + "epoch": 4.145161290322581, + "grad_norm": 497.2821044921875, + "learning_rate": 3.9476194173577143e-07, + "loss": 14.7263, + "step": 514 + }, + { + "epoch": 4.153225806451613, + "grad_norm": 100.38178253173828, + "learning_rate": 3.910377724741132e-07, + "loss": 0.6703, + "step": 515 + }, + { + "epoch": 4.161290322580645, + "grad_norm": 20.106956481933594, + "learning_rate": 3.87313603212455e-07, + "loss": 0.2879, + "step": 516 + }, + { + "epoch": 4.169354838709677, + "grad_norm": 421.3139343261719, + "learning_rate": 3.835894339507968e-07, + "loss": 4.0681, + "step": 517 + }, + { + "epoch": 4.17741935483871, + "grad_norm": 224.01904296875, + "learning_rate": 3.798652646891386e-07, + "loss": 5.5452, + "step": 518 + }, + { + "epoch": 4.185483870967742, + "grad_norm": 75.63765716552734, + "learning_rate": 3.7614109542748036e-07, + "loss": 1.4427, + "step": 519 + }, + { + "epoch": 4.193548387096774, + "grad_norm": 164.29302978515625, + "learning_rate": 3.724169261658221e-07, + "loss": 18.1201, + "step": 520 + }, + { + "epoch": 4.201612903225806, + "grad_norm": 23.143348693847656, + "learning_rate": 3.686927569041639e-07, + "loss": 0.5357, + "step": 521 + }, + { + "epoch": 4.209677419354839, + "grad_norm": 26.469867706298828, + "learning_rate": 3.6496858764250565e-07, + "loss": 0.6538, + "step": 522 + }, + { + "epoch": 4.217741935483871, + "grad_norm": 184.11761474609375, + "learning_rate": 3.6124441838084746e-07, + "loss": 20.6509, + "step": 523 + }, + { + "epoch": 4.225806451612903, + "grad_norm": 124.52848815917969, + "learning_rate": 3.5752024911918923e-07, + "loss": 1.5531, + "step": 524 + }, + { + "epoch": 4.233870967741935, + "grad_norm": 147.81829833984375, + "learning_rate": 3.5379607985753104e-07, + "loss": 4.8944, + "step": 525 + }, + { + "epoch": 4.241935483870968, + "grad_norm": 54.121559143066406, + "learning_rate": 3.500719105958728e-07, + "loss": 0.5221, + "step": 526 + }, + { + "epoch": 4.25, + "grad_norm": 83.79217529296875, + "learning_rate": 3.4634774133421457e-07, + "loss": 1.6032, + "step": 527 + }, + { + "epoch": 4.258064516129032, + "grad_norm": 27.414520263671875, + "learning_rate": 3.4262357207255633e-07, + "loss": 0.4982, + "step": 528 + }, + { + "epoch": 4.266129032258064, + "grad_norm": 176.4253692626953, + "learning_rate": 3.388994028108981e-07, + "loss": 3.1434, + "step": 529 + }, + { + "epoch": 4.274193548387097, + "grad_norm": 63.877079010009766, + "learning_rate": 3.351752335492399e-07, + "loss": 0.5721, + "step": 530 + }, + { + "epoch": 4.282258064516129, + "grad_norm": 105.18173217773438, + "learning_rate": 3.314510642875817e-07, + "loss": 0.4412, + "step": 531 + }, + { + "epoch": 4.290322580645161, + "grad_norm": 18.525733947753906, + "learning_rate": 3.277268950259235e-07, + "loss": 0.1196, + "step": 532 + }, + { + "epoch": 4.298387096774194, + "grad_norm": 126.90892028808594, + "learning_rate": 3.2400272576426526e-07, + "loss": 1.7735, + "step": 533 + }, + { + "epoch": 4.306451612903226, + "grad_norm": 13.654480934143066, + "learning_rate": 3.20278556502607e-07, + "loss": 0.8871, + "step": 534 + }, + { + "epoch": 4.314516129032258, + "grad_norm": 183.49330139160156, + "learning_rate": 3.165543872409488e-07, + "loss": 1.7886, + "step": 535 + }, + { + "epoch": 4.32258064516129, + "grad_norm": 651.6168823242188, + "learning_rate": 3.128302179792906e-07, + "loss": 2.3902, + "step": 536 + }, + { + "epoch": 4.330645161290323, + "grad_norm": 76.452880859375, + "learning_rate": 3.0910604871763236e-07, + "loss": 0.6681, + "step": 537 + }, + { + "epoch": 4.338709677419355, + "grad_norm": 70.49311065673828, + "learning_rate": 3.0538187945597413e-07, + "loss": 0.3893, + "step": 538 + }, + { + "epoch": 4.346774193548387, + "grad_norm": 24.761642456054688, + "learning_rate": 3.0165771019431594e-07, + "loss": 0.0661, + "step": 539 + }, + { + "epoch": 4.354838709677419, + "grad_norm": 54.143436431884766, + "learning_rate": 2.979335409326577e-07, + "loss": 0.6459, + "step": 540 + }, + { + "epoch": 4.362903225806452, + "grad_norm": 70.6890640258789, + "learning_rate": 2.9420937167099947e-07, + "loss": 0.2079, + "step": 541 + }, + { + "epoch": 4.370967741935484, + "grad_norm": 44.36699295043945, + "learning_rate": 2.9048520240934123e-07, + "loss": 0.2182, + "step": 542 + }, + { + "epoch": 4.379032258064516, + "grad_norm": 24.9780330657959, + "learning_rate": 2.8676103314768305e-07, + "loss": 0.1024, + "step": 543 + }, + { + "epoch": 4.387096774193548, + "grad_norm": 600.659423828125, + "learning_rate": 2.830368638860248e-07, + "loss": 20.363, + "step": 544 + }, + { + "epoch": 4.395161290322581, + "grad_norm": 11.253837585449219, + "learning_rate": 2.793126946243666e-07, + "loss": 0.1866, + "step": 545 + }, + { + "epoch": 4.403225806451613, + "grad_norm": 28.648752212524414, + "learning_rate": 2.7558852536270834e-07, + "loss": 0.3628, + "step": 546 + }, + { + "epoch": 4.411290322580645, + "grad_norm": 63.38084030151367, + "learning_rate": 2.7186435610105016e-07, + "loss": 0.7742, + "step": 547 + }, + { + "epoch": 4.419354838709677, + "grad_norm": 81.5975112915039, + "learning_rate": 2.681401868393919e-07, + "loss": 2.4472, + "step": 548 + }, + { + "epoch": 4.42741935483871, + "grad_norm": 96.23023223876953, + "learning_rate": 2.644160175777337e-07, + "loss": 0.5348, + "step": 549 + }, + { + "epoch": 4.435483870967742, + "grad_norm": 698.8837280273438, + "learning_rate": 2.606918483160755e-07, + "loss": 6.1945, + "step": 550 + }, + { + "epoch": 4.443548387096774, + "grad_norm": 105.67257690429688, + "learning_rate": 2.5696767905441726e-07, + "loss": 1.0181, + "step": 551 + }, + { + "epoch": 4.451612903225806, + "grad_norm": 369.3524475097656, + "learning_rate": 2.5324350979275903e-07, + "loss": 10.3212, + "step": 552 + }, + { + "epoch": 4.459677419354839, + "grad_norm": 68.52635955810547, + "learning_rate": 2.495193405311008e-07, + "loss": 0.8354, + "step": 553 + }, + { + "epoch": 4.467741935483871, + "grad_norm": 34.14206314086914, + "learning_rate": 2.457951712694426e-07, + "loss": 0.5059, + "step": 554 + }, + { + "epoch": 4.475806451612903, + "grad_norm": 160.20071411132812, + "learning_rate": 2.4207100200778437e-07, + "loss": 0.5112, + "step": 555 + }, + { + "epoch": 4.483870967741936, + "grad_norm": 38.58085632324219, + "learning_rate": 2.3834683274612616e-07, + "loss": 0.3525, + "step": 556 + }, + { + "epoch": 4.491935483870968, + "grad_norm": 65.05552673339844, + "learning_rate": 2.3462266348446792e-07, + "loss": 0.3484, + "step": 557 + }, + { + "epoch": 4.5, + "grad_norm": 394.2702941894531, + "learning_rate": 2.3089849422280971e-07, + "loss": 20.5987, + "step": 558 + }, + { + "epoch": 4.508064516129032, + "grad_norm": 11.85052490234375, + "learning_rate": 2.271743249611515e-07, + "loss": 0.0769, + "step": 559 + }, + { + "epoch": 4.516129032258064, + "grad_norm": 186.5146484375, + "learning_rate": 2.2345015569949327e-07, + "loss": 11.3414, + "step": 560 + }, + { + "epoch": 4.524193548387097, + "grad_norm": 35.12519454956055, + "learning_rate": 2.1972598643783503e-07, + "loss": 0.5073, + "step": 561 + }, + { + "epoch": 4.532258064516129, + "grad_norm": 159.3316650390625, + "learning_rate": 2.1600181717617685e-07, + "loss": 1.5338, + "step": 562 + }, + { + "epoch": 4.540322580645161, + "grad_norm": 376.8158264160156, + "learning_rate": 2.122776479145186e-07, + "loss": 16.0564, + "step": 563 + }, + { + "epoch": 4.548387096774194, + "grad_norm": 38.60029220581055, + "learning_rate": 2.0855347865286037e-07, + "loss": 0.3901, + "step": 564 + }, + { + "epoch": 4.556451612903226, + "grad_norm": 14.886783599853516, + "learning_rate": 2.0482930939120216e-07, + "loss": 0.2495, + "step": 565 + }, + { + "epoch": 4.564516129032258, + "grad_norm": 274.8218688964844, + "learning_rate": 2.0110514012954395e-07, + "loss": 3.6786, + "step": 566 + }, + { + "epoch": 4.57258064516129, + "grad_norm": 865.7882080078125, + "learning_rate": 1.9738097086788572e-07, + "loss": 5.4778, + "step": 567 + }, + { + "epoch": 4.580645161290323, + "grad_norm": 167.94483947753906, + "learning_rate": 1.936568016062275e-07, + "loss": 13.7189, + "step": 568 + }, + { + "epoch": 4.588709677419355, + "grad_norm": 28.00385856628418, + "learning_rate": 1.899326323445693e-07, + "loss": 0.9164, + "step": 569 + }, + { + "epoch": 4.596774193548387, + "grad_norm": 492.3658752441406, + "learning_rate": 1.8620846308291106e-07, + "loss": 10.5385, + "step": 570 + }, + { + "epoch": 4.604838709677419, + "grad_norm": 91.81761169433594, + "learning_rate": 1.8248429382125282e-07, + "loss": 1.3811, + "step": 571 + }, + { + "epoch": 4.612903225806452, + "grad_norm": 53.9609489440918, + "learning_rate": 1.7876012455959461e-07, + "loss": 0.4604, + "step": 572 + }, + { + "epoch": 4.620967741935484, + "grad_norm": 91.66685485839844, + "learning_rate": 1.750359552979364e-07, + "loss": 3.1813, + "step": 573 + }, + { + "epoch": 4.629032258064516, + "grad_norm": 61.261680603027344, + "learning_rate": 1.7131178603627817e-07, + "loss": 0.2714, + "step": 574 + }, + { + "epoch": 4.637096774193548, + "grad_norm": 86.26275634765625, + "learning_rate": 1.6758761677461996e-07, + "loss": 1.7492, + "step": 575 + }, + { + "epoch": 4.645161290322581, + "grad_norm": 35.08537673950195, + "learning_rate": 1.6386344751296175e-07, + "loss": 1.0437, + "step": 576 + }, + { + "epoch": 4.653225806451613, + "grad_norm": 71.87916564941406, + "learning_rate": 1.601392782513035e-07, + "loss": 0.5856, + "step": 577 + }, + { + "epoch": 4.661290322580645, + "grad_norm": 362.8409118652344, + "learning_rate": 1.564151089896453e-07, + "loss": 23.2117, + "step": 578 + }, + { + "epoch": 4.669354838709677, + "grad_norm": 37.816200256347656, + "learning_rate": 1.5269093972798706e-07, + "loss": 0.5448, + "step": 579 + }, + { + "epoch": 4.67741935483871, + "grad_norm": 152.78346252441406, + "learning_rate": 1.4896677046632885e-07, + "loss": 7.6629, + "step": 580 + }, + { + "epoch": 4.685483870967742, + "grad_norm": 51.52225875854492, + "learning_rate": 1.4524260120467062e-07, + "loss": 0.3167, + "step": 581 + }, + { + "epoch": 4.693548387096774, + "grad_norm": 34.78017044067383, + "learning_rate": 1.415184319430124e-07, + "loss": 0.1883, + "step": 582 + }, + { + "epoch": 4.701612903225806, + "grad_norm": 33.89111328125, + "learning_rate": 1.3779426268135417e-07, + "loss": 0.5733, + "step": 583 + }, + { + "epoch": 4.709677419354839, + "grad_norm": 165.656494140625, + "learning_rate": 1.3407009341969596e-07, + "loss": 15.5164, + "step": 584 + }, + { + "epoch": 4.717741935483871, + "grad_norm": 187.8523712158203, + "learning_rate": 1.3034592415803775e-07, + "loss": 2.6475, + "step": 585 + }, + { + "epoch": 4.725806451612903, + "grad_norm": 226.56434631347656, + "learning_rate": 1.2662175489637951e-07, + "loss": 19.2786, + "step": 586 + }, + { + "epoch": 4.733870967741936, + "grad_norm": 93.69617462158203, + "learning_rate": 1.228975856347213e-07, + "loss": 0.3997, + "step": 587 + }, + { + "epoch": 4.741935483870968, + "grad_norm": 70.5687255859375, + "learning_rate": 1.1917341637306308e-07, + "loss": 0.8395, + "step": 588 + }, + { + "epoch": 4.75, + "grad_norm": 139.98455810546875, + "learning_rate": 1.1544924711140486e-07, + "loss": 1.5936, + "step": 589 + }, + { + "epoch": 4.758064516129032, + "grad_norm": 61.73574447631836, + "learning_rate": 1.1172507784974663e-07, + "loss": 1.6269, + "step": 590 + }, + { + "epoch": 4.766129032258064, + "grad_norm": 267.8656921386719, + "learning_rate": 1.0800090858808842e-07, + "loss": 10.6706, + "step": 591 + }, + { + "epoch": 4.774193548387097, + "grad_norm": 103.93309783935547, + "learning_rate": 1.0427673932643019e-07, + "loss": 0.7362, + "step": 592 + }, + { + "epoch": 4.782258064516129, + "grad_norm": 33.18098831176758, + "learning_rate": 1.0055257006477198e-07, + "loss": 0.5736, + "step": 593 + }, + { + "epoch": 4.790322580645161, + "grad_norm": 67.95414733886719, + "learning_rate": 9.682840080311375e-08, + "loss": 0.3606, + "step": 594 + }, + { + "epoch": 4.798387096774194, + "grad_norm": 374.37744140625, + "learning_rate": 9.310423154145553e-08, + "loss": 4.9431, + "step": 595 + }, + { + "epoch": 4.806451612903226, + "grad_norm": 27.907346725463867, + "learning_rate": 8.938006227979731e-08, + "loss": 0.1885, + "step": 596 + }, + { + "epoch": 4.814516129032258, + "grad_norm": 54.07706069946289, + "learning_rate": 8.565589301813908e-08, + "loss": 1.0378, + "step": 597 + }, + { + "epoch": 4.82258064516129, + "grad_norm": 192.1484832763672, + "learning_rate": 8.193172375648087e-08, + "loss": 10.6858, + "step": 598 + }, + { + "epoch": 4.830645161290323, + "grad_norm": 415.5929870605469, + "learning_rate": 7.820755449482265e-08, + "loss": 4.9497, + "step": 599 + }, + { + "epoch": 4.838709677419355, + "grad_norm": 79.47122192382812, + "learning_rate": 7.448338523316443e-08, + "loss": 4.9277, + "step": 600 + }, + { + "epoch": 4.846774193548387, + "grad_norm": 214.2676544189453, + "learning_rate": 7.07592159715062e-08, + "loss": 5.4366, + "step": 601 + }, + { + "epoch": 4.854838709677419, + "grad_norm": 85.91707611083984, + "learning_rate": 6.703504670984798e-08, + "loss": 0.9629, + "step": 602 + }, + { + "epoch": 4.862903225806452, + "grad_norm": 59.75841522216797, + "learning_rate": 6.331087744818976e-08, + "loss": 0.2416, + "step": 603 + }, + { + "epoch": 4.870967741935484, + "grad_norm": 9.71762752532959, + "learning_rate": 5.958670818653154e-08, + "loss": 0.1288, + "step": 604 + }, + { + "epoch": 4.879032258064516, + "grad_norm": 139.64285278320312, + "learning_rate": 5.5862538924873317e-08, + "loss": 1.0523, + "step": 605 + }, + { + "epoch": 4.887096774193548, + "grad_norm": 222.7238006591797, + "learning_rate": 5.2138369663215093e-08, + "loss": 17.453, + "step": 606 + }, + { + "epoch": 4.895161290322581, + "grad_norm": 79.89140319824219, + "learning_rate": 4.8414200401556877e-08, + "loss": 0.4757, + "step": 607 + }, + { + "epoch": 4.903225806451613, + "grad_norm": 65.18061065673828, + "learning_rate": 4.4690031139898653e-08, + "loss": 0.239, + "step": 608 + }, + { + "epoch": 4.911290322580645, + "grad_norm": 2055.86767578125, + "learning_rate": 4.0965861878240437e-08, + "loss": 133.0742, + "step": 609 + }, + { + "epoch": 4.919354838709677, + "grad_norm": 53.200016021728516, + "learning_rate": 3.724169261658221e-08, + "loss": 0.4046, + "step": 610 + }, + { + "epoch": 4.92741935483871, + "grad_norm": 46.2671012878418, + "learning_rate": 3.351752335492399e-08, + "loss": 1.0537, + "step": 611 + }, + { + "epoch": 4.935483870967742, + "grad_norm": 121.25851440429688, + "learning_rate": 2.979335409326577e-08, + "loss": 11.5313, + "step": 612 + }, + { + "epoch": 4.943548387096774, + "grad_norm": 313.4679870605469, + "learning_rate": 2.6069184831607547e-08, + "loss": 14.306, + "step": 613 + }, + { + "epoch": 4.951612903225806, + "grad_norm": 26.671438217163086, + "learning_rate": 2.2345015569949327e-08, + "loss": 0.8477, + "step": 614 + }, + { + "epoch": 4.959677419354839, + "grad_norm": 42.424842834472656, + "learning_rate": 1.8620846308291107e-08, + "loss": 0.3301, + "step": 615 + }, + { + "epoch": 4.967741935483871, + "grad_norm": 152.9750213623047, + "learning_rate": 1.4896677046632885e-08, + "loss": 1.3814, + "step": 616 + }, + { + "epoch": 4.975806451612903, + "grad_norm": 17.42218017578125, + "learning_rate": 1.1172507784974663e-08, + "loss": 0.0749, + "step": 617 + }, + { + "epoch": 4.983870967741936, + "grad_norm": 88.0694808959961, + "learning_rate": 7.4483385233164425e-09, + "loss": 0.6241, + "step": 618 + }, + { + "epoch": 4.991935483870968, + "grad_norm": 240.30140686035156, + "learning_rate": 3.7241692616582212e-09, + "loss": 22.4261, + "step": 619 + }, + { + "epoch": 5.0, + "grad_norm": 26.565181732177734, + "learning_rate": 0.0, + "loss": 0.0558, + "step": 620 + }, + { + "epoch": 5.0, + "eval_loss": 3.6100568771362305, + "eval_mae": 1.1832215785980225, + "eval_mse": 3.6100568771362305, + "eval_r2": 0.011629223823547363, + "eval_rmse": 1.9000149676084739, + "eval_runtime": 1.3508, + "eval_samples_per_second": 40.717, + "eval_smape": 49.474382400512695, + "eval_steps_per_second": 10.364, + "step": 620 + } + ], + "logging_steps": 1, + "max_steps": 620, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3359849068769280.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 2.308984942228097e-06 + } +}