{ "best_metric": 1.1512540578842163, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 2.989247311827957, "eval_steps": 50, "global_step": 139, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.021505376344086023, "grad_norm": 0.3019464313983917, "learning_rate": 1.16e-05, "loss": 1.4259, "step": 1 }, { "epoch": 0.021505376344086023, "eval_loss": 1.4605212211608887, "eval_runtime": 3.307, "eval_samples_per_second": 188.69, "eval_steps_per_second": 6.048, "step": 1 }, { "epoch": 0.043010752688172046, "grad_norm": 0.3766098916530609, "learning_rate": 2.32e-05, "loss": 1.3061, "step": 2 }, { "epoch": 0.06451612903225806, "grad_norm": 0.4208398163318634, "learning_rate": 3.48e-05, "loss": 1.2716, "step": 3 }, { "epoch": 0.08602150537634409, "grad_norm": 0.4907033145427704, "learning_rate": 4.64e-05, "loss": 1.3698, "step": 4 }, { "epoch": 0.10752688172043011, "grad_norm": 0.5819088816642761, "learning_rate": 5.8e-05, "loss": 1.5208, "step": 5 }, { "epoch": 0.12903225806451613, "grad_norm": 0.8909784555435181, "learning_rate": 6.96e-05, "loss": 1.6946, "step": 6 }, { "epoch": 0.15053763440860216, "grad_norm": 0.1904004067182541, "learning_rate": 8.12e-05, "loss": 1.3806, "step": 7 }, { "epoch": 0.17204301075268819, "grad_norm": 0.2766229510307312, "learning_rate": 9.28e-05, "loss": 1.2944, "step": 8 }, { "epoch": 0.1935483870967742, "grad_norm": 0.3707273602485657, "learning_rate": 0.0001044, "loss": 1.1893, "step": 9 }, { "epoch": 0.21505376344086022, "grad_norm": 0.4749780595302582, "learning_rate": 0.000116, "loss": 1.2153, "step": 10 }, { "epoch": 0.23655913978494625, "grad_norm": 0.5096077919006348, "learning_rate": 0.00011598280125101809, "loss": 1.3548, "step": 11 }, { "epoch": 0.25806451612903225, "grad_norm": 0.46667513251304626, "learning_rate": 0.00011593121520396772, "loss": 1.4838, "step": 12 }, { "epoch": 0.27956989247311825, "grad_norm": 0.2550894618034363, "learning_rate": 0.000115845272452486, "loss": 1.3795, "step": 13 }, { "epoch": 0.3010752688172043, "grad_norm": 0.2862931787967682, "learning_rate": 0.00011572502396580767, "loss": 1.2467, "step": 14 }, { "epoch": 0.3225806451612903, "grad_norm": 0.2682102620601654, "learning_rate": 0.00011557054105853753, "loss": 1.1907, "step": 15 }, { "epoch": 0.34408602150537637, "grad_norm": 0.27467212080955505, "learning_rate": 0.0001153819153483564, "loss": 1.1003, "step": 16 }, { "epoch": 0.3655913978494624, "grad_norm": 0.3005955219268799, "learning_rate": 0.00011515925870168636, "loss": 1.2234, "step": 17 }, { "epoch": 0.3870967741935484, "grad_norm": 0.3794748783111572, "learning_rate": 0.00011490270316734726, "loss": 1.4082, "step": 18 }, { "epoch": 0.40860215053763443, "grad_norm": 0.21310873329639435, "learning_rate": 0.00011461240089824378, "loss": 1.328, "step": 19 }, { "epoch": 0.43010752688172044, "grad_norm": 0.20409537851810455, "learning_rate": 0.0001142885240611295, "loss": 1.3031, "step": 20 }, { "epoch": 0.45161290322580644, "grad_norm": 0.22202569246292114, "learning_rate": 0.0001139312647345018, "loss": 1.1874, "step": 21 }, { "epoch": 0.4731182795698925, "grad_norm": 0.2386716902256012, "learning_rate": 0.00011354083479468755, "loss": 1.1251, "step": 22 }, { "epoch": 0.4946236559139785, "grad_norm": 0.26145026087760925, "learning_rate": 0.00011311746579018779, "loss": 1.1777, "step": 23 }, { "epoch": 0.5161290322580645, "grad_norm": 0.30247944593429565, "learning_rate": 0.00011266140880435544, "loss": 1.3137, "step": 24 }, { "epoch": 0.5376344086021505, "grad_norm": 0.587131917476654, "learning_rate": 0.00011217293430648779, "loss": 1.4008, "step": 25 }, { "epoch": 0.5591397849462365, "grad_norm": 0.14670686423778534, "learning_rate": 0.00011165233199142182, "loss": 1.2933, "step": 26 }, { "epoch": 0.5806451612903226, "grad_norm": 0.16923627257347107, "learning_rate": 0.00011109991060772776, "loss": 1.1914, "step": 27 }, { "epoch": 0.6021505376344086, "grad_norm": 0.19446203112602234, "learning_rate": 0.0001105159977746025, "loss": 1.1251, "step": 28 }, { "epoch": 0.6236559139784946, "grad_norm": 0.22473880648612976, "learning_rate": 0.00010990093978757173, "loss": 1.1065, "step": 29 }, { "epoch": 0.6451612903225806, "grad_norm": 0.2743987441062927, "learning_rate": 0.00010925510141311572, "loss": 1.2497, "step": 30 }, { "epoch": 0.6666666666666666, "grad_norm": 0.3620847761631012, "learning_rate": 0.00010857886567234085, "loss": 1.3353, "step": 31 }, { "epoch": 0.6881720430107527, "grad_norm": 0.14775493741035461, "learning_rate": 0.00010787263361382498, "loss": 1.2885, "step": 32 }, { "epoch": 0.7096774193548387, "grad_norm": 0.1633865386247635, "learning_rate": 0.00010713682407577149, "loss": 1.2385, "step": 33 }, { "epoch": 0.7311827956989247, "grad_norm": 0.1605488508939743, "learning_rate": 0.00010637187343761291, "loss": 1.0806, "step": 34 }, { "epoch": 0.7526881720430108, "grad_norm": 0.19970649480819702, "learning_rate": 0.00010557823536121162, "loss": 1.1132, "step": 35 }, { "epoch": 0.7741935483870968, "grad_norm": 0.24861781299114227, "learning_rate": 0.00010475638052181104, "loss": 1.1757, "step": 36 }, { "epoch": 0.7956989247311828, "grad_norm": 0.37152165174484253, "learning_rate": 0.00010390679632889674, "loss": 1.3386, "step": 37 }, { "epoch": 0.8172043010752689, "grad_norm": 0.16140355169773102, "learning_rate": 0.00010302998663713333, "loss": 1.3232, "step": 38 }, { "epoch": 0.8387096774193549, "grad_norm": 0.15119719505310059, "learning_rate": 0.00010212647144754812, "loss": 1.2435, "step": 39 }, { "epoch": 0.8602150537634409, "grad_norm": 0.15187421441078186, "learning_rate": 0.00010119678659913935, "loss": 1.0749, "step": 40 }, { "epoch": 0.8817204301075269, "grad_norm": 0.1807960420846939, "learning_rate": 0.00010024148345109112, "loss": 1.0696, "step": 41 }, { "epoch": 0.9032258064516129, "grad_norm": 0.2298922836780548, "learning_rate": 9.926112855578431e-05, "loss": 1.1653, "step": 42 }, { "epoch": 0.9247311827956989, "grad_norm": 0.29605555534362793, "learning_rate": 9.825630332279677e-05, "loss": 1.235, "step": 43 }, { "epoch": 0.946236559139785, "grad_norm": 0.2054792195558548, "learning_rate": 9.722760367409236e-05, "loss": 1.2058, "step": 44 }, { "epoch": 0.967741935483871, "grad_norm": 0.1517435610294342, "learning_rate": 9.617563969060338e-05, "loss": 1.1643, "step": 45 }, { "epoch": 0.989247311827957, "grad_norm": 0.2478746473789215, "learning_rate": 9.51010352504157e-05, "loss": 1.1552, "step": 46 }, { "epoch": 1.010752688172043, "grad_norm": 0.2573375105857849, "learning_rate": 9.400442765877141e-05, "loss": 2.141, "step": 47 }, { "epoch": 1.032258064516129, "grad_norm": 0.12741310894489288, "learning_rate": 9.288646727010848e-05, "loss": 1.0358, "step": 48 }, { "epoch": 1.053763440860215, "grad_norm": 0.1643674373626709, "learning_rate": 9.174781710236128e-05, "loss": 1.1719, "step": 49 }, { "epoch": 1.075268817204301, "grad_norm": 0.19035093486309052, "learning_rate": 9.058915244375091e-05, "loss": 1.0132, "step": 50 }, { "epoch": 1.075268817204301, "eval_loss": 1.1688854694366455, "eval_runtime": 3.3032, "eval_samples_per_second": 188.91, "eval_steps_per_second": 6.055, "step": 50 }, { "epoch": 1.096774193548387, "grad_norm": 0.24456371366977692, "learning_rate": 8.94111604522987e-05, "loss": 1.1068, "step": 51 }, { "epoch": 1.118279569892473, "grad_norm": 0.34046271443367004, "learning_rate": 8.821453974829996e-05, "loss": 1.2428, "step": 52 }, { "epoch": 1.139784946236559, "grad_norm": 0.247173473238945, "learning_rate": 8.7e-05, "loss": 0.9179, "step": 53 }, { "epoch": 1.1612903225806452, "grad_norm": 0.17861323058605194, "learning_rate": 8.576826150271813e-05, "loss": 1.3754, "step": 54 }, { "epoch": 1.1827956989247312, "grad_norm": 0.20689214766025543, "learning_rate": 8.452005475166903e-05, "loss": 1.2233, "step": 55 }, { "epoch": 1.2043010752688172, "grad_norm": 0.2131056934595108, "learning_rate": 8.325612000873509e-05, "loss": 1.0103, "step": 56 }, { "epoch": 1.2258064516129032, "grad_norm": 0.26632529497146606, "learning_rate": 8.197720686344642e-05, "loss": 1.0388, "step": 57 }, { "epoch": 1.2473118279569892, "grad_norm": 0.29926690459251404, "learning_rate": 8.068407378842904e-05, "loss": 1.1619, "step": 58 }, { "epoch": 1.2688172043010753, "grad_norm": 0.21377074718475342, "learning_rate": 7.937748768958499e-05, "loss": 0.548, "step": 59 }, { "epoch": 1.2903225806451613, "grad_norm": 0.215475931763649, "learning_rate": 7.805822345127066e-05, "loss": 1.9897, "step": 60 }, { "epoch": 1.3118279569892473, "grad_norm": 0.1549675166606903, "learning_rate": 7.672706347674388e-05, "loss": 0.9913, "step": 61 }, { "epoch": 1.3333333333333333, "grad_norm": 0.21127307415008545, "learning_rate": 7.53847972241514e-05, "loss": 1.0833, "step": 62 }, { "epoch": 1.3548387096774195, "grad_norm": 0.24489474296569824, "learning_rate": 7.403222073833276e-05, "loss": 1.0426, "step": 63 }, { "epoch": 1.3763440860215055, "grad_norm": 0.3033023178577423, "learning_rate": 7.267013617871748e-05, "loss": 1.1059, "step": 64 }, { "epoch": 1.3978494623655915, "grad_norm": 0.22320985794067383, "learning_rate": 7.129935134359642e-05, "loss": 0.6576, "step": 65 }, { "epoch": 1.4193548387096775, "grad_norm": 0.23948603868484497, "learning_rate": 6.992067919104844e-05, "loss": 1.8893, "step": 66 }, { "epoch": 1.4408602150537635, "grad_norm": 0.1648971438407898, "learning_rate": 6.85349373568073e-05, "loss": 0.9948, "step": 67 }, { "epoch": 1.4623655913978495, "grad_norm": 0.20279648900032043, "learning_rate": 6.714294766935446e-05, "loss": 1.0688, "step": 68 }, { "epoch": 1.4838709677419355, "grad_norm": 0.25218087434768677, "learning_rate": 6.574553566252508e-05, "loss": 1.0871, "step": 69 }, { "epoch": 1.5053763440860215, "grad_norm": 0.2963137626647949, "learning_rate": 6.434353008591673e-05, "loss": 1.0764, "step": 70 }, { "epoch": 1.5268817204301075, "grad_norm": 0.41593724489212036, "learning_rate": 6.293776241339087e-05, "loss": 1.2876, "step": 71 }, { "epoch": 1.5483870967741935, "grad_norm": 0.3219810426235199, "learning_rate": 6.152906634995881e-05, "loss": 1.1358, "step": 72 }, { "epoch": 1.5698924731182795, "grad_norm": 0.14590708911418915, "learning_rate": 6.011827733734423e-05, "loss": 1.0, "step": 73 }, { "epoch": 1.5913978494623655, "grad_norm": 0.18871888518333435, "learning_rate": 5.870623205851586e-05, "loss": 1.2011, "step": 74 }, { "epoch": 1.6129032258064515, "grad_norm": 0.20429526269435883, "learning_rate": 5.729376794148415e-05, "loss": 1.0139, "step": 75 }, { "epoch": 1.6344086021505375, "grad_norm": 0.2530740201473236, "learning_rate": 5.588172266265578e-05, "loss": 1.053, "step": 76 }, { "epoch": 1.6559139784946235, "grad_norm": 0.3459080755710602, "learning_rate": 5.4470933650041196e-05, "loss": 1.1392, "step": 77 }, { "epoch": 1.6774193548387095, "grad_norm": 0.2594045102596283, "learning_rate": 5.3062237586609127e-05, "loss": 0.8978, "step": 78 }, { "epoch": 1.6989247311827957, "grad_norm": 0.1727474331855774, "learning_rate": 5.16564699140833e-05, "loss": 1.4071, "step": 79 }, { "epoch": 1.7204301075268817, "grad_norm": 0.17758332192897797, "learning_rate": 5.025446433747493e-05, "loss": 1.1052, "step": 80 }, { "epoch": 1.7419354838709677, "grad_norm": 0.19625383615493774, "learning_rate": 4.885705233064554e-05, "loss": 1.012, "step": 81 }, { "epoch": 1.7634408602150538, "grad_norm": 0.25267520546913147, "learning_rate": 4.746506264319269e-05, "loss": 1.0759, "step": 82 }, { "epoch": 1.7849462365591398, "grad_norm": 0.31696927547454834, "learning_rate": 4.6079320808951565e-05, "loss": 1.1305, "step": 83 }, { "epoch": 1.8064516129032258, "grad_norm": 0.22307011485099792, "learning_rate": 4.470064865640358e-05, "loss": 0.5376, "step": 84 }, { "epoch": 1.827956989247312, "grad_norm": 0.2187909334897995, "learning_rate": 4.3329863821282514e-05, "loss": 1.8596, "step": 85 }, { "epoch": 1.849462365591398, "grad_norm": 0.16456525027751923, "learning_rate": 4.1967779261667245e-05, "loss": 1.0951, "step": 86 }, { "epoch": 1.870967741935484, "grad_norm": 0.19617106020450592, "learning_rate": 4.06152027758486e-05, "loss": 1.0345, "step": 87 }, { "epoch": 1.89247311827957, "grad_norm": 0.23399649560451508, "learning_rate": 3.9272936523256134e-05, "loss": 1.0037, "step": 88 }, { "epoch": 1.913978494623656, "grad_norm": 0.3092529773712158, "learning_rate": 3.794177654872934e-05, "loss": 1.1699, "step": 89 }, { "epoch": 1.935483870967742, "grad_norm": 0.23798301815986633, "learning_rate": 3.662251231041502e-05, "loss": 0.5823, "step": 90 }, { "epoch": 1.956989247311828, "grad_norm": 0.250249445438385, "learning_rate": 3.531592621157096e-05, "loss": 1.6714, "step": 91 }, { "epoch": 1.978494623655914, "grad_norm": 0.2460961937904358, "learning_rate": 3.402279313655359e-05, "loss": 1.2035, "step": 92 }, { "epoch": 2.0, "grad_norm": 0.3106490969657898, "learning_rate": 3.274387999126492e-05, "loss": 1.5907, "step": 93 }, { "epoch": 2.021505376344086, "grad_norm": 0.14660653471946716, "learning_rate": 3.1479945248330964e-05, "loss": 1.2345, "step": 94 }, { "epoch": 2.043010752688172, "grad_norm": 0.17234809696674347, "learning_rate": 3.023173849728189e-05, "loss": 1.1059, "step": 95 }, { "epoch": 2.064516129032258, "grad_norm": 0.18329447507858276, "learning_rate": 2.9000000000000014e-05, "loss": 0.9879, "step": 96 }, { "epoch": 2.086021505376344, "grad_norm": 0.222031369805336, "learning_rate": 2.7785460251700053e-05, "loss": 0.9824, "step": 97 }, { "epoch": 2.10752688172043, "grad_norm": 0.284976065158844, "learning_rate": 2.6588839547701294e-05, "loss": 1.0231, "step": 98 }, { "epoch": 2.129032258064516, "grad_norm": 0.3977857828140259, "learning_rate": 2.541084755624909e-05, "loss": 1.0588, "step": 99 }, { "epoch": 2.150537634408602, "grad_norm": 0.1437109261751175, "learning_rate": 2.4252182897638746e-05, "loss": 1.1997, "step": 100 }, { "epoch": 2.150537634408602, "eval_loss": 1.1512540578842163, "eval_runtime": 3.6809, "eval_samples_per_second": 169.524, "eval_steps_per_second": 5.433, "step": 100 }, { "epoch": 2.172043010752688, "grad_norm": 0.17709468305110931, "learning_rate": 2.3113532729891522e-05, "loss": 1.1533, "step": 101 }, { "epoch": 2.193548387096774, "grad_norm": 0.1920265406370163, "learning_rate": 2.1995572341228588e-05, "loss": 1.0311, "step": 102 }, { "epoch": 2.21505376344086, "grad_norm": 0.2193988710641861, "learning_rate": 2.089896474958432e-05, "loss": 0.9886, "step": 103 }, { "epoch": 2.236559139784946, "grad_norm": 0.2702076733112335, "learning_rate": 1.9824360309396626e-05, "loss": 1.0325, "step": 104 }, { "epoch": 2.258064516129032, "grad_norm": 0.36857450008392334, "learning_rate": 1.877239632590764e-05, "loss": 1.0513, "step": 105 }, { "epoch": 2.279569892473118, "grad_norm": 0.1756919026374817, "learning_rate": 1.774369667720323e-05, "loss": 1.2273, "step": 106 }, { "epoch": 2.3010752688172045, "grad_norm": 0.17812420427799225, "learning_rate": 1.67388714442157e-05, "loss": 1.1593, "step": 107 }, { "epoch": 2.3225806451612905, "grad_norm": 0.18077994883060455, "learning_rate": 1.575851654890888e-05, "loss": 1.0257, "step": 108 }, { "epoch": 2.3440860215053765, "grad_norm": 0.21289852261543274, "learning_rate": 1.4803213400860651e-05, "loss": 0.9742, "step": 109 }, { "epoch": 2.3655913978494625, "grad_norm": 0.2521963119506836, "learning_rate": 1.3873528552451873e-05, "loss": 0.9653, "step": 110 }, { "epoch": 2.3870967741935485, "grad_norm": 0.32557451725006104, "learning_rate": 1.2970013362866697e-05, "loss": 1.013, "step": 111 }, { "epoch": 2.4086021505376345, "grad_norm": 0.2451455295085907, "learning_rate": 1.2093203671103267e-05, "loss": 1.2257, "step": 112 }, { "epoch": 2.4301075268817205, "grad_norm": 0.1599583923816681, "learning_rate": 1.1243619478188961e-05, "loss": 1.1622, "step": 113 }, { "epoch": 2.4516129032258065, "grad_norm": 0.17558430135250092, "learning_rate": 1.0421764638788365e-05, "loss": 1.0423, "step": 114 }, { "epoch": 2.4731182795698925, "grad_norm": 0.20616887509822845, "learning_rate": 9.628126562387086e-06, "loss": 0.9993, "step": 115 }, { "epoch": 2.4946236559139785, "grad_norm": 0.24216623604297638, "learning_rate": 8.863175924228501e-06, "loss": 0.9776, "step": 116 }, { "epoch": 2.5161290322580645, "grad_norm": 0.3300863802433014, "learning_rate": 8.127366386175014e-06, "loss": 1.054, "step": 117 }, { "epoch": 2.5376344086021505, "grad_norm": 0.5971299409866333, "learning_rate": 7.421134327659152e-06, "loss": 1.0757, "step": 118 }, { "epoch": 2.5591397849462365, "grad_norm": 0.14677409827709198, "learning_rate": 6.744898586884296e-06, "loss": 1.2074, "step": 119 }, { "epoch": 2.5806451612903225, "grad_norm": 0.17917855083942413, "learning_rate": 6.099060212428274e-06, "loss": 1.0735, "step": 120 }, { "epoch": 2.6021505376344085, "grad_norm": 0.2020336389541626, "learning_rate": 5.484002225397496e-06, "loss": 0.9547, "step": 121 }, { "epoch": 2.6236559139784945, "grad_norm": 0.23973648250102997, "learning_rate": 4.900089392272253e-06, "loss": 0.9674, "step": 122 }, { "epoch": 2.6451612903225805, "grad_norm": 0.2944413423538208, "learning_rate": 4.347668008578187e-06, "loss": 1.0609, "step": 123 }, { "epoch": 2.6666666666666665, "grad_norm": 0.40398478507995605, "learning_rate": 3.8270656935122204e-06, "loss": 1.0409, "step": 124 }, { "epoch": 2.688172043010753, "grad_norm": 0.15678246319293976, "learning_rate": 3.3385911956445625e-06, "loss": 1.2516, "step": 125 }, { "epoch": 2.709677419354839, "grad_norm": 0.18234221637248993, "learning_rate": 2.8825342098122193e-06, "loss": 1.1226, "step": 126 }, { "epoch": 2.731182795698925, "grad_norm": 0.19632849097251892, "learning_rate": 2.4591652053124607e-06, "loss": 1.0385, "step": 127 }, { "epoch": 2.752688172043011, "grad_norm": 0.22743487358093262, "learning_rate": 2.068735265498204e-06, "loss": 0.9646, "step": 128 }, { "epoch": 2.774193548387097, "grad_norm": 0.28680744767189026, "learning_rate": 1.711475938870494e-06, "loss": 0.9698, "step": 129 }, { "epoch": 2.795698924731183, "grad_norm": 0.37023359537124634, "learning_rate": 1.3875991017562305e-06, "loss": 1.0446, "step": 130 }, { "epoch": 2.817204301075269, "grad_norm": 0.17839553952217102, "learning_rate": 1.0972968326527323e-06, "loss": 1.2334, "step": 131 }, { "epoch": 2.838709677419355, "grad_norm": 0.17378120124340057, "learning_rate": 8.407412983136427e-07, "loss": 1.1619, "step": 132 }, { "epoch": 2.860215053763441, "grad_norm": 0.18641377985477448, "learning_rate": 6.180846516436054e-07, "loss": 1.0182, "step": 133 }, { "epoch": 2.881720430107527, "grad_norm": 0.2144346982240677, "learning_rate": 4.294589414624692e-07, "loss": 0.9406, "step": 134 }, { "epoch": 2.903225806451613, "grad_norm": 0.27374863624572754, "learning_rate": 2.7497603419232487e-07, "loss": 1.031, "step": 135 }, { "epoch": 2.924731182795699, "grad_norm": 0.3471708297729492, "learning_rate": 1.5472754751400464e-07, "loss": 1.0266, "step": 136 }, { "epoch": 2.946236559139785, "grad_norm": 0.24548716843128204, "learning_rate": 6.878479603226562e-08, "loss": 1.1736, "step": 137 }, { "epoch": 2.967741935483871, "grad_norm": 0.18722181022167206, "learning_rate": 1.71987489819172e-08, "loss": 1.0664, "step": 138 }, { "epoch": 2.989247311827957, "grad_norm": 0.27170056104660034, "learning_rate": 0.0, "loss": 0.9598, "step": 139 } ], "logging_steps": 1, "max_steps": 139, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.224234711308042e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }