{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0014025245441796, "eval_steps": 134, "global_step": 535, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018700327255726976, "grad_norm": 13.994741439819336, "learning_rate": 2e-05, "loss": 3.2962, "step": 1 }, { "epoch": 0.0037400654511453952, "grad_norm": 16.602272033691406, "learning_rate": 4e-05, "loss": 3.6632, "step": 2 }, { "epoch": 0.005610098176718092, "grad_norm": 21.41153335571289, "learning_rate": 6e-05, "loss": 3.8852, "step": 3 }, { "epoch": 0.0074801309022907905, "grad_norm": 15.502434730529785, "learning_rate": 8e-05, "loss": 3.8645, "step": 4 }, { "epoch": 0.009350163627863487, "grad_norm": 14.494356155395508, "learning_rate": 0.0001, "loss": 3.868, "step": 5 }, { "epoch": 0.011220196353436185, "grad_norm": 20.01993751525879, "learning_rate": 0.00012, "loss": 3.5737, "step": 6 }, { "epoch": 0.013090229079008883, "grad_norm": 14.023553848266602, "learning_rate": 0.00014, "loss": 3.2279, "step": 7 }, { "epoch": 0.014960261804581581, "grad_norm": 15.476705551147461, "learning_rate": 0.00016, "loss": 3.8968, "step": 8 }, { "epoch": 0.016830294530154277, "grad_norm": 14.212241172790527, "learning_rate": 0.00018, "loss": 3.401, "step": 9 }, { "epoch": 0.018700327255726974, "grad_norm": 14.703544616699219, "learning_rate": 0.0002, "loss": 3.4936, "step": 10 }, { "epoch": 0.020570359981299673, "grad_norm": 14.499024391174316, "learning_rate": 0.00019999820960091608, "loss": 2.5218, "step": 11 }, { "epoch": 0.02244039270687237, "grad_norm": 16.242597579956055, "learning_rate": 0.00019999283846777488, "loss": 3.0075, "step": 12 }, { "epoch": 0.02431042543244507, "grad_norm": 16.940950393676758, "learning_rate": 0.00019998388679290583, "loss": 2.8596, "step": 13 }, { "epoch": 0.026180458158017766, "grad_norm": 13.73834228515625, "learning_rate": 0.00019997135489685034, "loss": 3.0382, "step": 14 }, { "epoch": 0.028050490883590462, "grad_norm": 16.882173538208008, "learning_rate": 0.00019995524322835034, "loss": 3.4678, "step": 15 }, { "epoch": 0.029920523609163162, "grad_norm": 15.432026863098145, "learning_rate": 0.00019993555236433213, "loss": 3.4976, "step": 16 }, { "epoch": 0.031790556334735855, "grad_norm": 16.355684280395508, "learning_rate": 0.00019991228300988585, "loss": 3.3903, "step": 17 }, { "epoch": 0.033660589060308554, "grad_norm": 13.33781909942627, "learning_rate": 0.00019988543599824005, "loss": 3.1913, "step": 18 }, { "epoch": 0.035530621785881254, "grad_norm": 18.799007415771484, "learning_rate": 0.0001998550122907321, "loss": 3.2008, "step": 19 }, { "epoch": 0.03740065451145395, "grad_norm": 13.545116424560547, "learning_rate": 0.0001998210129767735, "loss": 2.8425, "step": 20 }, { "epoch": 0.03927068723702665, "grad_norm": 14.966565132141113, "learning_rate": 0.00019978343927381113, "loss": 2.8036, "step": 21 }, { "epoch": 0.04114071996259935, "grad_norm": 17.932737350463867, "learning_rate": 0.00019974229252728342, "loss": 2.9432, "step": 22 }, { "epoch": 0.043010752688172046, "grad_norm": 16.62327766418457, "learning_rate": 0.0001996975742105723, "loss": 3.0234, "step": 23 }, { "epoch": 0.04488078541374474, "grad_norm": 17.949251174926758, "learning_rate": 0.00019964928592495045, "loss": 3.0715, "step": 24 }, { "epoch": 0.04675081813931744, "grad_norm": 18.255725860595703, "learning_rate": 0.00019959742939952392, "loss": 3.155, "step": 25 }, { "epoch": 0.04862085086489014, "grad_norm": 22.37006187438965, "learning_rate": 0.0001995420064911702, "loss": 3.0856, "step": 26 }, { "epoch": 0.05049088359046283, "grad_norm": 23.708791732788086, "learning_rate": 0.00019948301918447183, "loss": 3.1826, "step": 27 }, { "epoch": 0.05236091631603553, "grad_norm": 21.139902114868164, "learning_rate": 0.00019942046959164515, "loss": 3.1289, "step": 28 }, { "epoch": 0.05423094904160823, "grad_norm": 18.88515281677246, "learning_rate": 0.0001993543599524649, "loss": 3.2124, "step": 29 }, { "epoch": 0.056100981767180924, "grad_norm": 22.860271453857422, "learning_rate": 0.00019928469263418374, "loss": 3.1165, "step": 30 }, { "epoch": 0.057971014492753624, "grad_norm": 25.16404151916504, "learning_rate": 0.0001992114701314478, "loss": 3.6229, "step": 31 }, { "epoch": 0.059841047218326324, "grad_norm": 20.945270538330078, "learning_rate": 0.00019913469506620707, "loss": 3.2879, "step": 32 }, { "epoch": 0.061711079943899017, "grad_norm": 19.237075805664062, "learning_rate": 0.0001990543701876217, "loss": 2.9019, "step": 33 }, { "epoch": 0.06358111266947171, "grad_norm": 21.761228561401367, "learning_rate": 0.0001989704983719635, "loss": 2.7975, "step": 34 }, { "epoch": 0.06545114539504442, "grad_norm": 18.43031120300293, "learning_rate": 0.00019888308262251285, "loss": 2.6259, "step": 35 }, { "epoch": 0.06732117812061711, "grad_norm": 29.478282928466797, "learning_rate": 0.00019879212606945136, "loss": 3.4395, "step": 36 }, { "epoch": 0.0691912108461898, "grad_norm": 20.588619232177734, "learning_rate": 0.00019869763196974957, "loss": 2.4942, "step": 37 }, { "epoch": 0.07106124357176251, "grad_norm": 25.75873374938965, "learning_rate": 0.0001985996037070505, "loss": 3.6293, "step": 38 }, { "epoch": 0.0729312762973352, "grad_norm": 23.140533447265625, "learning_rate": 0.00019849804479154837, "loss": 2.928, "step": 39 }, { "epoch": 0.0748013090229079, "grad_norm": 25.42648696899414, "learning_rate": 0.00019839295885986296, "loss": 3.1603, "step": 40 }, { "epoch": 0.0766713417484806, "grad_norm": 25.744718551635742, "learning_rate": 0.00019828434967490943, "loss": 3.3389, "step": 41 }, { "epoch": 0.0785413744740533, "grad_norm": 39.95504379272461, "learning_rate": 0.0001981722211257634, "loss": 3.6146, "step": 42 }, { "epoch": 0.08041140719962599, "grad_norm": 32.589569091796875, "learning_rate": 0.00019805657722752202, "loss": 2.9932, "step": 43 }, { "epoch": 0.0822814399251987, "grad_norm": 48.44812774658203, "learning_rate": 0.00019793742212115978, "loss": 3.2977, "step": 44 }, { "epoch": 0.08415147265077139, "grad_norm": 46.82399368286133, "learning_rate": 0.00019781476007338058, "loss": 3.3127, "step": 45 }, { "epoch": 0.08602150537634409, "grad_norm": 35.45515441894531, "learning_rate": 0.00019768859547646478, "loss": 3.409, "step": 46 }, { "epoch": 0.08789153810191679, "grad_norm": 28.829622268676758, "learning_rate": 0.00019755893284811196, "loss": 3.4779, "step": 47 }, { "epoch": 0.08976157082748948, "grad_norm": 42.49812316894531, "learning_rate": 0.00019742577683127911, "loss": 3.7014, "step": 48 }, { "epoch": 0.09163160355306219, "grad_norm": 77.00621795654297, "learning_rate": 0.0001972891321940145, "loss": 5.1841, "step": 49 }, { "epoch": 0.09350163627863488, "grad_norm": 46.6325798034668, "learning_rate": 0.00019714900382928675, "loss": 3.8944, "step": 50 }, { "epoch": 0.09537166900420757, "grad_norm": 118.46067810058594, "learning_rate": 0.0001970053967548098, "loss": 5.2813, "step": 51 }, { "epoch": 0.09724170172978028, "grad_norm": 94.66202545166016, "learning_rate": 0.0001968583161128631, "loss": 4.5682, "step": 52 }, { "epoch": 0.09911173445535297, "grad_norm": 60.450828552246094, "learning_rate": 0.00019670776717010767, "loss": 3.9013, "step": 53 }, { "epoch": 0.10098176718092566, "grad_norm": 25.923925399780273, "learning_rate": 0.0001965537553173972, "loss": 3.6057, "step": 54 }, { "epoch": 0.10285179990649837, "grad_norm": 14.4677152633667, "learning_rate": 0.00019639628606958533, "loss": 3.0911, "step": 55 }, { "epoch": 0.10472183263207106, "grad_norm": 12.290410041809082, "learning_rate": 0.000196235365065328, "loss": 2.6739, "step": 56 }, { "epoch": 0.10659186535764376, "grad_norm": 14.281597137451172, "learning_rate": 0.0001960709980668816, "loss": 2.8039, "step": 57 }, { "epoch": 0.10846189808321646, "grad_norm": 12.305088996887207, "learning_rate": 0.0001959031909598966, "loss": 2.8797, "step": 58 }, { "epoch": 0.11033193080878916, "grad_norm": 12.407671928405762, "learning_rate": 0.00019573194975320673, "loss": 2.402, "step": 59 }, { "epoch": 0.11220196353436185, "grad_norm": 12.547879219055176, "learning_rate": 0.0001955572805786141, "loss": 2.5572, "step": 60 }, { "epoch": 0.11407199625993455, "grad_norm": 15.23267650604248, "learning_rate": 0.0001953791896906692, "loss": 3.1069, "step": 61 }, { "epoch": 0.11594202898550725, "grad_norm": 14.644232749938965, "learning_rate": 0.00019519768346644737, "loss": 2.4441, "step": 62 }, { "epoch": 0.11781206171107994, "grad_norm": 14.574198722839355, "learning_rate": 0.00019501276840532016, "loss": 3.1865, "step": 63 }, { "epoch": 0.11968209443665265, "grad_norm": 15.284114837646484, "learning_rate": 0.00019482445112872264, "loss": 2.7122, "step": 64 }, { "epoch": 0.12155212716222534, "grad_norm": 14.484579086303711, "learning_rate": 0.00019463273837991643, "loss": 2.9405, "step": 65 }, { "epoch": 0.12342215988779803, "grad_norm": 13.775676727294922, "learning_rate": 0.00019443763702374812, "loss": 2.8403, "step": 66 }, { "epoch": 0.12529219261337074, "grad_norm": 14.479483604431152, "learning_rate": 0.0001942391540464035, "loss": 2.9306, "step": 67 }, { "epoch": 0.12716222533894342, "grad_norm": 13.77834701538086, "learning_rate": 0.00019403729655515737, "loss": 2.7441, "step": 68 }, { "epoch": 0.12903225806451613, "grad_norm": 17.22318458557129, "learning_rate": 0.0001938320717781191, "loss": 2.793, "step": 69 }, { "epoch": 0.13090229079008883, "grad_norm": 14.486649513244629, "learning_rate": 0.00019362348706397373, "loss": 2.8446, "step": 70 }, { "epoch": 0.1327723235156615, "grad_norm": 19.49801254272461, "learning_rate": 0.0001934115498817189, "loss": 3.0295, "step": 71 }, { "epoch": 0.13464235624123422, "grad_norm": 17.23444938659668, "learning_rate": 0.00019319626782039734, "loss": 2.9814, "step": 72 }, { "epoch": 0.13651238896680692, "grad_norm": 18.949792861938477, "learning_rate": 0.00019297764858882514, "loss": 3.0234, "step": 73 }, { "epoch": 0.1383824216923796, "grad_norm": 18.541202545166016, "learning_rate": 0.00019275570001531578, "loss": 3.1777, "step": 74 }, { "epoch": 0.1402524544179523, "grad_norm": 21.238248825073242, "learning_rate": 0.00019253043004739968, "loss": 3.677, "step": 75 }, { "epoch": 0.14212248714352502, "grad_norm": 24.113332748413086, "learning_rate": 0.00019230184675153976, "loss": 3.2631, "step": 76 }, { "epoch": 0.1439925198690977, "grad_norm": 17.838098526000977, "learning_rate": 0.00019206995831284242, "loss": 2.728, "step": 77 }, { "epoch": 0.1458625525946704, "grad_norm": 18.885387420654297, "learning_rate": 0.00019183477303476467, "loss": 2.9679, "step": 78 }, { "epoch": 0.1477325853202431, "grad_norm": 19.87195587158203, "learning_rate": 0.00019159629933881666, "loss": 2.9025, "step": 79 }, { "epoch": 0.1496026180458158, "grad_norm": 19.265392303466797, "learning_rate": 0.0001913545457642601, "loss": 3.4623, "step": 80 }, { "epoch": 0.1514726507713885, "grad_norm": 19.50520133972168, "learning_rate": 0.00019110952096780258, "loss": 3.1419, "step": 81 }, { "epoch": 0.1533426834969612, "grad_norm": 20.407455444335938, "learning_rate": 0.00019086123372328746, "loss": 2.5145, "step": 82 }, { "epoch": 0.15521271622253388, "grad_norm": 42.79829788208008, "learning_rate": 0.00019060969292137992, "loss": 3.0625, "step": 83 }, { "epoch": 0.1570827489481066, "grad_norm": 20.8466796875, "learning_rate": 0.00019035490756924832, "loss": 2.9626, "step": 84 }, { "epoch": 0.1589527816736793, "grad_norm": 21.628929138183594, "learning_rate": 0.0001900968867902419, "loss": 2.8822, "step": 85 }, { "epoch": 0.16082281439925197, "grad_norm": 29.18246078491211, "learning_rate": 0.00018983563982356405, "loss": 3.4651, "step": 86 }, { "epoch": 0.16269284712482468, "grad_norm": 26.07509422302246, "learning_rate": 0.0001895711760239413, "loss": 3.0619, "step": 87 }, { "epoch": 0.1645628798503974, "grad_norm": 39.00653076171875, "learning_rate": 0.00018930350486128856, "loss": 3.1362, "step": 88 }, { "epoch": 0.16643291257597007, "grad_norm": 23.455015182495117, "learning_rate": 0.00018903263592036989, "loss": 3.4385, "step": 89 }, { "epoch": 0.16830294530154277, "grad_norm": 29.16703224182129, "learning_rate": 0.00018875857890045543, "loss": 3.1283, "step": 90 }, { "epoch": 0.17017297802711548, "grad_norm": 29.75694465637207, "learning_rate": 0.00018848134361497385, "loss": 3.0399, "step": 91 }, { "epoch": 0.17204301075268819, "grad_norm": 28.572084426879883, "learning_rate": 0.00018820093999116124, "loss": 3.8989, "step": 92 }, { "epoch": 0.17391304347826086, "grad_norm": 50.12794876098633, "learning_rate": 0.00018791737806970538, "loss": 3.6389, "step": 93 }, { "epoch": 0.17578307620383357, "grad_norm": 23.859804153442383, "learning_rate": 0.00018763066800438636, "loss": 3.1504, "step": 94 }, { "epoch": 0.17765310892940628, "grad_norm": 44.48038864135742, "learning_rate": 0.00018734082006171299, "loss": 3.1353, "step": 95 }, { "epoch": 0.17952314165497896, "grad_norm": 26.45856475830078, "learning_rate": 0.00018704784462055503, "loss": 2.7182, "step": 96 }, { "epoch": 0.18139317438055166, "grad_norm": 30.120588302612305, "learning_rate": 0.00018675175217177175, "loss": 3.6129, "step": 97 }, { "epoch": 0.18326320710612437, "grad_norm": 61.866886138916016, "learning_rate": 0.00018645255331783617, "loss": 5.1282, "step": 98 }, { "epoch": 0.18513323983169705, "grad_norm": 39.70136260986328, "learning_rate": 0.00018615025877245523, "loss": 3.8535, "step": 99 }, { "epoch": 0.18700327255726976, "grad_norm": 51.10054397583008, "learning_rate": 0.00018584487936018661, "loss": 3.8204, "step": 100 }, { "epoch": 0.18887330528284246, "grad_norm": 160.09117126464844, "learning_rate": 0.00018553642601605068, "loss": 5.7671, "step": 101 }, { "epoch": 0.19074333800841514, "grad_norm": 126.96265411376953, "learning_rate": 0.0001852249097851391, "loss": 5.543, "step": 102 }, { "epoch": 0.19261337073398785, "grad_norm": 79.90239715576172, "learning_rate": 0.0001849103418222194, "loss": 4.1758, "step": 103 }, { "epoch": 0.19448340345956056, "grad_norm": 48.66373062133789, "learning_rate": 0.00018459273339133537, "loss": 3.4729, "step": 104 }, { "epoch": 0.19635343618513323, "grad_norm": 31.121978759765625, "learning_rate": 0.0001842720958654039, "loss": 3.2988, "step": 105 }, { "epoch": 0.19822346891070594, "grad_norm": 15.884181022644043, "learning_rate": 0.00018394844072580773, "loss": 3.4934, "step": 106 }, { "epoch": 0.20009350163627865, "grad_norm": 13.574240684509277, "learning_rate": 0.00018362177956198408, "loss": 2.6176, "step": 107 }, { "epoch": 0.20196353436185133, "grad_norm": 11.254271507263184, "learning_rate": 0.00018329212407100994, "loss": 2.3752, "step": 108 }, { "epoch": 0.20383356708742403, "grad_norm": 11.554123878479004, "learning_rate": 0.00018295948605718314, "loss": 2.295, "step": 109 }, { "epoch": 0.20570359981299674, "grad_norm": 11.747841835021973, "learning_rate": 0.0001826238774315995, "loss": 2.378, "step": 110 }, { "epoch": 0.20757363253856942, "grad_norm": 13.485661506652832, "learning_rate": 0.00018228531021172658, "loss": 2.6167, "step": 111 }, { "epoch": 0.20944366526414213, "grad_norm": 12.09218692779541, "learning_rate": 0.0001819437965209732, "loss": 2.1478, "step": 112 }, { "epoch": 0.21131369798971483, "grad_norm": 15.334200859069824, "learning_rate": 0.0001815993485882553, "loss": 2.7231, "step": 113 }, { "epoch": 0.2131837307152875, "grad_norm": 14.982645988464355, "learning_rate": 0.0001812519787475582, "loss": 2.7762, "step": 114 }, { "epoch": 0.21505376344086022, "grad_norm": 15.121198654174805, "learning_rate": 0.00018090169943749476, "loss": 2.6204, "step": 115 }, { "epoch": 0.21692379616643293, "grad_norm": 12.098959922790527, "learning_rate": 0.0001805485232008601, "loss": 2.6039, "step": 116 }, { "epoch": 0.2187938288920056, "grad_norm": 13.73119068145752, "learning_rate": 0.0001801924626841824, "loss": 2.7469, "step": 117 }, { "epoch": 0.2206638616175783, "grad_norm": 14.34265422821045, "learning_rate": 0.00017983353063727016, "loss": 3.0645, "step": 118 }, { "epoch": 0.22253389434315102, "grad_norm": 13.765779495239258, "learning_rate": 0.00017947173991275555, "loss": 3.014, "step": 119 }, { "epoch": 0.2244039270687237, "grad_norm": 17.305545806884766, "learning_rate": 0.00017910710346563416, "loss": 2.9139, "step": 120 }, { "epoch": 0.2262739597942964, "grad_norm": 13.76815414428711, "learning_rate": 0.00017873963435280121, "loss": 2.6729, "step": 121 }, { "epoch": 0.2281439925198691, "grad_norm": 16.517879486083984, "learning_rate": 0.000178369345732584, "loss": 3.1773, "step": 122 }, { "epoch": 0.2300140252454418, "grad_norm": 18.648420333862305, "learning_rate": 0.00017799625086427064, "loss": 3.5063, "step": 123 }, { "epoch": 0.2318840579710145, "grad_norm": 19.074386596679688, "learning_rate": 0.00017762036310763532, "loss": 2.743, "step": 124 }, { "epoch": 0.2337540906965872, "grad_norm": 16.391334533691406, "learning_rate": 0.00017724169592245995, "loss": 2.7288, "step": 125 }, { "epoch": 0.23562412342215988, "grad_norm": 26.434553146362305, "learning_rate": 0.0001768602628680522, "loss": 2.6386, "step": 126 }, { "epoch": 0.2374941561477326, "grad_norm": 18.820554733276367, "learning_rate": 0.00017647607760275987, "loss": 3.3189, "step": 127 }, { "epoch": 0.2393641888733053, "grad_norm": 18.246240615844727, "learning_rate": 0.00017608915388348187, "loss": 2.9794, "step": 128 }, { "epoch": 0.24123422159887797, "grad_norm": 24.38311004638672, "learning_rate": 0.00017569950556517566, "loss": 2.9017, "step": 129 }, { "epoch": 0.24310425432445068, "grad_norm": 17.729557037353516, "learning_rate": 0.00017530714660036112, "loss": 2.7735, "step": 130 }, { "epoch": 0.2449742870500234, "grad_norm": 20.780271530151367, "learning_rate": 0.00017491209103862084, "loss": 3.2586, "step": 131 }, { "epoch": 0.24684431977559607, "grad_norm": 25.706838607788086, "learning_rate": 0.00017451435302609714, "loss": 2.9574, "step": 132 }, { "epoch": 0.24871435250116877, "grad_norm": 17.675687789916992, "learning_rate": 0.0001741139468049855, "loss": 2.6356, "step": 133 }, { "epoch": 0.2505843852267415, "grad_norm": 23.035436630249023, "learning_rate": 0.0001737108867130245, "loss": 3.3455, "step": 134 }, { "epoch": 0.2505843852267415, "eval_loss": 3.025357723236084, "eval_runtime": 12.856, "eval_samples_per_second": 17.579, "eval_steps_per_second": 8.79, "step": 134 }, { "epoch": 0.25245441795231416, "grad_norm": 17.924945831298828, "learning_rate": 0.00017330518718298264, "loss": 2.8946, "step": 135 }, { "epoch": 0.25432445067788684, "grad_norm": 20.890548706054688, "learning_rate": 0.00017289686274214118, "loss": 2.9531, "step": 136 }, { "epoch": 0.25619448340345957, "grad_norm": 19.297199249267578, "learning_rate": 0.0001724859280117742, "loss": 2.7719, "step": 137 }, { "epoch": 0.25806451612903225, "grad_norm": 23.349380493164062, "learning_rate": 0.000172072397706625, "loss": 3.5567, "step": 138 }, { "epoch": 0.25993454885460493, "grad_norm": 24.923961639404297, "learning_rate": 0.00017165628663437922, "loss": 3.113, "step": 139 }, { "epoch": 0.26180458158017766, "grad_norm": 21.864042282104492, "learning_rate": 0.0001712376096951345, "loss": 2.9502, "step": 140 }, { "epoch": 0.26367461430575034, "grad_norm": 30.322755813598633, "learning_rate": 0.00017081638188086697, "loss": 3.7115, "step": 141 }, { "epoch": 0.265544647031323, "grad_norm": 23.991077423095703, "learning_rate": 0.0001703926182748945, "loss": 3.3406, "step": 142 }, { "epoch": 0.26741467975689576, "grad_norm": 24.86219024658203, "learning_rate": 0.00016996633405133655, "loss": 3.347, "step": 143 }, { "epoch": 0.26928471248246844, "grad_norm": 29.057273864746094, "learning_rate": 0.00016953754447457078, "loss": 2.6652, "step": 144 }, { "epoch": 0.2711547452080411, "grad_norm": 25.7908992767334, "learning_rate": 0.00016910626489868649, "loss": 3.1458, "step": 145 }, { "epoch": 0.27302477793361385, "grad_norm": 27.767778396606445, "learning_rate": 0.00016867251076693482, "loss": 3.3319, "step": 146 }, { "epoch": 0.27489481065918653, "grad_norm": 30.76727294921875, "learning_rate": 0.0001682362976111758, "loss": 3.682, "step": 147 }, { "epoch": 0.2767648433847592, "grad_norm": 25.062744140625, "learning_rate": 0.0001677976410513221, "loss": 3.0067, "step": 148 }, { "epoch": 0.27863487611033194, "grad_norm": 48.073890686035156, "learning_rate": 0.00016735655679477979, "loss": 4.0274, "step": 149 }, { "epoch": 0.2805049088359046, "grad_norm": 44.6626091003418, "learning_rate": 0.00016691306063588583, "loss": 3.1489, "step": 150 }, { "epoch": 0.2823749415614773, "grad_norm": 73.71078491210938, "learning_rate": 0.0001664671684553426, "loss": 4.4787, "step": 151 }, { "epoch": 0.28424497428705003, "grad_norm": 56.24951934814453, "learning_rate": 0.00016601889621964904, "loss": 3.7768, "step": 152 }, { "epoch": 0.2861150070126227, "grad_norm": 46.72991943359375, "learning_rate": 0.00016556825998052924, "loss": 3.5245, "step": 153 }, { "epoch": 0.2879850397381954, "grad_norm": 31.89575958251953, "learning_rate": 0.00016511527587435737, "loss": 3.4228, "step": 154 }, { "epoch": 0.2898550724637681, "grad_norm": 18.977670669555664, "learning_rate": 0.00016465996012157995, "loss": 3.0195, "step": 155 }, { "epoch": 0.2917251051893408, "grad_norm": 12.196249008178711, "learning_rate": 0.00016420232902613523, "loss": 2.8431, "step": 156 }, { "epoch": 0.2935951379149135, "grad_norm": 11.746070861816406, "learning_rate": 0.000163742398974869, "loss": 2.5194, "step": 157 }, { "epoch": 0.2954651706404862, "grad_norm": 12.877666473388672, "learning_rate": 0.00016328018643694812, "loss": 2.695, "step": 158 }, { "epoch": 0.2973352033660589, "grad_norm": 11.190670013427734, "learning_rate": 0.00016281570796327068, "loss": 3.0105, "step": 159 }, { "epoch": 0.2992052360916316, "grad_norm": 12.561261177062988, "learning_rate": 0.00016234898018587337, "loss": 2.113, "step": 160 }, { "epoch": 0.3010752688172043, "grad_norm": 20.69705581665039, "learning_rate": 0.00016188001981733588, "loss": 2.7286, "step": 161 }, { "epoch": 0.302945301542777, "grad_norm": 13.717788696289062, "learning_rate": 0.00016140884365018252, "loss": 2.9779, "step": 162 }, { "epoch": 0.30481533426834967, "grad_norm": 12.364960670471191, "learning_rate": 0.00016093546855628084, "loss": 2.6577, "step": 163 }, { "epoch": 0.3066853669939224, "grad_norm": 12.219094276428223, "learning_rate": 0.0001604599114862375, "loss": 2.3431, "step": 164 }, { "epoch": 0.3085553997194951, "grad_norm": 14.56771183013916, "learning_rate": 0.00015998218946879138, "loss": 2.7715, "step": 165 }, { "epoch": 0.31042543244506776, "grad_norm": 14.490442276000977, "learning_rate": 0.00015950231961020373, "loss": 2.7294, "step": 166 }, { "epoch": 0.3122954651706405, "grad_norm": 13.455503463745117, "learning_rate": 0.00015902031909364564, "loss": 2.7422, "step": 167 }, { "epoch": 0.3141654978962132, "grad_norm": 14.774016380310059, "learning_rate": 0.00015853620517858276, "loss": 2.807, "step": 168 }, { "epoch": 0.31603553062178585, "grad_norm": 15.136216163635254, "learning_rate": 0.00015804999520015734, "loss": 2.8907, "step": 169 }, { "epoch": 0.3179055633473586, "grad_norm": 14.85161018371582, "learning_rate": 0.00015756170656856737, "loss": 2.8376, "step": 170 }, { "epoch": 0.31977559607293127, "grad_norm": 16.69162368774414, "learning_rate": 0.0001570713567684432, "loss": 2.8552, "step": 171 }, { "epoch": 0.32164562879850395, "grad_norm": 13.032023429870605, "learning_rate": 0.00015657896335822147, "loss": 3.02, "step": 172 }, { "epoch": 0.3235156615240767, "grad_norm": 16.295326232910156, "learning_rate": 0.00015608454396951645, "loss": 3.0504, "step": 173 }, { "epoch": 0.32538569424964936, "grad_norm": 15.998780250549316, "learning_rate": 0.00015558811630648846, "loss": 3.0265, "step": 174 }, { "epoch": 0.32725572697522204, "grad_norm": 25.61530113220215, "learning_rate": 0.00015508969814521025, "loss": 2.8652, "step": 175 }, { "epoch": 0.3291257597007948, "grad_norm": 15.13475227355957, "learning_rate": 0.00015458930733303018, "loss": 2.5502, "step": 176 }, { "epoch": 0.33099579242636745, "grad_norm": 13.900614738464355, "learning_rate": 0.00015408696178793331, "loss": 2.9654, "step": 177 }, { "epoch": 0.33286582515194013, "grad_norm": 16.07615852355957, "learning_rate": 0.00015358267949789966, "loss": 2.7143, "step": 178 }, { "epoch": 0.33473585787751287, "grad_norm": 18.99631118774414, "learning_rate": 0.0001530764785202603, "loss": 2.8096, "step": 179 }, { "epoch": 0.33660589060308554, "grad_norm": 20.583770751953125, "learning_rate": 0.00015256837698105047, "loss": 2.9865, "step": 180 }, { "epoch": 0.3384759233286583, "grad_norm": 18.920778274536133, "learning_rate": 0.00015205839307436088, "loss": 3.3376, "step": 181 }, { "epoch": 0.34034595605423096, "grad_norm": 18.719928741455078, "learning_rate": 0.00015154654506168585, "loss": 2.7643, "step": 182 }, { "epoch": 0.34221598877980364, "grad_norm": 22.60373878479004, "learning_rate": 0.00015103285127126962, "loss": 2.9539, "step": 183 }, { "epoch": 0.34408602150537637, "grad_norm": 19.586341857910156, "learning_rate": 0.00015051733009745013, "loss": 2.5908, "step": 184 }, { "epoch": 0.34595605423094905, "grad_norm": 20.427947998046875, "learning_rate": 0.00015000000000000001, "loss": 2.6959, "step": 185 }, { "epoch": 0.34782608695652173, "grad_norm": 19.461990356445312, "learning_rate": 0.000149480879503466, "loss": 2.9813, "step": 186 }, { "epoch": 0.34969611968209446, "grad_norm": 23.931228637695312, "learning_rate": 0.00014895998719650526, "loss": 3.047, "step": 187 }, { "epoch": 0.35156615240766714, "grad_norm": 19.459693908691406, "learning_rate": 0.00014843734173122002, "loss": 2.9688, "step": 188 }, { "epoch": 0.3534361851332398, "grad_norm": 25.261966705322266, "learning_rate": 0.0001479129618224895, "loss": 2.8956, "step": 189 }, { "epoch": 0.35530621785881256, "grad_norm": 23.84093475341797, "learning_rate": 0.00014738686624729986, "loss": 3.4096, "step": 190 }, { "epoch": 0.35717625058438524, "grad_norm": 23.549619674682617, "learning_rate": 0.00014685907384407186, "loss": 2.9033, "step": 191 }, { "epoch": 0.3590462833099579, "grad_norm": 21.206594467163086, "learning_rate": 0.00014632960351198618, "loss": 3.0757, "step": 192 }, { "epoch": 0.36091631603553065, "grad_norm": 21.78272247314453, "learning_rate": 0.00014579847421030678, "loss": 2.9197, "step": 193 }, { "epoch": 0.3627863487611033, "grad_norm": 24.570682525634766, "learning_rate": 0.00014526570495770194, "loss": 3.6499, "step": 194 }, { "epoch": 0.364656381486676, "grad_norm": 28.85085105895996, "learning_rate": 0.00014473131483156327, "loss": 2.6047, "step": 195 }, { "epoch": 0.36652641421224874, "grad_norm": 27.24856948852539, "learning_rate": 0.0001441953229673227, "loss": 3.0789, "step": 196 }, { "epoch": 0.3683964469378214, "grad_norm": 28.561187744140625, "learning_rate": 0.000143657748557767, "loss": 3.4813, "step": 197 }, { "epoch": 0.3702664796633941, "grad_norm": 27.148723602294922, "learning_rate": 0.00014311861085235085, "loss": 3.0691, "step": 198 }, { "epoch": 0.37213651238896683, "grad_norm": 31.670246124267578, "learning_rate": 0.00014257792915650728, "loss": 3.9914, "step": 199 }, { "epoch": 0.3740065451145395, "grad_norm": 42.1184196472168, "learning_rate": 0.00014203572283095657, "loss": 4.6206, "step": 200 }, { "epoch": 0.3758765778401122, "grad_norm": 40.36967086791992, "learning_rate": 0.00014149201129101286, "loss": 3.2648, "step": 201 }, { "epoch": 0.3777466105656849, "grad_norm": 38.3328971862793, "learning_rate": 0.00014094681400588906, "loss": 3.248, "step": 202 }, { "epoch": 0.3796166432912576, "grad_norm": 36.41666030883789, "learning_rate": 0.00014040015049799953, "loss": 3.2585, "step": 203 }, { "epoch": 0.3814866760168303, "grad_norm": 23.713642120361328, "learning_rate": 0.00013985204034226115, "loss": 2.8715, "step": 204 }, { "epoch": 0.383356708742403, "grad_norm": 13.198444366455078, "learning_rate": 0.00013930250316539238, "loss": 3.0464, "step": 205 }, { "epoch": 0.3852267414679757, "grad_norm": 11.255416870117188, "learning_rate": 0.0001387515586452103, "loss": 2.7254, "step": 206 }, { "epoch": 0.3870967741935484, "grad_norm": 11.445615768432617, "learning_rate": 0.00013819922650992625, "loss": 2.3275, "step": 207 }, { "epoch": 0.3889668069191211, "grad_norm": 11.428658485412598, "learning_rate": 0.0001376455265374392, "loss": 2.7019, "step": 208 }, { "epoch": 0.3908368396446938, "grad_norm": 11.425288200378418, "learning_rate": 0.00013709047855462765, "loss": 2.6879, "step": 209 }, { "epoch": 0.39270687237026647, "grad_norm": 9.648893356323242, "learning_rate": 0.00013653410243663952, "loss": 2.1931, "step": 210 }, { "epoch": 0.3945769050958392, "grad_norm": 12.493631362915039, "learning_rate": 0.00013597641810618073, "loss": 2.5069, "step": 211 }, { "epoch": 0.3964469378214119, "grad_norm": 11.868135452270508, "learning_rate": 0.0001354174455328015, "loss": 2.8156, "step": 212 }, { "epoch": 0.39831697054698456, "grad_norm": 10.956955909729004, "learning_rate": 0.00013485720473218154, "loss": 2.3458, "step": 213 }, { "epoch": 0.4001870032725573, "grad_norm": 12.486101150512695, "learning_rate": 0.00013429571576541315, "loss": 2.559, "step": 214 }, { "epoch": 0.40205703599813, "grad_norm": 13.160475730895996, "learning_rate": 0.00013373299873828303, "loss": 2.5989, "step": 215 }, { "epoch": 0.40392706872370265, "grad_norm": 14.06461238861084, "learning_rate": 0.00013316907380055208, "loss": 2.9656, "step": 216 }, { "epoch": 0.4057971014492754, "grad_norm": 13.031517028808594, "learning_rate": 0.0001326039611452342, "loss": 2.494, "step": 217 }, { "epoch": 0.40766713417484807, "grad_norm": 12.205702781677246, "learning_rate": 0.00013203768100787297, "loss": 2.4097, "step": 218 }, { "epoch": 0.40953716690042075, "grad_norm": 13.180871963500977, "learning_rate": 0.0001314702536658172, "loss": 2.5468, "step": 219 }, { "epoch": 0.4114071996259935, "grad_norm": 16.636207580566406, "learning_rate": 0.00013090169943749476, "loss": 3.3479, "step": 220 }, { "epoch": 0.41327723235156616, "grad_norm": 13.308319091796875, "learning_rate": 0.000130332038681685, "loss": 2.597, "step": 221 }, { "epoch": 0.41514726507713884, "grad_norm": 13.627033233642578, "learning_rate": 0.00012976129179678988, "loss": 2.4171, "step": 222 }, { "epoch": 0.4170172978027116, "grad_norm": 15.449974060058594, "learning_rate": 0.00012918947922010336, "loss": 3.2259, "step": 223 }, { "epoch": 0.41888733052828425, "grad_norm": 16.265350341796875, "learning_rate": 0.00012861662142707968, "loss": 2.7149, "step": 224 }, { "epoch": 0.42075736325385693, "grad_norm": 14.941640853881836, "learning_rate": 0.00012804273893060028, "loss": 2.9722, "step": 225 }, { "epoch": 0.42262739597942967, "grad_norm": 19.596858978271484, "learning_rate": 0.00012746785228023904, "loss": 2.9886, "step": 226 }, { "epoch": 0.42449742870500234, "grad_norm": 23.8212947845459, "learning_rate": 0.00012689198206152657, "loss": 3.0503, "step": 227 }, { "epoch": 0.426367461430575, "grad_norm": 16.560731887817383, "learning_rate": 0.0001263151488952132, "loss": 2.6271, "step": 228 }, { "epoch": 0.42823749415614776, "grad_norm": 16.66946029663086, "learning_rate": 0.00012573737343653024, "loss": 2.804, "step": 229 }, { "epoch": 0.43010752688172044, "grad_norm": 22.840084075927734, "learning_rate": 0.00012515867637445086, "loss": 2.7086, "step": 230 }, { "epoch": 0.4319775596072931, "grad_norm": 14.753687858581543, "learning_rate": 0.00012457907843094882, "loss": 2.5197, "step": 231 }, { "epoch": 0.43384759233286585, "grad_norm": 14.791753768920898, "learning_rate": 0.0001239986003602566, "loss": 2.394, "step": 232 }, { "epoch": 0.43571762505843853, "grad_norm": 22.011295318603516, "learning_rate": 0.00012341726294812238, "loss": 2.5769, "step": 233 }, { "epoch": 0.4375876577840112, "grad_norm": 20.5321102142334, "learning_rate": 0.00012283508701106557, "loss": 3.0704, "step": 234 }, { "epoch": 0.43945769050958394, "grad_norm": 16.91661262512207, "learning_rate": 0.00012225209339563145, "loss": 2.8309, "step": 235 }, { "epoch": 0.4413277232351566, "grad_norm": 26.08629608154297, "learning_rate": 0.00012166830297764471, "loss": 3.0209, "step": 236 }, { "epoch": 0.4431977559607293, "grad_norm": 19.537260055541992, "learning_rate": 0.00012108373666146191, "loss": 2.8018, "step": 237 }, { "epoch": 0.44506778868630203, "grad_norm": 23.111852645874023, "learning_rate": 0.00012049841537922307, "loss": 3.0249, "step": 238 }, { "epoch": 0.4469378214118747, "grad_norm": 19.322980880737305, "learning_rate": 0.00011991236009010183, "loss": 3.0254, "step": 239 }, { "epoch": 0.4488078541374474, "grad_norm": 30.11720848083496, "learning_rate": 0.00011932559177955533, "loss": 3.9059, "step": 240 }, { "epoch": 0.4506778868630201, "grad_norm": 24.715557098388672, "learning_rate": 0.00011873813145857249, "loss": 3.1555, "step": 241 }, { "epoch": 0.4525479195885928, "grad_norm": 29.622190475463867, "learning_rate": 0.00011815000016292164, "loss": 2.6222, "step": 242 }, { "epoch": 0.4544179523141655, "grad_norm": 24.93659019470215, "learning_rate": 0.00011756121895239753, "loss": 2.9839, "step": 243 }, { "epoch": 0.4562879850397382, "grad_norm": 26.67534637451172, "learning_rate": 0.00011697180891006689, "loss": 3.1704, "step": 244 }, { "epoch": 0.4581580177653109, "grad_norm": 25.920337677001953, "learning_rate": 0.00011638179114151377, "loss": 2.653, "step": 245 }, { "epoch": 0.4600280504908836, "grad_norm": 29.56134796142578, "learning_rate": 0.0001157911867740836, "loss": 2.9653, "step": 246 }, { "epoch": 0.4618980832164563, "grad_norm": 35.859981536865234, "learning_rate": 0.00011520001695612674, "loss": 3.6873, "step": 247 }, { "epoch": 0.463768115942029, "grad_norm": 41.76987838745117, "learning_rate": 0.00011460830285624118, "loss": 3.7447, "step": 248 }, { "epoch": 0.46563814866760167, "grad_norm": 38.14467239379883, "learning_rate": 0.0001140160656625146, "loss": 4.65, "step": 249 }, { "epoch": 0.4675081813931744, "grad_norm": 55.1082649230957, "learning_rate": 0.00011342332658176555, "loss": 3.6235, "step": 250 }, { "epoch": 0.4693782141187471, "grad_norm": 34.86373519897461, "learning_rate": 0.00011283010683878423, "loss": 3.1883, "step": 251 }, { "epoch": 0.47124824684431976, "grad_norm": 30.727251052856445, "learning_rate": 0.00011223642767557227, "loss": 3.3415, "step": 252 }, { "epoch": 0.4731182795698925, "grad_norm": 32.8022575378418, "learning_rate": 0.00011164231035058228, "loss": 3.1891, "step": 253 }, { "epoch": 0.4749883122954652, "grad_norm": 21.999401092529297, "learning_rate": 0.00011104777613795661, "loss": 3.1296, "step": 254 }, { "epoch": 0.47685834502103785, "grad_norm": 20.68447494506836, "learning_rate": 0.00011045284632676536, "loss": 2.7139, "step": 255 }, { "epoch": 0.4787283777466106, "grad_norm": 13.36279010772705, "learning_rate": 0.00010985754222024436, "loss": 2.4648, "step": 256 }, { "epoch": 0.48059841047218327, "grad_norm": 10.14539909362793, "learning_rate": 0.00010926188513503215, "loss": 2.4138, "step": 257 }, { "epoch": 0.48246844319775595, "grad_norm": 9.960616111755371, "learning_rate": 0.00010866589640040669, "loss": 2.7157, "step": 258 }, { "epoch": 0.4843384759233287, "grad_norm": 10.990545272827148, "learning_rate": 0.00010806959735752174, "loss": 2.4142, "step": 259 }, { "epoch": 0.48620850864890136, "grad_norm": 10.179009437561035, "learning_rate": 0.00010747300935864243, "loss": 2.0645, "step": 260 }, { "epoch": 0.48807854137447404, "grad_norm": 9.950582504272461, "learning_rate": 0.00010687615376638093, "loss": 2.2024, "step": 261 }, { "epoch": 0.4899485741000468, "grad_norm": 10.244935989379883, "learning_rate": 0.00010627905195293135, "loss": 2.1277, "step": 262 }, { "epoch": 0.49181860682561945, "grad_norm": 11.46534252166748, "learning_rate": 0.00010568172529930447, "loss": 2.534, "step": 263 }, { "epoch": 0.49368863955119213, "grad_norm": 13.022493362426758, "learning_rate": 0.00010508419519456219, "loss": 2.3155, "step": 264 }, { "epoch": 0.49555867227676487, "grad_norm": 15.116632461547852, "learning_rate": 0.00010448648303505151, "loss": 3.0314, "step": 265 }, { "epoch": 0.49742870500233755, "grad_norm": 13.014046669006348, "learning_rate": 0.0001038886102236385, "loss": 2.4966, "step": 266 }, { "epoch": 0.4992987377279102, "grad_norm": 11.42091178894043, "learning_rate": 0.00010329059816894186, "loss": 2.56, "step": 267 }, { "epoch": 0.501168770453483, "grad_norm": 12.75170612335205, "learning_rate": 0.00010269246828456629, "loss": 2.5727, "step": 268 }, { "epoch": 0.501168770453483, "eval_loss": 2.828613758087158, "eval_runtime": 12.8533, "eval_samples_per_second": 17.583, "eval_steps_per_second": 8.792, "step": 268 }, { "epoch": 0.5030388031790556, "grad_norm": 13.714940071105957, "learning_rate": 0.0001020942419883357, "loss": 2.4806, "step": 269 }, { "epoch": 0.5049088359046283, "grad_norm": 14.08552074432373, "learning_rate": 0.00010149594070152638, "loss": 2.7159, "step": 270 }, { "epoch": 0.506778868630201, "grad_norm": 14.860525131225586, "learning_rate": 0.00010089758584809979, "loss": 2.5984, "step": 271 }, { "epoch": 0.5086489013557737, "grad_norm": 14.785524368286133, "learning_rate": 0.00010029919885393563, "loss": 2.6805, "step": 272 }, { "epoch": 0.5105189340813464, "grad_norm": 14.939064979553223, "learning_rate": 9.970080114606439e-05, "loss": 2.4628, "step": 273 }, { "epoch": 0.5123889668069191, "grad_norm": 13.804859161376953, "learning_rate": 9.910241415190021e-05, "loss": 2.847, "step": 274 }, { "epoch": 0.5142589995324918, "grad_norm": 14.139949798583984, "learning_rate": 9.850405929847366e-05, "loss": 2.3347, "step": 275 }, { "epoch": 0.5161290322580645, "grad_norm": 16.16015625, "learning_rate": 9.790575801166432e-05, "loss": 2.7167, "step": 276 }, { "epoch": 0.5179990649836372, "grad_norm": 17.632705688476562, "learning_rate": 9.730753171543374e-05, "loss": 3.3824, "step": 277 }, { "epoch": 0.5198690977092099, "grad_norm": 20.08257484436035, "learning_rate": 9.670940183105812e-05, "loss": 3.1337, "step": 278 }, { "epoch": 0.5217391304347826, "grad_norm": 16.899017333984375, "learning_rate": 9.611138977636153e-05, "loss": 2.8295, "step": 279 }, { "epoch": 0.5236091631603553, "grad_norm": 16.086267471313477, "learning_rate": 9.551351696494854e-05, "loss": 2.2724, "step": 280 }, { "epoch": 0.525479195885928, "grad_norm": 17.3554744720459, "learning_rate": 9.491580480543784e-05, "loss": 3.0389, "step": 281 }, { "epoch": 0.5273492286115007, "grad_norm": 15.312295913696289, "learning_rate": 9.431827470069558e-05, "loss": 1.8464, "step": 282 }, { "epoch": 0.5292192613370734, "grad_norm": 14.824910163879395, "learning_rate": 9.372094804706867e-05, "loss": 2.5675, "step": 283 }, { "epoch": 0.531089294062646, "grad_norm": 62.370765686035156, "learning_rate": 9.312384623361909e-05, "loss": 3.6012, "step": 284 }, { "epoch": 0.5329593267882188, "grad_norm": 17.69074249267578, "learning_rate": 9.252699064135758e-05, "loss": 2.9718, "step": 285 }, { "epoch": 0.5348293595137915, "grad_norm": 18.726192474365234, "learning_rate": 9.193040264247829e-05, "loss": 2.6739, "step": 286 }, { "epoch": 0.5366993922393641, "grad_norm": 23.5485782623291, "learning_rate": 9.13341035995933e-05, "loss": 3.519, "step": 287 }, { "epoch": 0.5385694249649369, "grad_norm": 23.890811920166016, "learning_rate": 9.073811486496788e-05, "loss": 3.0704, "step": 288 }, { "epoch": 0.5404394576905096, "grad_norm": 21.349260330200195, "learning_rate": 9.014245777975565e-05, "loss": 3.1382, "step": 289 }, { "epoch": 0.5423094904160822, "grad_norm": 27.911243438720703, "learning_rate": 8.954715367323468e-05, "loss": 3.3497, "step": 290 }, { "epoch": 0.544179523141655, "grad_norm": 23.007844924926758, "learning_rate": 8.89522238620434e-05, "loss": 2.4557, "step": 291 }, { "epoch": 0.5460495558672277, "grad_norm": 28.537771224975586, "learning_rate": 8.835768964941773e-05, "loss": 3.3786, "step": 292 }, { "epoch": 0.5479195885928003, "grad_norm": 23.56622314453125, "learning_rate": 8.776357232442778e-05, "loss": 3.245, "step": 293 }, { "epoch": 0.5497896213183731, "grad_norm": 26.669605255126953, "learning_rate": 8.716989316121578e-05, "loss": 3.5067, "step": 294 }, { "epoch": 0.5516596540439458, "grad_norm": 25.95106315612793, "learning_rate": 8.657667341823448e-05, "loss": 2.7495, "step": 295 }, { "epoch": 0.5535296867695184, "grad_norm": 36.05848693847656, "learning_rate": 8.598393433748541e-05, "loss": 3.4859, "step": 296 }, { "epoch": 0.5553997194950911, "grad_norm": 25.490476608276367, "learning_rate": 8.539169714375885e-05, "loss": 3.1739, "step": 297 }, { "epoch": 0.5572697522206639, "grad_norm": 36.27049255371094, "learning_rate": 8.479998304387329e-05, "loss": 3.6866, "step": 298 }, { "epoch": 0.5591397849462365, "grad_norm": 33.86897277832031, "learning_rate": 8.420881322591642e-05, "loss": 2.8654, "step": 299 }, { "epoch": 0.5610098176718092, "grad_norm": 44.89165115356445, "learning_rate": 8.361820885848624e-05, "loss": 3.9259, "step": 300 }, { "epoch": 0.562879850397382, "grad_norm": 20.971372604370117, "learning_rate": 8.302819108993312e-05, "loss": 2.7844, "step": 301 }, { "epoch": 0.5647498831229546, "grad_norm": 24.576387405395508, "learning_rate": 8.243878104760249e-05, "loss": 3.0554, "step": 302 }, { "epoch": 0.5666199158485273, "grad_norm": 25.367847442626953, "learning_rate": 8.184999983707837e-05, "loss": 2.9751, "step": 303 }, { "epoch": 0.5684899485741001, "grad_norm": 20.629751205444336, "learning_rate": 8.126186854142752e-05, "loss": 2.8788, "step": 304 }, { "epoch": 0.5703599812996727, "grad_norm": 17.573223114013672, "learning_rate": 8.067440822044469e-05, "loss": 2.3871, "step": 305 }, { "epoch": 0.5722300140252454, "grad_norm": 15.68101692199707, "learning_rate": 8.00876399098982e-05, "loss": 1.9594, "step": 306 }, { "epoch": 0.5741000467508182, "grad_norm": 13.078360557556152, "learning_rate": 7.950158462077697e-05, "loss": 2.3795, "step": 307 }, { "epoch": 0.5759700794763908, "grad_norm": 11.211087226867676, "learning_rate": 7.891626333853812e-05, "loss": 2.7153, "step": 308 }, { "epoch": 0.5778401122019635, "grad_norm": 9.67572021484375, "learning_rate": 7.833169702235531e-05, "loss": 2.4531, "step": 309 }, { "epoch": 0.5797101449275363, "grad_norm": 12.671576499938965, "learning_rate": 7.774790660436858e-05, "loss": 2.2148, "step": 310 }, { "epoch": 0.5815801776531089, "grad_norm": 9.585877418518066, "learning_rate": 7.716491298893442e-05, "loss": 2.7332, "step": 311 }, { "epoch": 0.5834502103786816, "grad_norm": 10.074613571166992, "learning_rate": 7.658273705187761e-05, "loss": 2.2678, "step": 312 }, { "epoch": 0.5853202431042543, "grad_norm": 10.348039627075195, "learning_rate": 7.600139963974341e-05, "loss": 2.6394, "step": 313 }, { "epoch": 0.587190275829827, "grad_norm": 10.6520414352417, "learning_rate": 7.542092156905123e-05, "loss": 2.1318, "step": 314 }, { "epoch": 0.5890603085553997, "grad_norm": 11.883548736572266, "learning_rate": 7.484132362554915e-05, "loss": 2.5724, "step": 315 }, { "epoch": 0.5909303412809724, "grad_norm": 11.804710388183594, "learning_rate": 7.426262656346978e-05, "loss": 2.3502, "step": 316 }, { "epoch": 0.5928003740065451, "grad_norm": 12.366445541381836, "learning_rate": 7.368485110478685e-05, "loss": 2.2743, "step": 317 }, { "epoch": 0.5946704067321178, "grad_norm": 13.19687557220459, "learning_rate": 7.310801793847344e-05, "loss": 2.6187, "step": 318 }, { "epoch": 0.5965404394576905, "grad_norm": 13.92205810546875, "learning_rate": 7.2532147719761e-05, "loss": 2.8836, "step": 319 }, { "epoch": 0.5984104721832632, "grad_norm": 13.191638946533203, "learning_rate": 7.195726106939974e-05, "loss": 2.7437, "step": 320 }, { "epoch": 0.6002805049088359, "grad_norm": 12.879383087158203, "learning_rate": 7.138337857292034e-05, "loss": 2.3656, "step": 321 }, { "epoch": 0.6021505376344086, "grad_norm": 18.539752960205078, "learning_rate": 7.081052077989667e-05, "loss": 2.7168, "step": 322 }, { "epoch": 0.6040205703599812, "grad_norm": 15.435444831848145, "learning_rate": 7.023870820321017e-05, "loss": 2.8032, "step": 323 }, { "epoch": 0.605890603085554, "grad_norm": 15.457225799560547, "learning_rate": 6.966796131831501e-05, "loss": 2.9741, "step": 324 }, { "epoch": 0.6077606358111267, "grad_norm": 14.631234169006348, "learning_rate": 6.909830056250527e-05, "loss": 2.782, "step": 325 }, { "epoch": 0.6096306685366993, "grad_norm": 13.480672836303711, "learning_rate": 6.85297463341828e-05, "loss": 2.3114, "step": 326 }, { "epoch": 0.6115007012622721, "grad_norm": 15.09902572631836, "learning_rate": 6.796231899212704e-05, "loss": 2.0404, "step": 327 }, { "epoch": 0.6133707339878448, "grad_norm": 19.798410415649414, "learning_rate": 6.739603885476582e-05, "loss": 3.2275, "step": 328 }, { "epoch": 0.6152407667134174, "grad_norm": 19.74947738647461, "learning_rate": 6.683092619944796e-05, "loss": 2.8039, "step": 329 }, { "epoch": 0.6171107994389902, "grad_norm": 15.367700576782227, "learning_rate": 6.626700126171702e-05, "loss": 2.8833, "step": 330 }, { "epoch": 0.6189808321645629, "grad_norm": 18.47310447692871, "learning_rate": 6.570428423458687e-05, "loss": 2.8571, "step": 331 }, { "epoch": 0.6208508648901355, "grad_norm": 17.92266273498535, "learning_rate": 6.51427952678185e-05, "loss": 2.3749, "step": 332 }, { "epoch": 0.6227208976157083, "grad_norm": 17.863025665283203, "learning_rate": 6.458255446719854e-05, "loss": 2.6118, "step": 333 }, { "epoch": 0.624590930341281, "grad_norm": 16.579729080200195, "learning_rate": 6.402358189381934e-05, "loss": 2.2111, "step": 334 }, { "epoch": 0.6264609630668536, "grad_norm": 21.433856964111328, "learning_rate": 6.34658975633605e-05, "loss": 2.8057, "step": 335 }, { "epoch": 0.6283309957924264, "grad_norm": 17.18570899963379, "learning_rate": 6.290952144537241e-05, "loss": 2.7357, "step": 336 }, { "epoch": 0.6302010285179991, "grad_norm": 27.641077041625977, "learning_rate": 6.23544734625608e-05, "loss": 3.0753, "step": 337 }, { "epoch": 0.6320710612435717, "grad_norm": 25.628257751464844, "learning_rate": 6.180077349007376e-05, "loss": 3.2045, "step": 338 }, { "epoch": 0.6339410939691444, "grad_norm": 18.360332489013672, "learning_rate": 6.12484413547897e-05, "loss": 2.6208, "step": 339 }, { "epoch": 0.6358111266947172, "grad_norm": 52.43937683105469, "learning_rate": 6.069749683460765e-05, "loss": 3.2548, "step": 340 }, { "epoch": 0.6376811594202898, "grad_norm": 25.194255828857422, "learning_rate": 6.014795965773884e-05, "loss": 2.9652, "step": 341 }, { "epoch": 0.6395511921458625, "grad_norm": 33.8499870300293, "learning_rate": 5.9599849502000485e-05, "loss": 3.4296, "step": 342 }, { "epoch": 0.6414212248714353, "grad_norm": 33.058990478515625, "learning_rate": 5.9053185994110974e-05, "loss": 3.6159, "step": 343 }, { "epoch": 0.6432912575970079, "grad_norm": 22.76582145690918, "learning_rate": 5.8507988708987146e-05, "loss": 2.9352, "step": 344 }, { "epoch": 0.6451612903225806, "grad_norm": 27.470590591430664, "learning_rate": 5.796427716904347e-05, "loss": 3.0844, "step": 345 }, { "epoch": 0.6470313230481534, "grad_norm": 30.144134521484375, "learning_rate": 5.7422070843492734e-05, "loss": 3.3897, "step": 346 }, { "epoch": 0.648901355773726, "grad_norm": 27.228771209716797, "learning_rate": 5.6881389147649176e-05, "loss": 3.0126, "step": 347 }, { "epoch": 0.6507713884992987, "grad_norm": 30.412782669067383, "learning_rate": 5.634225144223302e-05, "loss": 2.8863, "step": 348 }, { "epoch": 0.6526414212248715, "grad_norm": 43.44699478149414, "learning_rate": 5.5804677032677354e-05, "loss": 4.0037, "step": 349 }, { "epoch": 0.6545114539504441, "grad_norm": 50.50189208984375, "learning_rate": 5.526868516843673e-05, "loss": 3.4202, "step": 350 }, { "epoch": 0.6563814866760168, "grad_norm": 17.37109375, "learning_rate": 5.47342950422981e-05, "loss": 2.9548, "step": 351 }, { "epoch": 0.6582515194015895, "grad_norm": 18.439983367919922, "learning_rate": 5.420152578969326e-05, "loss": 2.7266, "step": 352 }, { "epoch": 0.6601215521271622, "grad_norm": 20.2562313079834, "learning_rate": 5.3670396488013854e-05, "loss": 2.5423, "step": 353 }, { "epoch": 0.6619915848527349, "grad_norm": 15.055497169494629, "learning_rate": 5.3140926155928136e-05, "loss": 2.6748, "step": 354 }, { "epoch": 0.6638616175783076, "grad_norm": 13.408388137817383, "learning_rate": 5.261313375270014e-05, "loss": 2.9774, "step": 355 }, { "epoch": 0.6657316503038803, "grad_norm": 14.892769813537598, "learning_rate": 5.208703817751053e-05, "loss": 2.2886, "step": 356 }, { "epoch": 0.667601683029453, "grad_norm": 12.354819297790527, "learning_rate": 5.156265826877999e-05, "loss": 2.8214, "step": 357 }, { "epoch": 0.6694717157550257, "grad_norm": 13.169706344604492, "learning_rate": 5.1040012803494795e-05, "loss": 2.4982, "step": 358 }, { "epoch": 0.6713417484805985, "grad_norm": 12.36215591430664, "learning_rate": 5.0519120496534044e-05, "loss": 3.0511, "step": 359 }, { "epoch": 0.6732117812061711, "grad_norm": 11.116745948791504, "learning_rate": 5.000000000000002e-05, "loss": 2.3823, "step": 360 }, { "epoch": 0.6750818139317438, "grad_norm": 11.059001922607422, "learning_rate": 4.9482669902549894e-05, "loss": 2.3138, "step": 361 }, { "epoch": 0.6769518466573166, "grad_norm": 10.290990829467773, "learning_rate": 4.896714872873038e-05, "loss": 2.0308, "step": 362 }, { "epoch": 0.6788218793828892, "grad_norm": 14.33850383758545, "learning_rate": 4.845345493831419e-05, "loss": 2.6556, "step": 363 }, { "epoch": 0.6806919121084619, "grad_norm": 12.50505542755127, "learning_rate": 4.794160692563917e-05, "loss": 2.6198, "step": 364 }, { "epoch": 0.6825619448340347, "grad_norm": 11.454310417175293, "learning_rate": 4.743162301894952e-05, "loss": 2.4727, "step": 365 }, { "epoch": 0.6844319775596073, "grad_norm": 11.56086254119873, "learning_rate": 4.692352147973973e-05, "loss": 2.1338, "step": 366 }, { "epoch": 0.68630201028518, "grad_norm": 11.487324714660645, "learning_rate": 4.6417320502100316e-05, "loss": 2.4042, "step": 367 }, { "epoch": 0.6881720430107527, "grad_norm": 11.415949821472168, "learning_rate": 4.591303821206673e-05, "loss": 2.6558, "step": 368 }, { "epoch": 0.6900420757363254, "grad_norm": 12.308177947998047, "learning_rate": 4.541069266696984e-05, "loss": 2.3123, "step": 369 }, { "epoch": 0.6919121084618981, "grad_norm": 11.585166931152344, "learning_rate": 4.491030185478976e-05, "loss": 2.7433, "step": 370 }, { "epoch": 0.6937821411874708, "grad_norm": 12.77014446258545, "learning_rate": 4.441188369351157e-05, "loss": 2.646, "step": 371 }, { "epoch": 0.6956521739130435, "grad_norm": 12.757980346679688, "learning_rate": 4.391545603048358e-05, "loss": 2.702, "step": 372 }, { "epoch": 0.6975222066386162, "grad_norm": 15.349445343017578, "learning_rate": 4.3421036641778556e-05, "loss": 3.0098, "step": 373 }, { "epoch": 0.6993922393641889, "grad_norm": 15.5166597366333, "learning_rate": 4.2928643231556844e-05, "loss": 3.2723, "step": 374 }, { "epoch": 0.7012622720897616, "grad_norm": 13.189452171325684, "learning_rate": 4.2438293431432665e-05, "loss": 2.363, "step": 375 }, { "epoch": 0.7031323048153343, "grad_norm": 17.275712966918945, "learning_rate": 4.195000479984265e-05, "loss": 3.0557, "step": 376 }, { "epoch": 0.705002337540907, "grad_norm": 16.319204330444336, "learning_rate": 4.146379482141723e-05, "loss": 2.6115, "step": 377 }, { "epoch": 0.7068723702664796, "grad_norm": 14.946385383605957, "learning_rate": 4.097968090635439e-05, "loss": 2.6765, "step": 378 }, { "epoch": 0.7087424029920524, "grad_norm": 19.0872745513916, "learning_rate": 4.049768038979631e-05, "loss": 2.2803, "step": 379 }, { "epoch": 0.7106124357176251, "grad_norm": 15.21898078918457, "learning_rate": 4.001781053120863e-05, "loss": 2.4357, "step": 380 }, { "epoch": 0.7124824684431977, "grad_norm": 17.521011352539062, "learning_rate": 3.954008851376252e-05, "loss": 2.6634, "step": 381 }, { "epoch": 0.7143525011687705, "grad_norm": 16.630462646484375, "learning_rate": 3.90645314437192e-05, "loss": 2.2544, "step": 382 }, { "epoch": 0.7162225338943432, "grad_norm": 18.478824615478516, "learning_rate": 3.859115634981748e-05, "loss": 2.6528, "step": 383 }, { "epoch": 0.7180925666199158, "grad_norm": 22.990461349487305, "learning_rate": 3.811998018266416e-05, "loss": 2.8017, "step": 384 }, { "epoch": 0.7199625993454886, "grad_norm": 23.46438217163086, "learning_rate": 3.7651019814126654e-05, "loss": 3.5866, "step": 385 }, { "epoch": 0.7218326320710613, "grad_norm": 19.73815155029297, "learning_rate": 3.718429203672936e-05, "loss": 2.6817, "step": 386 }, { "epoch": 0.7237026647966339, "grad_norm": 20.134384155273438, "learning_rate": 3.671981356305191e-05, "loss": 3.4092, "step": 387 }, { "epoch": 0.7255726975222067, "grad_norm": 19.134763717651367, "learning_rate": 3.6257601025131026e-05, "loss": 2.5706, "step": 388 }, { "epoch": 0.7274427302477794, "grad_norm": 19.289335250854492, "learning_rate": 3.57976709738648e-05, "loss": 3.1928, "step": 389 }, { "epoch": 0.729312762973352, "grad_norm": 21.083354949951172, "learning_rate": 3.534003987842005e-05, "loss": 3.1004, "step": 390 }, { "epoch": 0.7311827956989247, "grad_norm": 19.628206253051758, "learning_rate": 3.488472412564264e-05, "loss": 3.0638, "step": 391 }, { "epoch": 0.7330528284244975, "grad_norm": 25.418609619140625, "learning_rate": 3.4431740019470774e-05, "loss": 3.3569, "step": 392 }, { "epoch": 0.7349228611500701, "grad_norm": 24.504236221313477, "learning_rate": 3.398110378035098e-05, "loss": 3.3934, "step": 393 }, { "epoch": 0.7367928938756428, "grad_norm": 32.00563430786133, "learning_rate": 3.353283154465746e-05, "loss": 3.3888, "step": 394 }, { "epoch": 0.7386629266012156, "grad_norm": 25.788663864135742, "learning_rate": 3.308693936411421e-05, "loss": 3.7214, "step": 395 }, { "epoch": 0.7405329593267882, "grad_norm": 29.49918556213379, "learning_rate": 3.264344320522024e-05, "loss": 3.6635, "step": 396 }, { "epoch": 0.7424029920523609, "grad_norm": 52.400569915771484, "learning_rate": 3.220235894867794e-05, "loss": 3.574, "step": 397 }, { "epoch": 0.7442730247779337, "grad_norm": 31.379642486572266, "learning_rate": 3.1763702388824214e-05, "loss": 3.8696, "step": 398 }, { "epoch": 0.7461430575035063, "grad_norm": 33.01647186279297, "learning_rate": 3.132748923306522e-05, "loss": 3.5235, "step": 399 }, { "epoch": 0.748013090229079, "grad_norm": 54.04301071166992, "learning_rate": 3.089373510131354e-05, "loss": 4.3509, "step": 400 }, { "epoch": 0.7498831229546518, "grad_norm": 9.012869834899902, "learning_rate": 3.0462455525429257e-05, "loss": 2.4901, "step": 401 }, { "epoch": 0.7517531556802244, "grad_norm": 10.679055213928223, "learning_rate": 3.0033665948663448e-05, "loss": 2.0973, "step": 402 }, { "epoch": 0.7517531556802244, "eval_loss": 2.625810146331787, "eval_runtime": 12.8607, "eval_samples_per_second": 17.573, "eval_steps_per_second": 8.786, "step": 402 }, { "epoch": 0.7536231884057971, "grad_norm": 9.760929107666016, "learning_rate": 2.960738172510551e-05, "loss": 2.5254, "step": 403 }, { "epoch": 0.7554932211313699, "grad_norm": 8.057788848876953, "learning_rate": 2.9183618119133062e-05, "loss": 1.8935, "step": 404 }, { "epoch": 0.7573632538569425, "grad_norm": 9.041131019592285, "learning_rate": 2.876239030486554e-05, "loss": 2.5203, "step": 405 }, { "epoch": 0.7592332865825152, "grad_norm": 9.86754322052002, "learning_rate": 2.8343713365620772e-05, "loss": 2.1501, "step": 406 }, { "epoch": 0.7611033193080879, "grad_norm": 9.489944458007812, "learning_rate": 2.7927602293375e-05, "loss": 2.1061, "step": 407 }, { "epoch": 0.7629733520336606, "grad_norm": 9.308277130126953, "learning_rate": 2.751407198822583e-05, "loss": 1.8263, "step": 408 }, { "epoch": 0.7648433847592333, "grad_norm": 10.4146728515625, "learning_rate": 2.7103137257858868e-05, "loss": 2.5591, "step": 409 }, { "epoch": 0.766713417484806, "grad_norm": 10.501506805419922, "learning_rate": 2.669481281701739e-05, "loss": 2.7704, "step": 410 }, { "epoch": 0.7685834502103787, "grad_norm": 12.172067642211914, "learning_rate": 2.6289113286975485e-05, "loss": 2.5226, "step": 411 }, { "epoch": 0.7704534829359514, "grad_norm": 12.657646179199219, "learning_rate": 2.5886053195014538e-05, "loss": 2.7101, "step": 412 }, { "epoch": 0.7723235156615241, "grad_norm": 11.94274616241455, "learning_rate": 2.5485646973902865e-05, "loss": 2.6979, "step": 413 }, { "epoch": 0.7741935483870968, "grad_norm": 11.538039207458496, "learning_rate": 2.508790896137918e-05, "loss": 2.7357, "step": 414 }, { "epoch": 0.7760635811126695, "grad_norm": 11.735588073730469, "learning_rate": 2.4692853399638917e-05, "loss": 2.7582, "step": 415 }, { "epoch": 0.7779336138382422, "grad_norm": 11.011154174804688, "learning_rate": 2.4300494434824373e-05, "loss": 2.5606, "step": 416 }, { "epoch": 0.7798036465638148, "grad_norm": 12.485893249511719, "learning_rate": 2.391084611651816e-05, "loss": 2.2526, "step": 417 }, { "epoch": 0.7816736792893876, "grad_norm": 13.266201972961426, "learning_rate": 2.352392239724016e-05, "loss": 2.7745, "step": 418 }, { "epoch": 0.7835437120149603, "grad_norm": 12.729734420776367, "learning_rate": 2.3139737131947824e-05, "loss": 2.1154, "step": 419 }, { "epoch": 0.7854137447405329, "grad_norm": 11.948831558227539, "learning_rate": 2.275830407754006e-05, "loss": 2.3556, "step": 420 }, { "epoch": 0.7872837774661057, "grad_norm": 15.799360275268555, "learning_rate": 2.237963689236472e-05, "loss": 3.1533, "step": 421 }, { "epoch": 0.7891538101916784, "grad_norm": 14.989856719970703, "learning_rate": 2.200374913572939e-05, "loss": 2.8148, "step": 422 }, { "epoch": 0.791023842917251, "grad_norm": 17.03190040588379, "learning_rate": 2.163065426741603e-05, "loss": 2.8717, "step": 423 }, { "epoch": 0.7928938756428238, "grad_norm": 16.29947280883789, "learning_rate": 2.1260365647198798e-05, "loss": 3.1721, "step": 424 }, { "epoch": 0.7947639083683965, "grad_norm": 14.097606658935547, "learning_rate": 2.0892896534365904e-05, "loss": 2.7177, "step": 425 }, { "epoch": 0.7966339410939691, "grad_norm": 18.629955291748047, "learning_rate": 2.0528260087244487e-05, "loss": 3.0088, "step": 426 }, { "epoch": 0.7985039738195419, "grad_norm": 14.538827896118164, "learning_rate": 2.016646936272987e-05, "loss": 2.5514, "step": 427 }, { "epoch": 0.8003740065451146, "grad_norm": 13.943222045898438, "learning_rate": 1.9807537315817604e-05, "loss": 2.4046, "step": 428 }, { "epoch": 0.8022440392706872, "grad_norm": 17.54332160949707, "learning_rate": 1.9451476799139935e-05, "loss": 3.5145, "step": 429 }, { "epoch": 0.80411407199626, "grad_norm": 16.975404739379883, "learning_rate": 1.9098300562505266e-05, "loss": 2.6382, "step": 430 }, { "epoch": 0.8059841047218327, "grad_norm": 18.486377716064453, "learning_rate": 1.8748021252441817e-05, "loss": 2.5148, "step": 431 }, { "epoch": 0.8078541374474053, "grad_norm": 15.601974487304688, "learning_rate": 1.8400651411744685e-05, "loss": 2.418, "step": 432 }, { "epoch": 0.809724170172978, "grad_norm": 17.163917541503906, "learning_rate": 1.805620347902681e-05, "loss": 2.592, "step": 433 }, { "epoch": 0.8115942028985508, "grad_norm": 18.376161575317383, "learning_rate": 1.771468978827343e-05, "loss": 3.3599, "step": 434 }, { "epoch": 0.8134642356241234, "grad_norm": 20.212865829467773, "learning_rate": 1.7376122568400532e-05, "loss": 3.021, "step": 435 }, { "epoch": 0.8153342683496961, "grad_norm": 26.279674530029297, "learning_rate": 1.7040513942816906e-05, "loss": 3.0855, "step": 436 }, { "epoch": 0.8172043010752689, "grad_norm": 18.360366821289062, "learning_rate": 1.6707875928990058e-05, "loss": 2.2329, "step": 437 }, { "epoch": 0.8190743338008415, "grad_norm": 25.305208206176758, "learning_rate": 1.6378220438015933e-05, "loss": 2.7794, "step": 438 }, { "epoch": 0.8209443665264142, "grad_norm": 19.780479431152344, "learning_rate": 1.6051559274192275e-05, "loss": 2.8426, "step": 439 }, { "epoch": 0.822814399251987, "grad_norm": 24.164377212524414, "learning_rate": 1.5727904134596083e-05, "loss": 3.2383, "step": 440 }, { "epoch": 0.8246844319775596, "grad_norm": 24.38163185119629, "learning_rate": 1.540726660866466e-05, "loss": 3.5773, "step": 441 }, { "epoch": 0.8265544647031323, "grad_norm": 22.310827255249023, "learning_rate": 1.5089658177780653e-05, "loss": 3.2983, "step": 442 }, { "epoch": 0.828424497428705, "grad_norm": 54.48847579956055, "learning_rate": 1.477509021486091e-05, "loss": 3.3077, "step": 443 }, { "epoch": 0.8302945301542777, "grad_norm": 23.941181182861328, "learning_rate": 1.4463573983949341e-05, "loss": 3.283, "step": 444 }, { "epoch": 0.8321645628798504, "grad_norm": 23.76746940612793, "learning_rate": 1.415512063981339e-05, "loss": 3.4818, "step": 445 }, { "epoch": 0.8340345956054231, "grad_norm": 26.566600799560547, "learning_rate": 1.3849741227544777e-05, "loss": 2.6301, "step": 446 }, { "epoch": 0.8359046283309958, "grad_norm": 30.18052864074707, "learning_rate": 1.3547446682163889e-05, "loss": 3.8662, "step": 447 }, { "epoch": 0.8377746610565685, "grad_norm": 36.08682632446289, "learning_rate": 1.3248247828228245e-05, "loss": 3.9059, "step": 448 }, { "epoch": 0.8396446937821412, "grad_norm": 39.56899642944336, "learning_rate": 1.2952155379444975e-05, "loss": 3.1025, "step": 449 }, { "epoch": 0.8415147265077139, "grad_norm": 39.47623062133789, "learning_rate": 1.2659179938287035e-05, "loss": 4.5911, "step": 450 }, { "epoch": 0.8433847592332866, "grad_norm": 8.356927871704102, "learning_rate": 1.2369331995613665e-05, "loss": 2.3298, "step": 451 }, { "epoch": 0.8452547919588593, "grad_norm": 7.952692985534668, "learning_rate": 1.2082621930294635e-05, "loss": 2.4439, "step": 452 }, { "epoch": 0.847124824684432, "grad_norm": 9.349421501159668, "learning_rate": 1.1799060008838791e-05, "loss": 2.6627, "step": 453 }, { "epoch": 0.8489948574100047, "grad_norm": 9.868120193481445, "learning_rate": 1.151865638502615e-05, "loss": 2.1623, "step": 454 }, { "epoch": 0.8508648901355774, "grad_norm": 9.13068675994873, "learning_rate": 1.124142109954459e-05, "loss": 1.9143, "step": 455 }, { "epoch": 0.85273492286115, "grad_norm": 7.523913860321045, "learning_rate": 1.0967364079630115e-05, "loss": 1.7452, "step": 456 }, { "epoch": 0.8546049555867228, "grad_norm": 8.57178783416748, "learning_rate": 1.069649513871147e-05, "loss": 1.6952, "step": 457 }, { "epoch": 0.8564749883122955, "grad_norm": 9.974442481994629, "learning_rate": 1.042882397605871e-05, "loss": 2.4002, "step": 458 }, { "epoch": 0.8583450210378681, "grad_norm": 9.599936485290527, "learning_rate": 1.0164360176435961e-05, "loss": 2.5942, "step": 459 }, { "epoch": 0.8602150537634409, "grad_norm": 9.855104446411133, "learning_rate": 9.903113209758096e-06, "loss": 1.904, "step": 460 }, { "epoch": 0.8620850864890136, "grad_norm": 11.599543571472168, "learning_rate": 9.6450924307517e-06, "loss": 1.6992, "step": 461 }, { "epoch": 0.8639551192145862, "grad_norm": 9.953221321105957, "learning_rate": 9.39030707862013e-06, "loss": 2.1692, "step": 462 }, { "epoch": 0.865825151940159, "grad_norm": 12.537870407104492, "learning_rate": 9.138766276712552e-06, "loss": 2.1261, "step": 463 }, { "epoch": 0.8676951846657317, "grad_norm": 12.677627563476562, "learning_rate": 8.890479032197464e-06, "loss": 2.4732, "step": 464 }, { "epoch": 0.8695652173913043, "grad_norm": 12.54604434967041, "learning_rate": 8.645454235739903e-06, "loss": 2.1939, "step": 465 }, { "epoch": 0.8714352501168771, "grad_norm": 11.750197410583496, "learning_rate": 8.403700661183355e-06, "loss": 2.1129, "step": 466 }, { "epoch": 0.8733052828424498, "grad_norm": 11.671151161193848, "learning_rate": 8.165226965235328e-06, "loss": 2.4747, "step": 467 }, { "epoch": 0.8751753155680224, "grad_norm": 14.79646110534668, "learning_rate": 7.930041687157607e-06, "loss": 2.1669, "step": 468 }, { "epoch": 0.8770453482935952, "grad_norm": 11.03843879699707, "learning_rate": 7.698153248460271e-06, "loss": 2.1497, "step": 469 }, { "epoch": 0.8789153810191679, "grad_norm": 12.181533813476562, "learning_rate": 7.46956995260033e-06, "loss": 2.5349, "step": 470 }, { "epoch": 0.8807854137447405, "grad_norm": 27.885995864868164, "learning_rate": 7.244299984684233e-06, "loss": 2.6204, "step": 471 }, { "epoch": 0.8826554464703132, "grad_norm": 14.558568000793457, "learning_rate": 7.022351411174866e-06, "loss": 2.2863, "step": 472 }, { "epoch": 0.884525479195886, "grad_norm": 14.143083572387695, "learning_rate": 6.803732179602684e-06, "loss": 2.2255, "step": 473 }, { "epoch": 0.8863955119214586, "grad_norm": 15.334390640258789, "learning_rate": 6.5884501182811084e-06, "loss": 2.7062, "step": 474 }, { "epoch": 0.8882655446470313, "grad_norm": 14.617964744567871, "learning_rate": 6.37651293602628e-06, "loss": 2.4742, "step": 475 }, { "epoch": 0.8901355773726041, "grad_norm": 13.574972152709961, "learning_rate": 6.167928221880926e-06, "loss": 2.6229, "step": 476 }, { "epoch": 0.8920056100981767, "grad_norm": 14.283699989318848, "learning_rate": 5.9627034448426545e-06, "loss": 2.2862, "step": 477 }, { "epoch": 0.8938756428237494, "grad_norm": 16.076574325561523, "learning_rate": 5.760845953596527e-06, "loss": 2.9803, "step": 478 }, { "epoch": 0.8957456755493222, "grad_norm": 18.35474967956543, "learning_rate": 5.562362976251901e-06, "loss": 2.6646, "step": 479 }, { "epoch": 0.8976157082748948, "grad_norm": 16.39321517944336, "learning_rate": 5.367261620083575e-06, "loss": 2.596, "step": 480 }, { "epoch": 0.8994857410004675, "grad_norm": 16.686824798583984, "learning_rate": 5.175548871277358e-06, "loss": 2.5261, "step": 481 }, { "epoch": 0.9013557737260403, "grad_norm": 16.57494354248047, "learning_rate": 4.9872315946798535e-06, "loss": 2.3015, "step": 482 }, { "epoch": 0.9032258064516129, "grad_norm": 17.73284339904785, "learning_rate": 4.80231653355262e-06, "loss": 2.7761, "step": 483 }, { "epoch": 0.9050958391771856, "grad_norm": 16.7850284576416, "learning_rate": 4.620810309330803e-06, "loss": 2.2855, "step": 484 }, { "epoch": 0.9069658719027583, "grad_norm": 18.645483016967773, "learning_rate": 4.442719421385922e-06, "loss": 3.0409, "step": 485 }, { "epoch": 0.908835904628331, "grad_norm": 17.9419002532959, "learning_rate": 4.268050246793276e-06, "loss": 2.664, "step": 486 }, { "epoch": 0.9107059373539037, "grad_norm": 17.279287338256836, "learning_rate": 4.096809040103444e-06, "loss": 2.2947, "step": 487 }, { "epoch": 0.9125759700794764, "grad_norm": 25.570619583129883, "learning_rate": 3.9290019331184145e-06, "loss": 2.6834, "step": 488 }, { "epoch": 0.9144460028050491, "grad_norm": 22.238800048828125, "learning_rate": 3.7646349346719955e-06, "loss": 3.1555, "step": 489 }, { "epoch": 0.9163160355306218, "grad_norm": 18.883892059326172, "learning_rate": 3.6037139304146762e-06, "loss": 2.4276, "step": 490 }, { "epoch": 0.9181860682561945, "grad_norm": 20.898273468017578, "learning_rate": 3.446244682602817e-06, "loss": 3.0974, "step": 491 }, { "epoch": 0.9200561009817672, "grad_norm": 19.29071044921875, "learning_rate": 3.292232829892361e-06, "loss": 2.7486, "step": 492 }, { "epoch": 0.9219261337073399, "grad_norm": 23.598310470581055, "learning_rate": 3.1416838871368924e-06, "loss": 3.4687, "step": 493 }, { "epoch": 0.9237961664329126, "grad_norm": 22.534082412719727, "learning_rate": 2.9946032451902194e-06, "loss": 3.0483, "step": 494 }, { "epoch": 0.9256661991584852, "grad_norm": 20.772890090942383, "learning_rate": 2.8509961707132494e-06, "loss": 2.8497, "step": 495 }, { "epoch": 0.927536231884058, "grad_norm": 27.105192184448242, "learning_rate": 2.7108678059855065e-06, "loss": 3.8906, "step": 496 }, { "epoch": 0.9294062646096307, "grad_norm": 29.60268211364746, "learning_rate": 2.5742231687209017e-06, "loss": 4.1652, "step": 497 }, { "epoch": 0.9312762973352033, "grad_norm": 30.220417022705078, "learning_rate": 2.4410671518880655e-06, "loss": 3.4544, "step": 498 }, { "epoch": 0.9331463300607761, "grad_norm": 34.319881439208984, "learning_rate": 2.311404523535243e-06, "loss": 3.7756, "step": 499 }, { "epoch": 0.9350163627863488, "grad_norm": 48.59423065185547, "learning_rate": 2.1852399266194314e-06, "loss": 4.6194, "step": 500 }, { "epoch": 0.9368863955119214, "grad_norm": 7.484718322753906, "learning_rate": 2.062577878840244e-06, "loss": 2.3524, "step": 501 }, { "epoch": 0.9387564282374942, "grad_norm": 8.843035697937012, "learning_rate": 1.9434227724779984e-06, "loss": 2.4799, "step": 502 }, { "epoch": 0.9406264609630669, "grad_norm": 7.7384161949157715, "learning_rate": 1.8277788742365965e-06, "loss": 1.8435, "step": 503 }, { "epoch": 0.9424964936886395, "grad_norm": 9.377725601196289, "learning_rate": 1.7156503250905898e-06, "loss": 2.1838, "step": 504 }, { "epoch": 0.9443665264142123, "grad_norm": 9.2131929397583, "learning_rate": 1.6070411401370334e-06, "loss": 2.0863, "step": 505 }, { "epoch": 0.946236559139785, "grad_norm": 9.792388916015625, "learning_rate": 1.501955208451633e-06, "loss": 2.5053, "step": 506 }, { "epoch": 0.9481065918653576, "grad_norm": 10.286921501159668, "learning_rate": 1.400396292949513e-06, "loss": 1.9311, "step": 507 }, { "epoch": 0.9499766245909304, "grad_norm": 10.273000717163086, "learning_rate": 1.3023680302504338e-06, "loss": 2.4607, "step": 508 }, { "epoch": 0.9518466573165031, "grad_norm": 9.218442916870117, "learning_rate": 1.207873930548653e-06, "loss": 1.5958, "step": 509 }, { "epoch": 0.9537166900420757, "grad_norm": 12.184614181518555, "learning_rate": 1.1169173774871478e-06, "loss": 2.7458, "step": 510 }, { "epoch": 0.9555867227676484, "grad_norm": 12.237886428833008, "learning_rate": 1.0295016280365112e-06, "loss": 2.8767, "step": 511 }, { "epoch": 0.9574567554932212, "grad_norm": 14.358260154724121, "learning_rate": 9.456298123782902e-07, "loss": 2.5067, "step": 512 }, { "epoch": 0.9593267882187938, "grad_norm": 15.50924301147461, "learning_rate": 8.65304933792932e-07, "loss": 2.7335, "step": 513 }, { "epoch": 0.9611968209443665, "grad_norm": 12.935508728027344, "learning_rate": 7.885298685522235e-07, "loss": 2.3211, "step": 514 }, { "epoch": 0.9630668536699393, "grad_norm": 13.291459083557129, "learning_rate": 7.153073658162646e-07, "loss": 2.5455, "step": 515 }, { "epoch": 0.9649368863955119, "grad_norm": 17.141666412353516, "learning_rate": 6.456400475351232e-07, "loss": 3.0588, "step": 516 }, { "epoch": 0.9668069191210846, "grad_norm": 16.180906295776367, "learning_rate": 5.795304083548559e-07, "loss": 2.8284, "step": 517 }, { "epoch": 0.9686769518466574, "grad_norm": 13.24155044555664, "learning_rate": 5.169808155281786e-07, "loss": 2.2862, "step": 518 }, { "epoch": 0.97054698457223, "grad_norm": 15.022529602050781, "learning_rate": 4.579935088298015e-07, "loss": 2.5049, "step": 519 }, { "epoch": 0.9724170172978027, "grad_norm": 18.232423782348633, "learning_rate": 4.025706004760932e-07, "loss": 3.3347, "step": 520 }, { "epoch": 0.9742870500233755, "grad_norm": 15.241950988769531, "learning_rate": 3.50714075049563e-07, "loss": 2.7126, "step": 521 }, { "epoch": 0.9761570827489481, "grad_norm": 15.458436965942383, "learning_rate": 3.0242578942771825e-07, "loss": 2.6416, "step": 522 }, { "epoch": 0.9780271154745208, "grad_norm": 18.882591247558594, "learning_rate": 2.577074727165951e-07, "loss": 2.461, "step": 523 }, { "epoch": 0.9798971482000935, "grad_norm": 17.970928192138672, "learning_rate": 2.1656072618887468e-07, "loss": 2.9272, "step": 524 }, { "epoch": 0.9817671809256662, "grad_norm": 18.435932159423828, "learning_rate": 1.7898702322648453e-07, "loss": 2.8594, "step": 525 }, { "epoch": 0.9836372136512389, "grad_norm": 19.39750099182129, "learning_rate": 1.449877092679075e-07, "loss": 2.1917, "step": 526 }, { "epoch": 0.9855072463768116, "grad_norm": 20.87994384765625, "learning_rate": 1.1456400175994252e-07, "loss": 3.1765, "step": 527 }, { "epoch": 0.9873772791023843, "grad_norm": 20.146398544311523, "learning_rate": 8.771699011416168e-08, "loss": 2.4422, "step": 528 }, { "epoch": 0.989247311827957, "grad_norm": 21.26473045349121, "learning_rate": 6.444763566786361e-08, "loss": 2.6351, "step": 529 }, { "epoch": 0.9911173445535297, "grad_norm": 24.013477325439453, "learning_rate": 4.475677164966774e-08, "loss": 3.721, "step": 530 }, { "epoch": 0.9929873772791024, "grad_norm": 29.331228256225586, "learning_rate": 2.86451031496604e-08, "loss": 3.3834, "step": 531 }, { "epoch": 0.9948574100046751, "grad_norm": 20.945201873779297, "learning_rate": 1.6113207094181626e-08, "loss": 2.846, "step": 532 }, { "epoch": 0.9967274427302478, "grad_norm": 22.861347198486328, "learning_rate": 7.161532225130607e-09, "loss": 2.9253, "step": 533 }, { "epoch": 0.9985974754558204, "grad_norm": 30.596744537353516, "learning_rate": 1.7903990839229779e-09, "loss": 4.0554, "step": 534 }, { "epoch": 1.0014025245441796, "grad_norm": 14.026424407958984, "learning_rate": 0.0, "loss": 2.7318, "step": 535 } ], "logging_steps": 1, "max_steps": 535, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 134, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8753566709645312e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }