|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0014025245441796, |
|
"eval_steps": 134, |
|
"global_step": 535, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0018700327255726976, |
|
"grad_norm": 13.994741439819336, |
|
"learning_rate": 2e-05, |
|
"loss": 3.2962, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0037400654511453952, |
|
"grad_norm": 16.602272033691406, |
|
"learning_rate": 4e-05, |
|
"loss": 3.6632, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.005610098176718092, |
|
"grad_norm": 21.41153335571289, |
|
"learning_rate": 6e-05, |
|
"loss": 3.8852, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0074801309022907905, |
|
"grad_norm": 15.502434730529785, |
|
"learning_rate": 8e-05, |
|
"loss": 3.8645, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.009350163627863487, |
|
"grad_norm": 14.494356155395508, |
|
"learning_rate": 0.0001, |
|
"loss": 3.868, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.011220196353436185, |
|
"grad_norm": 20.01993751525879, |
|
"learning_rate": 0.00012, |
|
"loss": 3.5737, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.013090229079008883, |
|
"grad_norm": 14.023553848266602, |
|
"learning_rate": 0.00014, |
|
"loss": 3.2279, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.014960261804581581, |
|
"grad_norm": 15.476705551147461, |
|
"learning_rate": 0.00016, |
|
"loss": 3.8968, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.016830294530154277, |
|
"grad_norm": 14.212241172790527, |
|
"learning_rate": 0.00018, |
|
"loss": 3.401, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.018700327255726974, |
|
"grad_norm": 14.703544616699219, |
|
"learning_rate": 0.0002, |
|
"loss": 3.4936, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.020570359981299673, |
|
"grad_norm": 14.499024391174316, |
|
"learning_rate": 0.00019999820960091608, |
|
"loss": 2.5218, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.02244039270687237, |
|
"grad_norm": 16.242597579956055, |
|
"learning_rate": 0.00019999283846777488, |
|
"loss": 3.0075, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.02431042543244507, |
|
"grad_norm": 16.940950393676758, |
|
"learning_rate": 0.00019998388679290583, |
|
"loss": 2.8596, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.026180458158017766, |
|
"grad_norm": 13.73834228515625, |
|
"learning_rate": 0.00019997135489685034, |
|
"loss": 3.0382, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.028050490883590462, |
|
"grad_norm": 16.882173538208008, |
|
"learning_rate": 0.00019995524322835034, |
|
"loss": 3.4678, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.029920523609163162, |
|
"grad_norm": 15.432026863098145, |
|
"learning_rate": 0.00019993555236433213, |
|
"loss": 3.4976, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.031790556334735855, |
|
"grad_norm": 16.355684280395508, |
|
"learning_rate": 0.00019991228300988585, |
|
"loss": 3.3903, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.033660589060308554, |
|
"grad_norm": 13.33781909942627, |
|
"learning_rate": 0.00019988543599824005, |
|
"loss": 3.1913, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.035530621785881254, |
|
"grad_norm": 18.799007415771484, |
|
"learning_rate": 0.0001998550122907321, |
|
"loss": 3.2008, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.03740065451145395, |
|
"grad_norm": 13.545116424560547, |
|
"learning_rate": 0.0001998210129767735, |
|
"loss": 2.8425, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03927068723702665, |
|
"grad_norm": 14.966565132141113, |
|
"learning_rate": 0.00019978343927381113, |
|
"loss": 2.8036, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.04114071996259935, |
|
"grad_norm": 17.932737350463867, |
|
"learning_rate": 0.00019974229252728342, |
|
"loss": 2.9432, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.043010752688172046, |
|
"grad_norm": 16.62327766418457, |
|
"learning_rate": 0.0001996975742105723, |
|
"loss": 3.0234, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.04488078541374474, |
|
"grad_norm": 17.949251174926758, |
|
"learning_rate": 0.00019964928592495045, |
|
"loss": 3.0715, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.04675081813931744, |
|
"grad_norm": 18.255725860595703, |
|
"learning_rate": 0.00019959742939952392, |
|
"loss": 3.155, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.04862085086489014, |
|
"grad_norm": 22.37006187438965, |
|
"learning_rate": 0.0001995420064911702, |
|
"loss": 3.0856, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.05049088359046283, |
|
"grad_norm": 23.708791732788086, |
|
"learning_rate": 0.00019948301918447183, |
|
"loss": 3.1826, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.05236091631603553, |
|
"grad_norm": 21.139902114868164, |
|
"learning_rate": 0.00019942046959164515, |
|
"loss": 3.1289, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.05423094904160823, |
|
"grad_norm": 18.88515281677246, |
|
"learning_rate": 0.0001993543599524649, |
|
"loss": 3.2124, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.056100981767180924, |
|
"grad_norm": 22.860271453857422, |
|
"learning_rate": 0.00019928469263418374, |
|
"loss": 3.1165, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.057971014492753624, |
|
"grad_norm": 25.16404151916504, |
|
"learning_rate": 0.0001992114701314478, |
|
"loss": 3.6229, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.059841047218326324, |
|
"grad_norm": 20.945270538330078, |
|
"learning_rate": 0.00019913469506620707, |
|
"loss": 3.2879, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.061711079943899017, |
|
"grad_norm": 19.237075805664062, |
|
"learning_rate": 0.0001990543701876217, |
|
"loss": 2.9019, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.06358111266947171, |
|
"grad_norm": 21.761228561401367, |
|
"learning_rate": 0.0001989704983719635, |
|
"loss": 2.7975, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.06545114539504442, |
|
"grad_norm": 18.43031120300293, |
|
"learning_rate": 0.00019888308262251285, |
|
"loss": 2.6259, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.06732117812061711, |
|
"grad_norm": 29.478282928466797, |
|
"learning_rate": 0.00019879212606945136, |
|
"loss": 3.4395, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0691912108461898, |
|
"grad_norm": 20.588619232177734, |
|
"learning_rate": 0.00019869763196974957, |
|
"loss": 2.4942, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.07106124357176251, |
|
"grad_norm": 25.75873374938965, |
|
"learning_rate": 0.0001985996037070505, |
|
"loss": 3.6293, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0729312762973352, |
|
"grad_norm": 23.140533447265625, |
|
"learning_rate": 0.00019849804479154837, |
|
"loss": 2.928, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0748013090229079, |
|
"grad_norm": 25.42648696899414, |
|
"learning_rate": 0.00019839295885986296, |
|
"loss": 3.1603, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0766713417484806, |
|
"grad_norm": 25.744718551635742, |
|
"learning_rate": 0.00019828434967490943, |
|
"loss": 3.3389, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.0785413744740533, |
|
"grad_norm": 39.95504379272461, |
|
"learning_rate": 0.0001981722211257634, |
|
"loss": 3.6146, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.08041140719962599, |
|
"grad_norm": 32.589569091796875, |
|
"learning_rate": 0.00019805657722752202, |
|
"loss": 2.9932, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.0822814399251987, |
|
"grad_norm": 48.44812774658203, |
|
"learning_rate": 0.00019793742212115978, |
|
"loss": 3.2977, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.08415147265077139, |
|
"grad_norm": 46.82399368286133, |
|
"learning_rate": 0.00019781476007338058, |
|
"loss": 3.3127, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.08602150537634409, |
|
"grad_norm": 35.45515441894531, |
|
"learning_rate": 0.00019768859547646478, |
|
"loss": 3.409, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.08789153810191679, |
|
"grad_norm": 28.829622268676758, |
|
"learning_rate": 0.00019755893284811196, |
|
"loss": 3.4779, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.08976157082748948, |
|
"grad_norm": 42.49812316894531, |
|
"learning_rate": 0.00019742577683127911, |
|
"loss": 3.7014, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.09163160355306219, |
|
"grad_norm": 77.00621795654297, |
|
"learning_rate": 0.0001972891321940145, |
|
"loss": 5.1841, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.09350163627863488, |
|
"grad_norm": 46.6325798034668, |
|
"learning_rate": 0.00019714900382928675, |
|
"loss": 3.8944, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09537166900420757, |
|
"grad_norm": 118.46067810058594, |
|
"learning_rate": 0.0001970053967548098, |
|
"loss": 5.2813, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.09724170172978028, |
|
"grad_norm": 94.66202545166016, |
|
"learning_rate": 0.0001968583161128631, |
|
"loss": 4.5682, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.09911173445535297, |
|
"grad_norm": 60.450828552246094, |
|
"learning_rate": 0.00019670776717010767, |
|
"loss": 3.9013, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.10098176718092566, |
|
"grad_norm": 25.923925399780273, |
|
"learning_rate": 0.0001965537553173972, |
|
"loss": 3.6057, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.10285179990649837, |
|
"grad_norm": 14.4677152633667, |
|
"learning_rate": 0.00019639628606958533, |
|
"loss": 3.0911, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.10472183263207106, |
|
"grad_norm": 12.290410041809082, |
|
"learning_rate": 0.000196235365065328, |
|
"loss": 2.6739, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.10659186535764376, |
|
"grad_norm": 14.281597137451172, |
|
"learning_rate": 0.0001960709980668816, |
|
"loss": 2.8039, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.10846189808321646, |
|
"grad_norm": 12.305088996887207, |
|
"learning_rate": 0.0001959031909598966, |
|
"loss": 2.8797, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.11033193080878916, |
|
"grad_norm": 12.407671928405762, |
|
"learning_rate": 0.00019573194975320673, |
|
"loss": 2.402, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.11220196353436185, |
|
"grad_norm": 12.547879219055176, |
|
"learning_rate": 0.0001955572805786141, |
|
"loss": 2.5572, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11407199625993455, |
|
"grad_norm": 15.23267650604248, |
|
"learning_rate": 0.0001953791896906692, |
|
"loss": 3.1069, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.11594202898550725, |
|
"grad_norm": 14.644232749938965, |
|
"learning_rate": 0.00019519768346644737, |
|
"loss": 2.4441, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.11781206171107994, |
|
"grad_norm": 14.574198722839355, |
|
"learning_rate": 0.00019501276840532016, |
|
"loss": 3.1865, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.11968209443665265, |
|
"grad_norm": 15.284114837646484, |
|
"learning_rate": 0.00019482445112872264, |
|
"loss": 2.7122, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.12155212716222534, |
|
"grad_norm": 14.484579086303711, |
|
"learning_rate": 0.00019463273837991643, |
|
"loss": 2.9405, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.12342215988779803, |
|
"grad_norm": 13.775676727294922, |
|
"learning_rate": 0.00019443763702374812, |
|
"loss": 2.8403, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.12529219261337074, |
|
"grad_norm": 14.479483604431152, |
|
"learning_rate": 0.0001942391540464035, |
|
"loss": 2.9306, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.12716222533894342, |
|
"grad_norm": 13.77834701538086, |
|
"learning_rate": 0.00019403729655515737, |
|
"loss": 2.7441, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.12903225806451613, |
|
"grad_norm": 17.22318458557129, |
|
"learning_rate": 0.0001938320717781191, |
|
"loss": 2.793, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.13090229079008883, |
|
"grad_norm": 14.486649513244629, |
|
"learning_rate": 0.00019362348706397373, |
|
"loss": 2.8446, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1327723235156615, |
|
"grad_norm": 19.49801254272461, |
|
"learning_rate": 0.0001934115498817189, |
|
"loss": 3.0295, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.13464235624123422, |
|
"grad_norm": 17.23444938659668, |
|
"learning_rate": 0.00019319626782039734, |
|
"loss": 2.9814, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.13651238896680692, |
|
"grad_norm": 18.949792861938477, |
|
"learning_rate": 0.00019297764858882514, |
|
"loss": 3.0234, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.1383824216923796, |
|
"grad_norm": 18.541202545166016, |
|
"learning_rate": 0.00019275570001531578, |
|
"loss": 3.1777, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.1402524544179523, |
|
"grad_norm": 21.238248825073242, |
|
"learning_rate": 0.00019253043004739968, |
|
"loss": 3.677, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.14212248714352502, |
|
"grad_norm": 24.113332748413086, |
|
"learning_rate": 0.00019230184675153976, |
|
"loss": 3.2631, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.1439925198690977, |
|
"grad_norm": 17.838098526000977, |
|
"learning_rate": 0.00019206995831284242, |
|
"loss": 2.728, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.1458625525946704, |
|
"grad_norm": 18.885387420654297, |
|
"learning_rate": 0.00019183477303476467, |
|
"loss": 2.9679, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.1477325853202431, |
|
"grad_norm": 19.87195587158203, |
|
"learning_rate": 0.00019159629933881666, |
|
"loss": 2.9025, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.1496026180458158, |
|
"grad_norm": 19.265392303466797, |
|
"learning_rate": 0.0001913545457642601, |
|
"loss": 3.4623, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1514726507713885, |
|
"grad_norm": 19.50520133972168, |
|
"learning_rate": 0.00019110952096780258, |
|
"loss": 3.1419, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.1533426834969612, |
|
"grad_norm": 20.407455444335938, |
|
"learning_rate": 0.00019086123372328746, |
|
"loss": 2.5145, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.15521271622253388, |
|
"grad_norm": 42.79829788208008, |
|
"learning_rate": 0.00019060969292137992, |
|
"loss": 3.0625, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.1570827489481066, |
|
"grad_norm": 20.8466796875, |
|
"learning_rate": 0.00019035490756924832, |
|
"loss": 2.9626, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.1589527816736793, |
|
"grad_norm": 21.628929138183594, |
|
"learning_rate": 0.0001900968867902419, |
|
"loss": 2.8822, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.16082281439925197, |
|
"grad_norm": 29.18246078491211, |
|
"learning_rate": 0.00018983563982356405, |
|
"loss": 3.4651, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.16269284712482468, |
|
"grad_norm": 26.07509422302246, |
|
"learning_rate": 0.0001895711760239413, |
|
"loss": 3.0619, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.1645628798503974, |
|
"grad_norm": 39.00653076171875, |
|
"learning_rate": 0.00018930350486128856, |
|
"loss": 3.1362, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.16643291257597007, |
|
"grad_norm": 23.455015182495117, |
|
"learning_rate": 0.00018903263592036989, |
|
"loss": 3.4385, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.16830294530154277, |
|
"grad_norm": 29.16703224182129, |
|
"learning_rate": 0.00018875857890045543, |
|
"loss": 3.1283, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.17017297802711548, |
|
"grad_norm": 29.75694465637207, |
|
"learning_rate": 0.00018848134361497385, |
|
"loss": 3.0399, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.17204301075268819, |
|
"grad_norm": 28.572084426879883, |
|
"learning_rate": 0.00018820093999116124, |
|
"loss": 3.8989, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.17391304347826086, |
|
"grad_norm": 50.12794876098633, |
|
"learning_rate": 0.00018791737806970538, |
|
"loss": 3.6389, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.17578307620383357, |
|
"grad_norm": 23.859804153442383, |
|
"learning_rate": 0.00018763066800438636, |
|
"loss": 3.1504, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.17765310892940628, |
|
"grad_norm": 44.48038864135742, |
|
"learning_rate": 0.00018734082006171299, |
|
"loss": 3.1353, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.17952314165497896, |
|
"grad_norm": 26.45856475830078, |
|
"learning_rate": 0.00018704784462055503, |
|
"loss": 2.7182, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.18139317438055166, |
|
"grad_norm": 30.120588302612305, |
|
"learning_rate": 0.00018675175217177175, |
|
"loss": 3.6129, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.18326320710612437, |
|
"grad_norm": 61.866886138916016, |
|
"learning_rate": 0.00018645255331783617, |
|
"loss": 5.1282, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.18513323983169705, |
|
"grad_norm": 39.70136260986328, |
|
"learning_rate": 0.00018615025877245523, |
|
"loss": 3.8535, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.18700327255726976, |
|
"grad_norm": 51.10054397583008, |
|
"learning_rate": 0.00018584487936018661, |
|
"loss": 3.8204, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18887330528284246, |
|
"grad_norm": 160.09117126464844, |
|
"learning_rate": 0.00018553642601605068, |
|
"loss": 5.7671, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.19074333800841514, |
|
"grad_norm": 126.96265411376953, |
|
"learning_rate": 0.0001852249097851391, |
|
"loss": 5.543, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.19261337073398785, |
|
"grad_norm": 79.90239715576172, |
|
"learning_rate": 0.0001849103418222194, |
|
"loss": 4.1758, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.19448340345956056, |
|
"grad_norm": 48.66373062133789, |
|
"learning_rate": 0.00018459273339133537, |
|
"loss": 3.4729, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.19635343618513323, |
|
"grad_norm": 31.121978759765625, |
|
"learning_rate": 0.0001842720958654039, |
|
"loss": 3.2988, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.19822346891070594, |
|
"grad_norm": 15.884181022644043, |
|
"learning_rate": 0.00018394844072580773, |
|
"loss": 3.4934, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.20009350163627865, |
|
"grad_norm": 13.574240684509277, |
|
"learning_rate": 0.00018362177956198408, |
|
"loss": 2.6176, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.20196353436185133, |
|
"grad_norm": 11.254271507263184, |
|
"learning_rate": 0.00018329212407100994, |
|
"loss": 2.3752, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.20383356708742403, |
|
"grad_norm": 11.554123878479004, |
|
"learning_rate": 0.00018295948605718314, |
|
"loss": 2.295, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.20570359981299674, |
|
"grad_norm": 11.747841835021973, |
|
"learning_rate": 0.0001826238774315995, |
|
"loss": 2.378, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.20757363253856942, |
|
"grad_norm": 13.485661506652832, |
|
"learning_rate": 0.00018228531021172658, |
|
"loss": 2.6167, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.20944366526414213, |
|
"grad_norm": 12.09218692779541, |
|
"learning_rate": 0.0001819437965209732, |
|
"loss": 2.1478, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.21131369798971483, |
|
"grad_norm": 15.334200859069824, |
|
"learning_rate": 0.0001815993485882553, |
|
"loss": 2.7231, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.2131837307152875, |
|
"grad_norm": 14.982645988464355, |
|
"learning_rate": 0.0001812519787475582, |
|
"loss": 2.7762, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.21505376344086022, |
|
"grad_norm": 15.121198654174805, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 2.6204, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.21692379616643293, |
|
"grad_norm": 12.098959922790527, |
|
"learning_rate": 0.0001805485232008601, |
|
"loss": 2.6039, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.2187938288920056, |
|
"grad_norm": 13.73119068145752, |
|
"learning_rate": 0.0001801924626841824, |
|
"loss": 2.7469, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.2206638616175783, |
|
"grad_norm": 14.34265422821045, |
|
"learning_rate": 0.00017983353063727016, |
|
"loss": 3.0645, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.22253389434315102, |
|
"grad_norm": 13.765779495239258, |
|
"learning_rate": 0.00017947173991275555, |
|
"loss": 3.014, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.2244039270687237, |
|
"grad_norm": 17.305545806884766, |
|
"learning_rate": 0.00017910710346563416, |
|
"loss": 2.9139, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2262739597942964, |
|
"grad_norm": 13.76815414428711, |
|
"learning_rate": 0.00017873963435280121, |
|
"loss": 2.6729, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.2281439925198691, |
|
"grad_norm": 16.517879486083984, |
|
"learning_rate": 0.000178369345732584, |
|
"loss": 3.1773, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.2300140252454418, |
|
"grad_norm": 18.648420333862305, |
|
"learning_rate": 0.00017799625086427064, |
|
"loss": 3.5063, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.2318840579710145, |
|
"grad_norm": 19.074386596679688, |
|
"learning_rate": 0.00017762036310763532, |
|
"loss": 2.743, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.2337540906965872, |
|
"grad_norm": 16.391334533691406, |
|
"learning_rate": 0.00017724169592245995, |
|
"loss": 2.7288, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.23562412342215988, |
|
"grad_norm": 26.434553146362305, |
|
"learning_rate": 0.0001768602628680522, |
|
"loss": 2.6386, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.2374941561477326, |
|
"grad_norm": 18.820554733276367, |
|
"learning_rate": 0.00017647607760275987, |
|
"loss": 3.3189, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.2393641888733053, |
|
"grad_norm": 18.246240615844727, |
|
"learning_rate": 0.00017608915388348187, |
|
"loss": 2.9794, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.24123422159887797, |
|
"grad_norm": 24.38311004638672, |
|
"learning_rate": 0.00017569950556517566, |
|
"loss": 2.9017, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.24310425432445068, |
|
"grad_norm": 17.729557037353516, |
|
"learning_rate": 0.00017530714660036112, |
|
"loss": 2.7735, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2449742870500234, |
|
"grad_norm": 20.780271530151367, |
|
"learning_rate": 0.00017491209103862084, |
|
"loss": 3.2586, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.24684431977559607, |
|
"grad_norm": 25.706838607788086, |
|
"learning_rate": 0.00017451435302609714, |
|
"loss": 2.9574, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.24871435250116877, |
|
"grad_norm": 17.675687789916992, |
|
"learning_rate": 0.0001741139468049855, |
|
"loss": 2.6356, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.2505843852267415, |
|
"grad_norm": 23.035436630249023, |
|
"learning_rate": 0.0001737108867130245, |
|
"loss": 3.3455, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.2505843852267415, |
|
"eval_loss": 3.025357723236084, |
|
"eval_runtime": 12.856, |
|
"eval_samples_per_second": 17.579, |
|
"eval_steps_per_second": 8.79, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.25245441795231416, |
|
"grad_norm": 17.924945831298828, |
|
"learning_rate": 0.00017330518718298264, |
|
"loss": 2.8946, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.25432445067788684, |
|
"grad_norm": 20.890548706054688, |
|
"learning_rate": 0.00017289686274214118, |
|
"loss": 2.9531, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.25619448340345957, |
|
"grad_norm": 19.297199249267578, |
|
"learning_rate": 0.0001724859280117742, |
|
"loss": 2.7719, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.25806451612903225, |
|
"grad_norm": 23.349380493164062, |
|
"learning_rate": 0.000172072397706625, |
|
"loss": 3.5567, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.25993454885460493, |
|
"grad_norm": 24.923961639404297, |
|
"learning_rate": 0.00017165628663437922, |
|
"loss": 3.113, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.26180458158017766, |
|
"grad_norm": 21.864042282104492, |
|
"learning_rate": 0.0001712376096951345, |
|
"loss": 2.9502, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.26367461430575034, |
|
"grad_norm": 30.322755813598633, |
|
"learning_rate": 0.00017081638188086697, |
|
"loss": 3.7115, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.265544647031323, |
|
"grad_norm": 23.991077423095703, |
|
"learning_rate": 0.0001703926182748945, |
|
"loss": 3.3406, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.26741467975689576, |
|
"grad_norm": 24.86219024658203, |
|
"learning_rate": 0.00016996633405133655, |
|
"loss": 3.347, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.26928471248246844, |
|
"grad_norm": 29.057273864746094, |
|
"learning_rate": 0.00016953754447457078, |
|
"loss": 2.6652, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.2711547452080411, |
|
"grad_norm": 25.7908992767334, |
|
"learning_rate": 0.00016910626489868649, |
|
"loss": 3.1458, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.27302477793361385, |
|
"grad_norm": 27.767778396606445, |
|
"learning_rate": 0.00016867251076693482, |
|
"loss": 3.3319, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.27489481065918653, |
|
"grad_norm": 30.76727294921875, |
|
"learning_rate": 0.0001682362976111758, |
|
"loss": 3.682, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.2767648433847592, |
|
"grad_norm": 25.062744140625, |
|
"learning_rate": 0.0001677976410513221, |
|
"loss": 3.0067, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.27863487611033194, |
|
"grad_norm": 48.073890686035156, |
|
"learning_rate": 0.00016735655679477979, |
|
"loss": 4.0274, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.2805049088359046, |
|
"grad_norm": 44.6626091003418, |
|
"learning_rate": 0.00016691306063588583, |
|
"loss": 3.1489, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2823749415614773, |
|
"grad_norm": 73.71078491210938, |
|
"learning_rate": 0.0001664671684553426, |
|
"loss": 4.4787, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.28424497428705003, |
|
"grad_norm": 56.24951934814453, |
|
"learning_rate": 0.00016601889621964904, |
|
"loss": 3.7768, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.2861150070126227, |
|
"grad_norm": 46.72991943359375, |
|
"learning_rate": 0.00016556825998052924, |
|
"loss": 3.5245, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.2879850397381954, |
|
"grad_norm": 31.89575958251953, |
|
"learning_rate": 0.00016511527587435737, |
|
"loss": 3.4228, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.2898550724637681, |
|
"grad_norm": 18.977670669555664, |
|
"learning_rate": 0.00016465996012157995, |
|
"loss": 3.0195, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.2917251051893408, |
|
"grad_norm": 12.196249008178711, |
|
"learning_rate": 0.00016420232902613523, |
|
"loss": 2.8431, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.2935951379149135, |
|
"grad_norm": 11.746070861816406, |
|
"learning_rate": 0.000163742398974869, |
|
"loss": 2.5194, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.2954651706404862, |
|
"grad_norm": 12.877666473388672, |
|
"learning_rate": 0.00016328018643694812, |
|
"loss": 2.695, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.2973352033660589, |
|
"grad_norm": 11.190670013427734, |
|
"learning_rate": 0.00016281570796327068, |
|
"loss": 3.0105, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.2992052360916316, |
|
"grad_norm": 12.561261177062988, |
|
"learning_rate": 0.00016234898018587337, |
|
"loss": 2.113, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3010752688172043, |
|
"grad_norm": 20.69705581665039, |
|
"learning_rate": 0.00016188001981733588, |
|
"loss": 2.7286, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.302945301542777, |
|
"grad_norm": 13.717788696289062, |
|
"learning_rate": 0.00016140884365018252, |
|
"loss": 2.9779, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.30481533426834967, |
|
"grad_norm": 12.364960670471191, |
|
"learning_rate": 0.00016093546855628084, |
|
"loss": 2.6577, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.3066853669939224, |
|
"grad_norm": 12.219094276428223, |
|
"learning_rate": 0.0001604599114862375, |
|
"loss": 2.3431, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.3085553997194951, |
|
"grad_norm": 14.56771183013916, |
|
"learning_rate": 0.00015998218946879138, |
|
"loss": 2.7715, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.31042543244506776, |
|
"grad_norm": 14.490442276000977, |
|
"learning_rate": 0.00015950231961020373, |
|
"loss": 2.7294, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.3122954651706405, |
|
"grad_norm": 13.455503463745117, |
|
"learning_rate": 0.00015902031909364564, |
|
"loss": 2.7422, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.3141654978962132, |
|
"grad_norm": 14.774016380310059, |
|
"learning_rate": 0.00015853620517858276, |
|
"loss": 2.807, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.31603553062178585, |
|
"grad_norm": 15.136216163635254, |
|
"learning_rate": 0.00015804999520015734, |
|
"loss": 2.8907, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.3179055633473586, |
|
"grad_norm": 14.85161018371582, |
|
"learning_rate": 0.00015756170656856737, |
|
"loss": 2.8376, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.31977559607293127, |
|
"grad_norm": 16.69162368774414, |
|
"learning_rate": 0.0001570713567684432, |
|
"loss": 2.8552, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.32164562879850395, |
|
"grad_norm": 13.032023429870605, |
|
"learning_rate": 0.00015657896335822147, |
|
"loss": 3.02, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.3235156615240767, |
|
"grad_norm": 16.295326232910156, |
|
"learning_rate": 0.00015608454396951645, |
|
"loss": 3.0504, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.32538569424964936, |
|
"grad_norm": 15.998780250549316, |
|
"learning_rate": 0.00015558811630648846, |
|
"loss": 3.0265, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.32725572697522204, |
|
"grad_norm": 25.61530113220215, |
|
"learning_rate": 0.00015508969814521025, |
|
"loss": 2.8652, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.3291257597007948, |
|
"grad_norm": 15.13475227355957, |
|
"learning_rate": 0.00015458930733303018, |
|
"loss": 2.5502, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.33099579242636745, |
|
"grad_norm": 13.900614738464355, |
|
"learning_rate": 0.00015408696178793331, |
|
"loss": 2.9654, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.33286582515194013, |
|
"grad_norm": 16.07615852355957, |
|
"learning_rate": 0.00015358267949789966, |
|
"loss": 2.7143, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.33473585787751287, |
|
"grad_norm": 18.99631118774414, |
|
"learning_rate": 0.0001530764785202603, |
|
"loss": 2.8096, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.33660589060308554, |
|
"grad_norm": 20.583770751953125, |
|
"learning_rate": 0.00015256837698105047, |
|
"loss": 2.9865, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.3384759233286583, |
|
"grad_norm": 18.920778274536133, |
|
"learning_rate": 0.00015205839307436088, |
|
"loss": 3.3376, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.34034595605423096, |
|
"grad_norm": 18.719928741455078, |
|
"learning_rate": 0.00015154654506168585, |
|
"loss": 2.7643, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.34221598877980364, |
|
"grad_norm": 22.60373878479004, |
|
"learning_rate": 0.00015103285127126962, |
|
"loss": 2.9539, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.34408602150537637, |
|
"grad_norm": 19.586341857910156, |
|
"learning_rate": 0.00015051733009745013, |
|
"loss": 2.5908, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.34595605423094905, |
|
"grad_norm": 20.427947998046875, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 2.6959, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.34782608695652173, |
|
"grad_norm": 19.461990356445312, |
|
"learning_rate": 0.000149480879503466, |
|
"loss": 2.9813, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.34969611968209446, |
|
"grad_norm": 23.931228637695312, |
|
"learning_rate": 0.00014895998719650526, |
|
"loss": 3.047, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.35156615240766714, |
|
"grad_norm": 19.459693908691406, |
|
"learning_rate": 0.00014843734173122002, |
|
"loss": 2.9688, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.3534361851332398, |
|
"grad_norm": 25.261966705322266, |
|
"learning_rate": 0.0001479129618224895, |
|
"loss": 2.8956, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.35530621785881256, |
|
"grad_norm": 23.84093475341797, |
|
"learning_rate": 0.00014738686624729986, |
|
"loss": 3.4096, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.35717625058438524, |
|
"grad_norm": 23.549619674682617, |
|
"learning_rate": 0.00014685907384407186, |
|
"loss": 2.9033, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.3590462833099579, |
|
"grad_norm": 21.206594467163086, |
|
"learning_rate": 0.00014632960351198618, |
|
"loss": 3.0757, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.36091631603553065, |
|
"grad_norm": 21.78272247314453, |
|
"learning_rate": 0.00014579847421030678, |
|
"loss": 2.9197, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.3627863487611033, |
|
"grad_norm": 24.570682525634766, |
|
"learning_rate": 0.00014526570495770194, |
|
"loss": 3.6499, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.364656381486676, |
|
"grad_norm": 28.85085105895996, |
|
"learning_rate": 0.00014473131483156327, |
|
"loss": 2.6047, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.36652641421224874, |
|
"grad_norm": 27.24856948852539, |
|
"learning_rate": 0.0001441953229673227, |
|
"loss": 3.0789, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.3683964469378214, |
|
"grad_norm": 28.561187744140625, |
|
"learning_rate": 0.000143657748557767, |
|
"loss": 3.4813, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.3702664796633941, |
|
"grad_norm": 27.148723602294922, |
|
"learning_rate": 0.00014311861085235085, |
|
"loss": 3.0691, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.37213651238896683, |
|
"grad_norm": 31.670246124267578, |
|
"learning_rate": 0.00014257792915650728, |
|
"loss": 3.9914, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.3740065451145395, |
|
"grad_norm": 42.1184196472168, |
|
"learning_rate": 0.00014203572283095657, |
|
"loss": 4.6206, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3758765778401122, |
|
"grad_norm": 40.36967086791992, |
|
"learning_rate": 0.00014149201129101286, |
|
"loss": 3.2648, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.3777466105656849, |
|
"grad_norm": 38.3328971862793, |
|
"learning_rate": 0.00014094681400588906, |
|
"loss": 3.248, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.3796166432912576, |
|
"grad_norm": 36.41666030883789, |
|
"learning_rate": 0.00014040015049799953, |
|
"loss": 3.2585, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.3814866760168303, |
|
"grad_norm": 23.713642120361328, |
|
"learning_rate": 0.00013985204034226115, |
|
"loss": 2.8715, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.383356708742403, |
|
"grad_norm": 13.198444366455078, |
|
"learning_rate": 0.00013930250316539238, |
|
"loss": 3.0464, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.3852267414679757, |
|
"grad_norm": 11.255416870117188, |
|
"learning_rate": 0.0001387515586452103, |
|
"loss": 2.7254, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.3870967741935484, |
|
"grad_norm": 11.445615768432617, |
|
"learning_rate": 0.00013819922650992625, |
|
"loss": 2.3275, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.3889668069191211, |
|
"grad_norm": 11.428658485412598, |
|
"learning_rate": 0.0001376455265374392, |
|
"loss": 2.7019, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.3908368396446938, |
|
"grad_norm": 11.425288200378418, |
|
"learning_rate": 0.00013709047855462765, |
|
"loss": 2.6879, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.39270687237026647, |
|
"grad_norm": 9.648893356323242, |
|
"learning_rate": 0.00013653410243663952, |
|
"loss": 2.1931, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3945769050958392, |
|
"grad_norm": 12.493631362915039, |
|
"learning_rate": 0.00013597641810618073, |
|
"loss": 2.5069, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.3964469378214119, |
|
"grad_norm": 11.868135452270508, |
|
"learning_rate": 0.0001354174455328015, |
|
"loss": 2.8156, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.39831697054698456, |
|
"grad_norm": 10.956955909729004, |
|
"learning_rate": 0.00013485720473218154, |
|
"loss": 2.3458, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.4001870032725573, |
|
"grad_norm": 12.486101150512695, |
|
"learning_rate": 0.00013429571576541315, |
|
"loss": 2.559, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.40205703599813, |
|
"grad_norm": 13.160475730895996, |
|
"learning_rate": 0.00013373299873828303, |
|
"loss": 2.5989, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.40392706872370265, |
|
"grad_norm": 14.06461238861084, |
|
"learning_rate": 0.00013316907380055208, |
|
"loss": 2.9656, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.4057971014492754, |
|
"grad_norm": 13.031517028808594, |
|
"learning_rate": 0.0001326039611452342, |
|
"loss": 2.494, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.40766713417484807, |
|
"grad_norm": 12.205702781677246, |
|
"learning_rate": 0.00013203768100787297, |
|
"loss": 2.4097, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.40953716690042075, |
|
"grad_norm": 13.180871963500977, |
|
"learning_rate": 0.0001314702536658172, |
|
"loss": 2.5468, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.4114071996259935, |
|
"grad_norm": 16.636207580566406, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 3.3479, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.41327723235156616, |
|
"grad_norm": 13.308319091796875, |
|
"learning_rate": 0.000130332038681685, |
|
"loss": 2.597, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.41514726507713884, |
|
"grad_norm": 13.627033233642578, |
|
"learning_rate": 0.00012976129179678988, |
|
"loss": 2.4171, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.4170172978027116, |
|
"grad_norm": 15.449974060058594, |
|
"learning_rate": 0.00012918947922010336, |
|
"loss": 3.2259, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.41888733052828425, |
|
"grad_norm": 16.265350341796875, |
|
"learning_rate": 0.00012861662142707968, |
|
"loss": 2.7149, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.42075736325385693, |
|
"grad_norm": 14.941640853881836, |
|
"learning_rate": 0.00012804273893060028, |
|
"loss": 2.9722, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.42262739597942967, |
|
"grad_norm": 19.596858978271484, |
|
"learning_rate": 0.00012746785228023904, |
|
"loss": 2.9886, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.42449742870500234, |
|
"grad_norm": 23.8212947845459, |
|
"learning_rate": 0.00012689198206152657, |
|
"loss": 3.0503, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.426367461430575, |
|
"grad_norm": 16.560731887817383, |
|
"learning_rate": 0.0001263151488952132, |
|
"loss": 2.6271, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.42823749415614776, |
|
"grad_norm": 16.66946029663086, |
|
"learning_rate": 0.00012573737343653024, |
|
"loss": 2.804, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.43010752688172044, |
|
"grad_norm": 22.840084075927734, |
|
"learning_rate": 0.00012515867637445086, |
|
"loss": 2.7086, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.4319775596072931, |
|
"grad_norm": 14.753687858581543, |
|
"learning_rate": 0.00012457907843094882, |
|
"loss": 2.5197, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.43384759233286585, |
|
"grad_norm": 14.791753768920898, |
|
"learning_rate": 0.0001239986003602566, |
|
"loss": 2.394, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.43571762505843853, |
|
"grad_norm": 22.011295318603516, |
|
"learning_rate": 0.00012341726294812238, |
|
"loss": 2.5769, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.4375876577840112, |
|
"grad_norm": 20.5321102142334, |
|
"learning_rate": 0.00012283508701106557, |
|
"loss": 3.0704, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.43945769050958394, |
|
"grad_norm": 16.91661262512207, |
|
"learning_rate": 0.00012225209339563145, |
|
"loss": 2.8309, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.4413277232351566, |
|
"grad_norm": 26.08629608154297, |
|
"learning_rate": 0.00012166830297764471, |
|
"loss": 3.0209, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.4431977559607293, |
|
"grad_norm": 19.537260055541992, |
|
"learning_rate": 0.00012108373666146191, |
|
"loss": 2.8018, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.44506778868630203, |
|
"grad_norm": 23.111852645874023, |
|
"learning_rate": 0.00012049841537922307, |
|
"loss": 3.0249, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.4469378214118747, |
|
"grad_norm": 19.322980880737305, |
|
"learning_rate": 0.00011991236009010183, |
|
"loss": 3.0254, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.4488078541374474, |
|
"grad_norm": 30.11720848083496, |
|
"learning_rate": 0.00011932559177955533, |
|
"loss": 3.9059, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4506778868630201, |
|
"grad_norm": 24.715557098388672, |
|
"learning_rate": 0.00011873813145857249, |
|
"loss": 3.1555, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.4525479195885928, |
|
"grad_norm": 29.622190475463867, |
|
"learning_rate": 0.00011815000016292164, |
|
"loss": 2.6222, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.4544179523141655, |
|
"grad_norm": 24.93659019470215, |
|
"learning_rate": 0.00011756121895239753, |
|
"loss": 2.9839, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.4562879850397382, |
|
"grad_norm": 26.67534637451172, |
|
"learning_rate": 0.00011697180891006689, |
|
"loss": 3.1704, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.4581580177653109, |
|
"grad_norm": 25.920337677001953, |
|
"learning_rate": 0.00011638179114151377, |
|
"loss": 2.653, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.4600280504908836, |
|
"grad_norm": 29.56134796142578, |
|
"learning_rate": 0.0001157911867740836, |
|
"loss": 2.9653, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.4618980832164563, |
|
"grad_norm": 35.859981536865234, |
|
"learning_rate": 0.00011520001695612674, |
|
"loss": 3.6873, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.463768115942029, |
|
"grad_norm": 41.76987838745117, |
|
"learning_rate": 0.00011460830285624118, |
|
"loss": 3.7447, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.46563814866760167, |
|
"grad_norm": 38.14467239379883, |
|
"learning_rate": 0.0001140160656625146, |
|
"loss": 4.65, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.4675081813931744, |
|
"grad_norm": 55.1082649230957, |
|
"learning_rate": 0.00011342332658176555, |
|
"loss": 3.6235, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.4693782141187471, |
|
"grad_norm": 34.86373519897461, |
|
"learning_rate": 0.00011283010683878423, |
|
"loss": 3.1883, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.47124824684431976, |
|
"grad_norm": 30.727251052856445, |
|
"learning_rate": 0.00011223642767557227, |
|
"loss": 3.3415, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.4731182795698925, |
|
"grad_norm": 32.8022575378418, |
|
"learning_rate": 0.00011164231035058228, |
|
"loss": 3.1891, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.4749883122954652, |
|
"grad_norm": 21.999401092529297, |
|
"learning_rate": 0.00011104777613795661, |
|
"loss": 3.1296, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.47685834502103785, |
|
"grad_norm": 20.68447494506836, |
|
"learning_rate": 0.00011045284632676536, |
|
"loss": 2.7139, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.4787283777466106, |
|
"grad_norm": 13.36279010772705, |
|
"learning_rate": 0.00010985754222024436, |
|
"loss": 2.4648, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.48059841047218327, |
|
"grad_norm": 10.14539909362793, |
|
"learning_rate": 0.00010926188513503215, |
|
"loss": 2.4138, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.48246844319775595, |
|
"grad_norm": 9.960616111755371, |
|
"learning_rate": 0.00010866589640040669, |
|
"loss": 2.7157, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.4843384759233287, |
|
"grad_norm": 10.990545272827148, |
|
"learning_rate": 0.00010806959735752174, |
|
"loss": 2.4142, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.48620850864890136, |
|
"grad_norm": 10.179009437561035, |
|
"learning_rate": 0.00010747300935864243, |
|
"loss": 2.0645, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.48807854137447404, |
|
"grad_norm": 9.950582504272461, |
|
"learning_rate": 0.00010687615376638093, |
|
"loss": 2.2024, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.4899485741000468, |
|
"grad_norm": 10.244935989379883, |
|
"learning_rate": 0.00010627905195293135, |
|
"loss": 2.1277, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.49181860682561945, |
|
"grad_norm": 11.46534252166748, |
|
"learning_rate": 0.00010568172529930447, |
|
"loss": 2.534, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.49368863955119213, |
|
"grad_norm": 13.022493362426758, |
|
"learning_rate": 0.00010508419519456219, |
|
"loss": 2.3155, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.49555867227676487, |
|
"grad_norm": 15.116632461547852, |
|
"learning_rate": 0.00010448648303505151, |
|
"loss": 3.0314, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.49742870500233755, |
|
"grad_norm": 13.014046669006348, |
|
"learning_rate": 0.0001038886102236385, |
|
"loss": 2.4966, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.4992987377279102, |
|
"grad_norm": 11.42091178894043, |
|
"learning_rate": 0.00010329059816894186, |
|
"loss": 2.56, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.501168770453483, |
|
"grad_norm": 12.75170612335205, |
|
"learning_rate": 0.00010269246828456629, |
|
"loss": 2.5727, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.501168770453483, |
|
"eval_loss": 2.828613758087158, |
|
"eval_runtime": 12.8533, |
|
"eval_samples_per_second": 17.583, |
|
"eval_steps_per_second": 8.792, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.5030388031790556, |
|
"grad_norm": 13.714940071105957, |
|
"learning_rate": 0.0001020942419883357, |
|
"loss": 2.4806, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.5049088359046283, |
|
"grad_norm": 14.08552074432373, |
|
"learning_rate": 0.00010149594070152638, |
|
"loss": 2.7159, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.506778868630201, |
|
"grad_norm": 14.860525131225586, |
|
"learning_rate": 0.00010089758584809979, |
|
"loss": 2.5984, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.5086489013557737, |
|
"grad_norm": 14.785524368286133, |
|
"learning_rate": 0.00010029919885393563, |
|
"loss": 2.6805, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.5105189340813464, |
|
"grad_norm": 14.939064979553223, |
|
"learning_rate": 9.970080114606439e-05, |
|
"loss": 2.4628, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.5123889668069191, |
|
"grad_norm": 13.804859161376953, |
|
"learning_rate": 9.910241415190021e-05, |
|
"loss": 2.847, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.5142589995324918, |
|
"grad_norm": 14.139949798583984, |
|
"learning_rate": 9.850405929847366e-05, |
|
"loss": 2.3347, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.5161290322580645, |
|
"grad_norm": 16.16015625, |
|
"learning_rate": 9.790575801166432e-05, |
|
"loss": 2.7167, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.5179990649836372, |
|
"grad_norm": 17.632705688476562, |
|
"learning_rate": 9.730753171543374e-05, |
|
"loss": 3.3824, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.5198690977092099, |
|
"grad_norm": 20.08257484436035, |
|
"learning_rate": 9.670940183105812e-05, |
|
"loss": 3.1337, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.5217391304347826, |
|
"grad_norm": 16.899017333984375, |
|
"learning_rate": 9.611138977636153e-05, |
|
"loss": 2.8295, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.5236091631603553, |
|
"grad_norm": 16.086267471313477, |
|
"learning_rate": 9.551351696494854e-05, |
|
"loss": 2.2724, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.525479195885928, |
|
"grad_norm": 17.3554744720459, |
|
"learning_rate": 9.491580480543784e-05, |
|
"loss": 3.0389, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.5273492286115007, |
|
"grad_norm": 15.312295913696289, |
|
"learning_rate": 9.431827470069558e-05, |
|
"loss": 1.8464, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.5292192613370734, |
|
"grad_norm": 14.824910163879395, |
|
"learning_rate": 9.372094804706867e-05, |
|
"loss": 2.5675, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.531089294062646, |
|
"grad_norm": 62.370765686035156, |
|
"learning_rate": 9.312384623361909e-05, |
|
"loss": 3.6012, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.5329593267882188, |
|
"grad_norm": 17.69074249267578, |
|
"learning_rate": 9.252699064135758e-05, |
|
"loss": 2.9718, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.5348293595137915, |
|
"grad_norm": 18.726192474365234, |
|
"learning_rate": 9.193040264247829e-05, |
|
"loss": 2.6739, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.5366993922393641, |
|
"grad_norm": 23.5485782623291, |
|
"learning_rate": 9.13341035995933e-05, |
|
"loss": 3.519, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.5385694249649369, |
|
"grad_norm": 23.890811920166016, |
|
"learning_rate": 9.073811486496788e-05, |
|
"loss": 3.0704, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.5404394576905096, |
|
"grad_norm": 21.349260330200195, |
|
"learning_rate": 9.014245777975565e-05, |
|
"loss": 3.1382, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.5423094904160822, |
|
"grad_norm": 27.911243438720703, |
|
"learning_rate": 8.954715367323468e-05, |
|
"loss": 3.3497, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.544179523141655, |
|
"grad_norm": 23.007844924926758, |
|
"learning_rate": 8.89522238620434e-05, |
|
"loss": 2.4557, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.5460495558672277, |
|
"grad_norm": 28.537771224975586, |
|
"learning_rate": 8.835768964941773e-05, |
|
"loss": 3.3786, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.5479195885928003, |
|
"grad_norm": 23.56622314453125, |
|
"learning_rate": 8.776357232442778e-05, |
|
"loss": 3.245, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.5497896213183731, |
|
"grad_norm": 26.669605255126953, |
|
"learning_rate": 8.716989316121578e-05, |
|
"loss": 3.5067, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.5516596540439458, |
|
"grad_norm": 25.95106315612793, |
|
"learning_rate": 8.657667341823448e-05, |
|
"loss": 2.7495, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.5535296867695184, |
|
"grad_norm": 36.05848693847656, |
|
"learning_rate": 8.598393433748541e-05, |
|
"loss": 3.4859, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.5553997194950911, |
|
"grad_norm": 25.490476608276367, |
|
"learning_rate": 8.539169714375885e-05, |
|
"loss": 3.1739, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.5572697522206639, |
|
"grad_norm": 36.27049255371094, |
|
"learning_rate": 8.479998304387329e-05, |
|
"loss": 3.6866, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.5591397849462365, |
|
"grad_norm": 33.86897277832031, |
|
"learning_rate": 8.420881322591642e-05, |
|
"loss": 2.8654, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.5610098176718092, |
|
"grad_norm": 44.89165115356445, |
|
"learning_rate": 8.361820885848624e-05, |
|
"loss": 3.9259, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.562879850397382, |
|
"grad_norm": 20.971372604370117, |
|
"learning_rate": 8.302819108993312e-05, |
|
"loss": 2.7844, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.5647498831229546, |
|
"grad_norm": 24.576387405395508, |
|
"learning_rate": 8.243878104760249e-05, |
|
"loss": 3.0554, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.5666199158485273, |
|
"grad_norm": 25.367847442626953, |
|
"learning_rate": 8.184999983707837e-05, |
|
"loss": 2.9751, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.5684899485741001, |
|
"grad_norm": 20.629751205444336, |
|
"learning_rate": 8.126186854142752e-05, |
|
"loss": 2.8788, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.5703599812996727, |
|
"grad_norm": 17.573223114013672, |
|
"learning_rate": 8.067440822044469e-05, |
|
"loss": 2.3871, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.5722300140252454, |
|
"grad_norm": 15.68101692199707, |
|
"learning_rate": 8.00876399098982e-05, |
|
"loss": 1.9594, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.5741000467508182, |
|
"grad_norm": 13.078360557556152, |
|
"learning_rate": 7.950158462077697e-05, |
|
"loss": 2.3795, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.5759700794763908, |
|
"grad_norm": 11.211087226867676, |
|
"learning_rate": 7.891626333853812e-05, |
|
"loss": 2.7153, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.5778401122019635, |
|
"grad_norm": 9.67572021484375, |
|
"learning_rate": 7.833169702235531e-05, |
|
"loss": 2.4531, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.5797101449275363, |
|
"grad_norm": 12.671576499938965, |
|
"learning_rate": 7.774790660436858e-05, |
|
"loss": 2.2148, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5815801776531089, |
|
"grad_norm": 9.585877418518066, |
|
"learning_rate": 7.716491298893442e-05, |
|
"loss": 2.7332, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.5834502103786816, |
|
"grad_norm": 10.074613571166992, |
|
"learning_rate": 7.658273705187761e-05, |
|
"loss": 2.2678, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.5853202431042543, |
|
"grad_norm": 10.348039627075195, |
|
"learning_rate": 7.600139963974341e-05, |
|
"loss": 2.6394, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.587190275829827, |
|
"grad_norm": 10.6520414352417, |
|
"learning_rate": 7.542092156905123e-05, |
|
"loss": 2.1318, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.5890603085553997, |
|
"grad_norm": 11.883548736572266, |
|
"learning_rate": 7.484132362554915e-05, |
|
"loss": 2.5724, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.5909303412809724, |
|
"grad_norm": 11.804710388183594, |
|
"learning_rate": 7.426262656346978e-05, |
|
"loss": 2.3502, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.5928003740065451, |
|
"grad_norm": 12.366445541381836, |
|
"learning_rate": 7.368485110478685e-05, |
|
"loss": 2.2743, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.5946704067321178, |
|
"grad_norm": 13.19687557220459, |
|
"learning_rate": 7.310801793847344e-05, |
|
"loss": 2.6187, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.5965404394576905, |
|
"grad_norm": 13.92205810546875, |
|
"learning_rate": 7.2532147719761e-05, |
|
"loss": 2.8836, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.5984104721832632, |
|
"grad_norm": 13.191638946533203, |
|
"learning_rate": 7.195726106939974e-05, |
|
"loss": 2.7437, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.6002805049088359, |
|
"grad_norm": 12.879383087158203, |
|
"learning_rate": 7.138337857292034e-05, |
|
"loss": 2.3656, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.6021505376344086, |
|
"grad_norm": 18.539752960205078, |
|
"learning_rate": 7.081052077989667e-05, |
|
"loss": 2.7168, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.6040205703599812, |
|
"grad_norm": 15.435444831848145, |
|
"learning_rate": 7.023870820321017e-05, |
|
"loss": 2.8032, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.605890603085554, |
|
"grad_norm": 15.457225799560547, |
|
"learning_rate": 6.966796131831501e-05, |
|
"loss": 2.9741, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.6077606358111267, |
|
"grad_norm": 14.631234169006348, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 2.782, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.6096306685366993, |
|
"grad_norm": 13.480672836303711, |
|
"learning_rate": 6.85297463341828e-05, |
|
"loss": 2.3114, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.6115007012622721, |
|
"grad_norm": 15.09902572631836, |
|
"learning_rate": 6.796231899212704e-05, |
|
"loss": 2.0404, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.6133707339878448, |
|
"grad_norm": 19.798410415649414, |
|
"learning_rate": 6.739603885476582e-05, |
|
"loss": 3.2275, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.6152407667134174, |
|
"grad_norm": 19.74947738647461, |
|
"learning_rate": 6.683092619944796e-05, |
|
"loss": 2.8039, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.6171107994389902, |
|
"grad_norm": 15.367700576782227, |
|
"learning_rate": 6.626700126171702e-05, |
|
"loss": 2.8833, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6189808321645629, |
|
"grad_norm": 18.47310447692871, |
|
"learning_rate": 6.570428423458687e-05, |
|
"loss": 2.8571, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.6208508648901355, |
|
"grad_norm": 17.92266273498535, |
|
"learning_rate": 6.51427952678185e-05, |
|
"loss": 2.3749, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.6227208976157083, |
|
"grad_norm": 17.863025665283203, |
|
"learning_rate": 6.458255446719854e-05, |
|
"loss": 2.6118, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.624590930341281, |
|
"grad_norm": 16.579729080200195, |
|
"learning_rate": 6.402358189381934e-05, |
|
"loss": 2.2111, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.6264609630668536, |
|
"grad_norm": 21.433856964111328, |
|
"learning_rate": 6.34658975633605e-05, |
|
"loss": 2.8057, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.6283309957924264, |
|
"grad_norm": 17.18570899963379, |
|
"learning_rate": 6.290952144537241e-05, |
|
"loss": 2.7357, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.6302010285179991, |
|
"grad_norm": 27.641077041625977, |
|
"learning_rate": 6.23544734625608e-05, |
|
"loss": 3.0753, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.6320710612435717, |
|
"grad_norm": 25.628257751464844, |
|
"learning_rate": 6.180077349007376e-05, |
|
"loss": 3.2045, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.6339410939691444, |
|
"grad_norm": 18.360332489013672, |
|
"learning_rate": 6.12484413547897e-05, |
|
"loss": 2.6208, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.6358111266947172, |
|
"grad_norm": 52.43937683105469, |
|
"learning_rate": 6.069749683460765e-05, |
|
"loss": 3.2548, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6376811594202898, |
|
"grad_norm": 25.194255828857422, |
|
"learning_rate": 6.014795965773884e-05, |
|
"loss": 2.9652, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.6395511921458625, |
|
"grad_norm": 33.8499870300293, |
|
"learning_rate": 5.9599849502000485e-05, |
|
"loss": 3.4296, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.6414212248714353, |
|
"grad_norm": 33.058990478515625, |
|
"learning_rate": 5.9053185994110974e-05, |
|
"loss": 3.6159, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.6432912575970079, |
|
"grad_norm": 22.76582145690918, |
|
"learning_rate": 5.8507988708987146e-05, |
|
"loss": 2.9352, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.6451612903225806, |
|
"grad_norm": 27.470590591430664, |
|
"learning_rate": 5.796427716904347e-05, |
|
"loss": 3.0844, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.6470313230481534, |
|
"grad_norm": 30.144134521484375, |
|
"learning_rate": 5.7422070843492734e-05, |
|
"loss": 3.3897, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.648901355773726, |
|
"grad_norm": 27.228771209716797, |
|
"learning_rate": 5.6881389147649176e-05, |
|
"loss": 3.0126, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.6507713884992987, |
|
"grad_norm": 30.412782669067383, |
|
"learning_rate": 5.634225144223302e-05, |
|
"loss": 2.8863, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.6526414212248715, |
|
"grad_norm": 43.44699478149414, |
|
"learning_rate": 5.5804677032677354e-05, |
|
"loss": 4.0037, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.6545114539504441, |
|
"grad_norm": 50.50189208984375, |
|
"learning_rate": 5.526868516843673e-05, |
|
"loss": 3.4202, |
|
"step": 350 |
|
}, |
|
    {
      "epoch": 0.6563814866760168,
      "grad_norm": 17.37109375,
      "learning_rate": 5.47342950422981e-05,
      "loss": 2.9548,
      "step": 351
    },
    {
      "epoch": 0.6582515194015895,
      "grad_norm": 18.439983367919922,
      "learning_rate": 5.420152578969326e-05,
      "loss": 2.7266,
      "step": 352
    },
    {
      "epoch": 0.6601215521271622,
      "grad_norm": 20.2562313079834,
      "learning_rate": 5.3670396488013854e-05,
      "loss": 2.5423,
      "step": 353
    },
    {
      "epoch": 0.6619915848527349,
      "grad_norm": 15.055497169494629,
      "learning_rate": 5.3140926155928136e-05,
      "loss": 2.6748,
      "step": 354
    },
    {
      "epoch": 0.6638616175783076,
      "grad_norm": 13.408388137817383,
      "learning_rate": 5.261313375270014e-05,
      "loss": 2.9774,
      "step": 355
    },
    {
      "epoch": 0.6657316503038803,
      "grad_norm": 14.892769813537598,
      "learning_rate": 5.208703817751053e-05,
      "loss": 2.2886,
      "step": 356
    },
    {
      "epoch": 0.667601683029453,
      "grad_norm": 12.354819297790527,
      "learning_rate": 5.156265826877999e-05,
      "loss": 2.8214,
      "step": 357
    },
    {
      "epoch": 0.6694717157550257,
      "grad_norm": 13.169706344604492,
      "learning_rate": 5.1040012803494795e-05,
      "loss": 2.4982,
      "step": 358
    },
    {
      "epoch": 0.6713417484805985,
      "grad_norm": 12.36215591430664,
      "learning_rate": 5.0519120496534044e-05,
      "loss": 3.0511,
      "step": 359
    },
    {
      "epoch": 0.6732117812061711,
      "grad_norm": 11.116745948791504,
      "learning_rate": 5.000000000000002e-05,
      "loss": 2.3823,
      "step": 360
    },
    {
      "epoch": 0.6750818139317438,
      "grad_norm": 11.059001922607422,
      "learning_rate": 4.9482669902549894e-05,
      "loss": 2.3138,
      "step": 361
    },
    {
      "epoch": 0.6769518466573166,
      "grad_norm": 10.290990829467773,
      "learning_rate": 4.896714872873038e-05,
      "loss": 2.0308,
      "step": 362
    },
    {
      "epoch": 0.6788218793828892,
      "grad_norm": 14.33850383758545,
      "learning_rate": 4.845345493831419e-05,
      "loss": 2.6556,
      "step": 363
    },
    {
      "epoch": 0.6806919121084619,
      "grad_norm": 12.50505542755127,
      "learning_rate": 4.794160692563917e-05,
      "loss": 2.6198,
      "step": 364
    },
    {
      "epoch": 0.6825619448340347,
      "grad_norm": 11.454310417175293,
      "learning_rate": 4.743162301894952e-05,
      "loss": 2.4727,
      "step": 365
    },
    {
      "epoch": 0.6844319775596073,
      "grad_norm": 11.56086254119873,
      "learning_rate": 4.692352147973973e-05,
      "loss": 2.1338,
      "step": 366
    },
    {
      "epoch": 0.68630201028518,
      "grad_norm": 11.487324714660645,
      "learning_rate": 4.6417320502100316e-05,
      "loss": 2.4042,
      "step": 367
    },
    {
      "epoch": 0.6881720430107527,
      "grad_norm": 11.415949821472168,
      "learning_rate": 4.591303821206673e-05,
      "loss": 2.6558,
      "step": 368
    },
    {
      "epoch": 0.6900420757363254,
      "grad_norm": 12.308177947998047,
      "learning_rate": 4.541069266696984e-05,
      "loss": 2.3123,
      "step": 369
    },
    {
      "epoch": 0.6919121084618981,
      "grad_norm": 11.585166931152344,
      "learning_rate": 4.491030185478976e-05,
      "loss": 2.7433,
      "step": 370
    },
    {
      "epoch": 0.6937821411874708,
      "grad_norm": 12.77014446258545,
      "learning_rate": 4.441188369351157e-05,
      "loss": 2.646,
      "step": 371
    },
    {
      "epoch": 0.6956521739130435,
      "grad_norm": 12.757980346679688,
      "learning_rate": 4.391545603048358e-05,
      "loss": 2.702,
      "step": 372
    },
    {
      "epoch": 0.6975222066386162,
      "grad_norm": 15.349445343017578,
      "learning_rate": 4.3421036641778556e-05,
      "loss": 3.0098,
      "step": 373
    },
    {
      "epoch": 0.6993922393641889,
      "grad_norm": 15.5166597366333,
      "learning_rate": 4.2928643231556844e-05,
      "loss": 3.2723,
      "step": 374
    },
    {
      "epoch": 0.7012622720897616,
      "grad_norm": 13.189452171325684,
      "learning_rate": 4.2438293431432665e-05,
      "loss": 2.363,
      "step": 375
    },
    {
      "epoch": 0.7031323048153343,
      "grad_norm": 17.275712966918945,
      "learning_rate": 4.195000479984265e-05,
      "loss": 3.0557,
      "step": 376
    },
    {
      "epoch": 0.705002337540907,
      "grad_norm": 16.319204330444336,
      "learning_rate": 4.146379482141723e-05,
      "loss": 2.6115,
      "step": 377
    },
    {
      "epoch": 0.7068723702664796,
      "grad_norm": 14.946385383605957,
      "learning_rate": 4.097968090635439e-05,
      "loss": 2.6765,
      "step": 378
    },
    {
      "epoch": 0.7087424029920524,
      "grad_norm": 19.0872745513916,
      "learning_rate": 4.049768038979631e-05,
      "loss": 2.2803,
      "step": 379
    },
    {
      "epoch": 0.7106124357176251,
      "grad_norm": 15.21898078918457,
      "learning_rate": 4.001781053120863e-05,
      "loss": 2.4357,
      "step": 380
    },
    {
      "epoch": 0.7124824684431977,
      "grad_norm": 17.521011352539062,
      "learning_rate": 3.954008851376252e-05,
      "loss": 2.6634,
      "step": 381
    },
    {
      "epoch": 0.7143525011687705,
      "grad_norm": 16.630462646484375,
      "learning_rate": 3.90645314437192e-05,
      "loss": 2.2544,
      "step": 382
    },
    {
      "epoch": 0.7162225338943432,
      "grad_norm": 18.478824615478516,
      "learning_rate": 3.859115634981748e-05,
      "loss": 2.6528,
      "step": 383
    },
    {
      "epoch": 0.7180925666199158,
      "grad_norm": 22.990461349487305,
      "learning_rate": 3.811998018266416e-05,
      "loss": 2.8017,
      "step": 384
    },
    {
      "epoch": 0.7199625993454886,
      "grad_norm": 23.46438217163086,
      "learning_rate": 3.7651019814126654e-05,
      "loss": 3.5866,
      "step": 385
    },
    {
      "epoch": 0.7218326320710613,
      "grad_norm": 19.73815155029297,
      "learning_rate": 3.718429203672936e-05,
      "loss": 2.6817,
      "step": 386
    },
    {
      "epoch": 0.7237026647966339,
      "grad_norm": 20.134384155273438,
      "learning_rate": 3.671981356305191e-05,
      "loss": 3.4092,
      "step": 387
    },
    {
      "epoch": 0.7255726975222067,
      "grad_norm": 19.134763717651367,
      "learning_rate": 3.6257601025131026e-05,
      "loss": 2.5706,
      "step": 388
    },
    {
      "epoch": 0.7274427302477794,
      "grad_norm": 19.289335250854492,
      "learning_rate": 3.57976709738648e-05,
      "loss": 3.1928,
      "step": 389
    },
    {
      "epoch": 0.729312762973352,
      "grad_norm": 21.083354949951172,
      "learning_rate": 3.534003987842005e-05,
      "loss": 3.1004,
      "step": 390
    },
    {
      "epoch": 0.7311827956989247,
      "grad_norm": 19.628206253051758,
      "learning_rate": 3.488472412564264e-05,
      "loss": 3.0638,
      "step": 391
    },
    {
      "epoch": 0.7330528284244975,
      "grad_norm": 25.418609619140625,
      "learning_rate": 3.4431740019470774e-05,
      "loss": 3.3569,
      "step": 392
    },
    {
      "epoch": 0.7349228611500701,
      "grad_norm": 24.504236221313477,
      "learning_rate": 3.398110378035098e-05,
      "loss": 3.3934,
      "step": 393
    },
    {
      "epoch": 0.7367928938756428,
      "grad_norm": 32.00563430786133,
      "learning_rate": 3.353283154465746e-05,
      "loss": 3.3888,
      "step": 394
    },
    {
      "epoch": 0.7386629266012156,
      "grad_norm": 25.788663864135742,
      "learning_rate": 3.308693936411421e-05,
      "loss": 3.7214,
      "step": 395
    },
    {
      "epoch": 0.7405329593267882,
      "grad_norm": 29.49918556213379,
      "learning_rate": 3.264344320522024e-05,
      "loss": 3.6635,
      "step": 396
    },
    {
      "epoch": 0.7424029920523609,
      "grad_norm": 52.400569915771484,
      "learning_rate": 3.220235894867794e-05,
      "loss": 3.574,
      "step": 397
    },
    {
      "epoch": 0.7442730247779337,
      "grad_norm": 31.379642486572266,
      "learning_rate": 3.1763702388824214e-05,
      "loss": 3.8696,
      "step": 398
    },
    {
      "epoch": 0.7461430575035063,
      "grad_norm": 33.01647186279297,
      "learning_rate": 3.132748923306522e-05,
      "loss": 3.5235,
      "step": 399
    },
    {
      "epoch": 0.748013090229079,
      "grad_norm": 54.04301071166992,
      "learning_rate": 3.089373510131354e-05,
      "loss": 4.3509,
      "step": 400
    },
    {
      "epoch": 0.7498831229546518,
      "grad_norm": 9.012869834899902,
      "learning_rate": 3.0462455525429257e-05,
      "loss": 2.4901,
      "step": 401
    },
    {
      "epoch": 0.7517531556802244,
      "grad_norm": 10.679055213928223,
      "learning_rate": 3.0033665948663448e-05,
      "loss": 2.0973,
      "step": 402
    },
    {
      "epoch": 0.7517531556802244,
      "eval_loss": 2.625810146331787,
      "eval_runtime": 12.8607,
      "eval_samples_per_second": 17.573,
      "eval_steps_per_second": 8.786,
      "step": 402
    },
    {
      "epoch": 0.7536231884057971,
      "grad_norm": 9.760929107666016,
      "learning_rate": 2.960738172510551e-05,
      "loss": 2.5254,
      "step": 403
    },
    {
      "epoch": 0.7554932211313699,
      "grad_norm": 8.057788848876953,
      "learning_rate": 2.9183618119133062e-05,
      "loss": 1.8935,
      "step": 404
    },
    {
      "epoch": 0.7573632538569425,
      "grad_norm": 9.041131019592285,
      "learning_rate": 2.876239030486554e-05,
      "loss": 2.5203,
      "step": 405
    },
    {
      "epoch": 0.7592332865825152,
      "grad_norm": 9.86754322052002,
      "learning_rate": 2.8343713365620772e-05,
      "loss": 2.1501,
      "step": 406
    },
    {
      "epoch": 0.7611033193080879,
      "grad_norm": 9.489944458007812,
      "learning_rate": 2.7927602293375e-05,
      "loss": 2.1061,
      "step": 407
    },
    {
      "epoch": 0.7629733520336606,
      "grad_norm": 9.308277130126953,
      "learning_rate": 2.751407198822583e-05,
      "loss": 1.8263,
      "step": 408
    },
    {
      "epoch": 0.7648433847592333,
      "grad_norm": 10.4146728515625,
      "learning_rate": 2.7103137257858868e-05,
      "loss": 2.5591,
      "step": 409
    },
    {
      "epoch": 0.766713417484806,
      "grad_norm": 10.501506805419922,
      "learning_rate": 2.669481281701739e-05,
      "loss": 2.7704,
      "step": 410
    },
    {
      "epoch": 0.7685834502103787,
      "grad_norm": 12.172067642211914,
      "learning_rate": 2.6289113286975485e-05,
      "loss": 2.5226,
      "step": 411
    },
    {
      "epoch": 0.7704534829359514,
      "grad_norm": 12.657646179199219,
      "learning_rate": 2.5886053195014538e-05,
      "loss": 2.7101,
      "step": 412
    },
    {
      "epoch": 0.7723235156615241,
      "grad_norm": 11.94274616241455,
      "learning_rate": 2.5485646973902865e-05,
      "loss": 2.6979,
      "step": 413
    },
    {
      "epoch": 0.7741935483870968,
      "grad_norm": 11.538039207458496,
      "learning_rate": 2.508790896137918e-05,
      "loss": 2.7357,
      "step": 414
    },
    {
      "epoch": 0.7760635811126695,
      "grad_norm": 11.735588073730469,
      "learning_rate": 2.4692853399638917e-05,
      "loss": 2.7582,
      "step": 415
    },
    {
      "epoch": 0.7779336138382422,
      "grad_norm": 11.011154174804688,
      "learning_rate": 2.4300494434824373e-05,
      "loss": 2.5606,
      "step": 416
    },
    {
      "epoch": 0.7798036465638148,
      "grad_norm": 12.485893249511719,
      "learning_rate": 2.391084611651816e-05,
      "loss": 2.2526,
      "step": 417
    },
    {
      "epoch": 0.7816736792893876,
      "grad_norm": 13.266201972961426,
      "learning_rate": 2.352392239724016e-05,
      "loss": 2.7745,
      "step": 418
    },
    {
      "epoch": 0.7835437120149603,
      "grad_norm": 12.729734420776367,
      "learning_rate": 2.3139737131947824e-05,
      "loss": 2.1154,
      "step": 419
    },
    {
      "epoch": 0.7854137447405329,
      "grad_norm": 11.948831558227539,
      "learning_rate": 2.275830407754006e-05,
      "loss": 2.3556,
      "step": 420
    },
    {
      "epoch": 0.7872837774661057,
      "grad_norm": 15.799360275268555,
      "learning_rate": 2.237963689236472e-05,
      "loss": 3.1533,
      "step": 421
    },
    {
      "epoch": 0.7891538101916784,
      "grad_norm": 14.989856719970703,
      "learning_rate": 2.200374913572939e-05,
      "loss": 2.8148,
      "step": 422
    },
    {
      "epoch": 0.791023842917251,
      "grad_norm": 17.03190040588379,
      "learning_rate": 2.163065426741603e-05,
      "loss": 2.8717,
      "step": 423
    },
    {
      "epoch": 0.7928938756428238,
      "grad_norm": 16.29947280883789,
      "learning_rate": 2.1260365647198798e-05,
      "loss": 3.1721,
      "step": 424
    },
    {
      "epoch": 0.7947639083683965,
      "grad_norm": 14.097606658935547,
      "learning_rate": 2.0892896534365904e-05,
      "loss": 2.7177,
      "step": 425
    },
    {
      "epoch": 0.7966339410939691,
      "grad_norm": 18.629955291748047,
      "learning_rate": 2.0528260087244487e-05,
      "loss": 3.0088,
      "step": 426
    },
    {
      "epoch": 0.7985039738195419,
      "grad_norm": 14.538827896118164,
      "learning_rate": 2.016646936272987e-05,
      "loss": 2.5514,
      "step": 427
    },
    {
      "epoch": 0.8003740065451146,
      "grad_norm": 13.943222045898438,
      "learning_rate": 1.9807537315817604e-05,
      "loss": 2.4046,
      "step": 428
    },
    {
      "epoch": 0.8022440392706872,
      "grad_norm": 17.54332160949707,
      "learning_rate": 1.9451476799139935e-05,
      "loss": 3.5145,
      "step": 429
    },
    {
      "epoch": 0.80411407199626,
      "grad_norm": 16.975404739379883,
      "learning_rate": 1.9098300562505266e-05,
      "loss": 2.6382,
      "step": 430
    },
    {
      "epoch": 0.8059841047218327,
      "grad_norm": 18.486377716064453,
      "learning_rate": 1.8748021252441817e-05,
      "loss": 2.5148,
      "step": 431
    },
    {
      "epoch": 0.8078541374474053,
      "grad_norm": 15.601974487304688,
      "learning_rate": 1.8400651411744685e-05,
      "loss": 2.418,
      "step": 432
    },
    {
      "epoch": 0.809724170172978,
      "grad_norm": 17.163917541503906,
      "learning_rate": 1.805620347902681e-05,
      "loss": 2.592,
      "step": 433
    },
    {
      "epoch": 0.8115942028985508,
      "grad_norm": 18.376161575317383,
      "learning_rate": 1.771468978827343e-05,
      "loss": 3.3599,
      "step": 434
    },
    {
      "epoch": 0.8134642356241234,
      "grad_norm": 20.212865829467773,
      "learning_rate": 1.7376122568400532e-05,
      "loss": 3.021,
      "step": 435
    },
    {
      "epoch": 0.8153342683496961,
      "grad_norm": 26.279674530029297,
      "learning_rate": 1.7040513942816906e-05,
      "loss": 3.0855,
      "step": 436
    },
    {
      "epoch": 0.8172043010752689,
      "grad_norm": 18.360366821289062,
      "learning_rate": 1.6707875928990058e-05,
      "loss": 2.2329,
      "step": 437
    },
    {
      "epoch": 0.8190743338008415,
      "grad_norm": 25.305208206176758,
      "learning_rate": 1.6378220438015933e-05,
      "loss": 2.7794,
      "step": 438
    },
    {
      "epoch": 0.8209443665264142,
      "grad_norm": 19.780479431152344,
      "learning_rate": 1.6051559274192275e-05,
      "loss": 2.8426,
      "step": 439
    },
    {
      "epoch": 0.822814399251987,
      "grad_norm": 24.164377212524414,
      "learning_rate": 1.5727904134596083e-05,
      "loss": 3.2383,
      "step": 440
    },
    {
      "epoch": 0.8246844319775596,
      "grad_norm": 24.38163185119629,
      "learning_rate": 1.540726660866466e-05,
      "loss": 3.5773,
      "step": 441
    },
    {
      "epoch": 0.8265544647031323,
      "grad_norm": 22.310827255249023,
      "learning_rate": 1.5089658177780653e-05,
      "loss": 3.2983,
      "step": 442
    },
    {
      "epoch": 0.828424497428705,
      "grad_norm": 54.48847579956055,
      "learning_rate": 1.477509021486091e-05,
      "loss": 3.3077,
      "step": 443
    },
    {
      "epoch": 0.8302945301542777,
      "grad_norm": 23.941181182861328,
      "learning_rate": 1.4463573983949341e-05,
      "loss": 3.283,
      "step": 444
    },
    {
      "epoch": 0.8321645628798504,
      "grad_norm": 23.76746940612793,
      "learning_rate": 1.415512063981339e-05,
      "loss": 3.4818,
      "step": 445
    },
    {
      "epoch": 0.8340345956054231,
      "grad_norm": 26.566600799560547,
      "learning_rate": 1.3849741227544777e-05,
      "loss": 2.6301,
      "step": 446
    },
    {
      "epoch": 0.8359046283309958,
      "grad_norm": 30.18052864074707,
      "learning_rate": 1.3547446682163889e-05,
      "loss": 3.8662,
      "step": 447
    },
    {
      "epoch": 0.8377746610565685,
      "grad_norm": 36.08682632446289,
      "learning_rate": 1.3248247828228245e-05,
      "loss": 3.9059,
      "step": 448
    },
    {
      "epoch": 0.8396446937821412,
      "grad_norm": 39.56899642944336,
      "learning_rate": 1.2952155379444975e-05,
      "loss": 3.1025,
      "step": 449
    },
    {
      "epoch": 0.8415147265077139,
      "grad_norm": 39.47623062133789,
      "learning_rate": 1.2659179938287035e-05,
      "loss": 4.5911,
      "step": 450
    },
    {
      "epoch": 0.8433847592332866,
      "grad_norm": 8.356927871704102,
      "learning_rate": 1.2369331995613665e-05,
      "loss": 2.3298,
      "step": 451
    },
    {
      "epoch": 0.8452547919588593,
      "grad_norm": 7.952692985534668,
      "learning_rate": 1.2082621930294635e-05,
      "loss": 2.4439,
      "step": 452
    },
    {
      "epoch": 0.847124824684432,
      "grad_norm": 9.349421501159668,
      "learning_rate": 1.1799060008838791e-05,
      "loss": 2.6627,
      "step": 453
    },
    {
      "epoch": 0.8489948574100047,
      "grad_norm": 9.868120193481445,
      "learning_rate": 1.151865638502615e-05,
      "loss": 2.1623,
      "step": 454
    },
    {
      "epoch": 0.8508648901355774,
      "grad_norm": 9.13068675994873,
      "learning_rate": 1.124142109954459e-05,
      "loss": 1.9143,
      "step": 455
    },
    {
      "epoch": 0.85273492286115,
      "grad_norm": 7.523913860321045,
      "learning_rate": 1.0967364079630115e-05,
      "loss": 1.7452,
      "step": 456
    },
    {
      "epoch": 0.8546049555867228,
      "grad_norm": 8.57178783416748,
      "learning_rate": 1.069649513871147e-05,
      "loss": 1.6952,
      "step": 457
    },
    {
      "epoch": 0.8564749883122955,
      "grad_norm": 9.974442481994629,
      "learning_rate": 1.042882397605871e-05,
      "loss": 2.4002,
      "step": 458
    },
    {
      "epoch": 0.8583450210378681,
      "grad_norm": 9.599936485290527,
      "learning_rate": 1.0164360176435961e-05,
      "loss": 2.5942,
      "step": 459
    },
    {
      "epoch": 0.8602150537634409,
      "grad_norm": 9.855104446411133,
      "learning_rate": 9.903113209758096e-06,
      "loss": 1.904,
      "step": 460
    },
    {
      "epoch": 0.8620850864890136,
      "grad_norm": 11.599543571472168,
      "learning_rate": 9.6450924307517e-06,
      "loss": 1.6992,
      "step": 461
    },
    {
      "epoch": 0.8639551192145862,
      "grad_norm": 9.953221321105957,
      "learning_rate": 9.39030707862013e-06,
      "loss": 2.1692,
      "step": 462
    },
    {
      "epoch": 0.865825151940159,
      "grad_norm": 12.537870407104492,
      "learning_rate": 9.138766276712552e-06,
      "loss": 2.1261,
      "step": 463
    },
    {
      "epoch": 0.8676951846657317,
      "grad_norm": 12.677627563476562,
      "learning_rate": 8.890479032197464e-06,
      "loss": 2.4732,
      "step": 464
    },
    {
      "epoch": 0.8695652173913043,
      "grad_norm": 12.54604434967041,
      "learning_rate": 8.645454235739903e-06,
      "loss": 2.1939,
      "step": 465
    },
    {
      "epoch": 0.8714352501168771,
      "grad_norm": 11.750197410583496,
      "learning_rate": 8.403700661183355e-06,
      "loss": 2.1129,
      "step": 466
    },
    {
      "epoch": 0.8733052828424498,
      "grad_norm": 11.671151161193848,
      "learning_rate": 8.165226965235328e-06,
      "loss": 2.4747,
      "step": 467
    },
    {
      "epoch": 0.8751753155680224,
      "grad_norm": 14.79646110534668,
      "learning_rate": 7.930041687157607e-06,
      "loss": 2.1669,
      "step": 468
    },
    {
      "epoch": 0.8770453482935952,
      "grad_norm": 11.03843879699707,
      "learning_rate": 7.698153248460271e-06,
      "loss": 2.1497,
      "step": 469
    },
    {
      "epoch": 0.8789153810191679,
      "grad_norm": 12.181533813476562,
      "learning_rate": 7.46956995260033e-06,
      "loss": 2.5349,
      "step": 470
    },
    {
      "epoch": 0.8807854137447405,
      "grad_norm": 27.885995864868164,
      "learning_rate": 7.244299984684233e-06,
      "loss": 2.6204,
      "step": 471
    },
    {
      "epoch": 0.8826554464703132,
      "grad_norm": 14.558568000793457,
      "learning_rate": 7.022351411174866e-06,
      "loss": 2.2863,
      "step": 472
    },
    {
      "epoch": 0.884525479195886,
      "grad_norm": 14.143083572387695,
      "learning_rate": 6.803732179602684e-06,
      "loss": 2.2255,
      "step": 473
    },
    {
      "epoch": 0.8863955119214586,
      "grad_norm": 15.334390640258789,
      "learning_rate": 6.5884501182811084e-06,
      "loss": 2.7062,
      "step": 474
    },
    {
      "epoch": 0.8882655446470313,
      "grad_norm": 14.617964744567871,
      "learning_rate": 6.37651293602628e-06,
      "loss": 2.4742,
      "step": 475
    },
    {
      "epoch": 0.8901355773726041,
      "grad_norm": 13.574972152709961,
      "learning_rate": 6.167928221880926e-06,
      "loss": 2.6229,
      "step": 476
    },
    {
      "epoch": 0.8920056100981767,
      "grad_norm": 14.283699989318848,
      "learning_rate": 5.9627034448426545e-06,
      "loss": 2.2862,
      "step": 477
    },
    {
      "epoch": 0.8938756428237494,
      "grad_norm": 16.076574325561523,
      "learning_rate": 5.760845953596527e-06,
      "loss": 2.9803,
      "step": 478
    },
    {
      "epoch": 0.8957456755493222,
      "grad_norm": 18.35474967956543,
      "learning_rate": 5.562362976251901e-06,
      "loss": 2.6646,
      "step": 479
    },
    {
      "epoch": 0.8976157082748948,
      "grad_norm": 16.39321517944336,
      "learning_rate": 5.367261620083575e-06,
      "loss": 2.596,
      "step": 480
    },
    {
      "epoch": 0.8994857410004675,
      "grad_norm": 16.686824798583984,
      "learning_rate": 5.175548871277358e-06,
      "loss": 2.5261,
      "step": 481
    },
    {
      "epoch": 0.9013557737260403,
      "grad_norm": 16.57494354248047,
      "learning_rate": 4.9872315946798535e-06,
      "loss": 2.3015,
      "step": 482
    },
    {
      "epoch": 0.9032258064516129,
      "grad_norm": 17.73284339904785,
      "learning_rate": 4.80231653355262e-06,
      "loss": 2.7761,
      "step": 483
    },
    {
      "epoch": 0.9050958391771856,
      "grad_norm": 16.7850284576416,
      "learning_rate": 4.620810309330803e-06,
      "loss": 2.2855,
      "step": 484
    },
    {
      "epoch": 0.9069658719027583,
      "grad_norm": 18.645483016967773,
      "learning_rate": 4.442719421385922e-06,
      "loss": 3.0409,
      "step": 485
    },
    {
      "epoch": 0.908835904628331,
      "grad_norm": 17.9419002532959,
      "learning_rate": 4.268050246793276e-06,
      "loss": 2.664,
      "step": 486
    },
    {
      "epoch": 0.9107059373539037,
      "grad_norm": 17.279287338256836,
      "learning_rate": 4.096809040103444e-06,
      "loss": 2.2947,
      "step": 487
    },
    {
      "epoch": 0.9125759700794764,
      "grad_norm": 25.570619583129883,
      "learning_rate": 3.9290019331184145e-06,
      "loss": 2.6834,
      "step": 488
    },
    {
      "epoch": 0.9144460028050491,
      "grad_norm": 22.238800048828125,
      "learning_rate": 3.7646349346719955e-06,
      "loss": 3.1555,
      "step": 489
    },
    {
      "epoch": 0.9163160355306218,
      "grad_norm": 18.883892059326172,
      "learning_rate": 3.6037139304146762e-06,
      "loss": 2.4276,
      "step": 490
    },
    {
      "epoch": 0.9181860682561945,
      "grad_norm": 20.898273468017578,
      "learning_rate": 3.446244682602817e-06,
      "loss": 3.0974,
      "step": 491
    },
    {
      "epoch": 0.9200561009817672,
      "grad_norm": 19.29071044921875,
      "learning_rate": 3.292232829892361e-06,
      "loss": 2.7486,
      "step": 492
    },
    {
      "epoch": 0.9219261337073399,
      "grad_norm": 23.598310470581055,
      "learning_rate": 3.1416838871368924e-06,
      "loss": 3.4687,
      "step": 493
    },
    {
      "epoch": 0.9237961664329126,
      "grad_norm": 22.534082412719727,
      "learning_rate": 2.9946032451902194e-06,
      "loss": 3.0483,
      "step": 494
    },
    {
      "epoch": 0.9256661991584852,
      "grad_norm": 20.772890090942383,
      "learning_rate": 2.8509961707132494e-06,
      "loss": 2.8497,
      "step": 495
    },
    {
      "epoch": 0.927536231884058,
      "grad_norm": 27.105192184448242,
      "learning_rate": 2.7108678059855065e-06,
      "loss": 3.8906,
      "step": 496
    },
    {
      "epoch": 0.9294062646096307,
      "grad_norm": 29.60268211364746,
      "learning_rate": 2.5742231687209017e-06,
      "loss": 4.1652,
      "step": 497
    },
    {
      "epoch": 0.9312762973352033,
      "grad_norm": 30.220417022705078,
      "learning_rate": 2.4410671518880655e-06,
      "loss": 3.4544,
      "step": 498
    },
    {
      "epoch": 0.9331463300607761,
      "grad_norm": 34.319881439208984,
      "learning_rate": 2.311404523535243e-06,
      "loss": 3.7756,
      "step": 499
    },
    {
      "epoch": 0.9350163627863488,
      "grad_norm": 48.59423065185547,
      "learning_rate": 2.1852399266194314e-06,
      "loss": 4.6194,
      "step": 500
    },
    {
      "epoch": 0.9368863955119214,
      "grad_norm": 7.484718322753906,
      "learning_rate": 2.062577878840244e-06,
      "loss": 2.3524,
      "step": 501
    },
    {
      "epoch": 0.9387564282374942,
      "grad_norm": 8.843035697937012,
      "learning_rate": 1.9434227724779984e-06,
      "loss": 2.4799,
      "step": 502
    },
    {
      "epoch": 0.9406264609630669,
      "grad_norm": 7.7384161949157715,
      "learning_rate": 1.8277788742365965e-06,
      "loss": 1.8435,
      "step": 503
    },
    {
      "epoch": 0.9424964936886395,
      "grad_norm": 9.377725601196289,
      "learning_rate": 1.7156503250905898e-06,
      "loss": 2.1838,
      "step": 504
    },
    {
      "epoch": 0.9443665264142123,
      "grad_norm": 9.2131929397583,
      "learning_rate": 1.6070411401370334e-06,
      "loss": 2.0863,
      "step": 505
    },
    {
      "epoch": 0.946236559139785,
      "grad_norm": 9.792388916015625,
      "learning_rate": 1.501955208451633e-06,
      "loss": 2.5053,
      "step": 506
    },
    {
      "epoch": 0.9481065918653576,
      "grad_norm": 10.286921501159668,
      "learning_rate": 1.400396292949513e-06,
      "loss": 1.9311,
      "step": 507
    },
    {
      "epoch": 0.9499766245909304,
      "grad_norm": 10.273000717163086,
      "learning_rate": 1.3023680302504338e-06,
      "loss": 2.4607,
      "step": 508
    },
    {
      "epoch": 0.9518466573165031,
      "grad_norm": 9.218442916870117,
      "learning_rate": 1.207873930548653e-06,
      "loss": 1.5958,
      "step": 509
    },
    {
      "epoch": 0.9537166900420757,
      "grad_norm": 12.184614181518555,
      "learning_rate": 1.1169173774871478e-06,
      "loss": 2.7458,
      "step": 510
    },
    {
      "epoch": 0.9555867227676484,
      "grad_norm": 12.237886428833008,
      "learning_rate": 1.0295016280365112e-06,
      "loss": 2.8767,
      "step": 511
    },
    {
      "epoch": 0.9574567554932212,
      "grad_norm": 14.358260154724121,
      "learning_rate": 9.456298123782902e-07,
      "loss": 2.5067,
      "step": 512
    },
    {
      "epoch": 0.9593267882187938,
      "grad_norm": 15.50924301147461,
      "learning_rate": 8.65304933792932e-07,
      "loss": 2.7335,
      "step": 513
    },
    {
      "epoch": 0.9611968209443665,
      "grad_norm": 12.935508728027344,
      "learning_rate": 7.885298685522235e-07,
      "loss": 2.3211,
      "step": 514
    },
    {
      "epoch": 0.9630668536699393,
      "grad_norm": 13.291459083557129,
      "learning_rate": 7.153073658162646e-07,
      "loss": 2.5455,
      "step": 515
    },
    {
      "epoch": 0.9649368863955119,
      "grad_norm": 17.141666412353516,
      "learning_rate": 6.456400475351232e-07,
      "loss": 3.0588,
      "step": 516
    },
    {
      "epoch": 0.9668069191210846,
      "grad_norm": 16.180906295776367,
      "learning_rate": 5.795304083548559e-07,
      "loss": 2.8284,
      "step": 517
    },
    {
      "epoch": 0.9686769518466574,
      "grad_norm": 13.24155044555664,
      "learning_rate": 5.169808155281786e-07,
      "loss": 2.2862,
      "step": 518
    },
    {
      "epoch": 0.97054698457223,
      "grad_norm": 15.022529602050781,
      "learning_rate": 4.579935088298015e-07,
      "loss": 2.5049,
      "step": 519
    },
    {
      "epoch": 0.9724170172978027,
      "grad_norm": 18.232423782348633,
      "learning_rate": 4.025706004760932e-07,
      "loss": 3.3347,
      "step": 520
    },
    {
      "epoch": 0.9742870500233755,
      "grad_norm": 15.241950988769531,
      "learning_rate": 3.50714075049563e-07,
      "loss": 2.7126,
      "step": 521
    },
    {
      "epoch": 0.9761570827489481,
      "grad_norm": 15.458436965942383,
      "learning_rate": 3.0242578942771825e-07,
      "loss": 2.6416,
      "step": 522
    },
    {
      "epoch": 0.9780271154745208,
      "grad_norm": 18.882591247558594,
      "learning_rate": 2.577074727165951e-07,
      "loss": 2.461,
      "step": 523
    },
    {
      "epoch": 0.9798971482000935,
      "grad_norm": 17.970928192138672,
      "learning_rate": 2.1656072618887468e-07,
      "loss": 2.9272,
      "step": 524
    },
    {
      "epoch": 0.9817671809256662,
      "grad_norm": 18.435932159423828,
      "learning_rate": 1.7898702322648453e-07,
      "loss": 2.8594,
      "step": 525
    },
    {
      "epoch": 0.9836372136512389,
      "grad_norm": 19.39750099182129,
      "learning_rate": 1.449877092679075e-07,
      "loss": 2.1917,
      "step": 526
    },
    {
      "epoch": 0.9855072463768116,
      "grad_norm": 20.87994384765625,
      "learning_rate": 1.1456400175994252e-07,
      "loss": 3.1765,
      "step": 527
    },
    {
      "epoch": 0.9873772791023843,
      "grad_norm": 20.146398544311523,
      "learning_rate": 8.771699011416168e-08,
      "loss": 2.4422,
      "step": 528
    },
    {
      "epoch": 0.989247311827957,
      "grad_norm": 21.26473045349121,
      "learning_rate": 6.444763566786361e-08,
      "loss": 2.6351,
      "step": 529
    },
    {
      "epoch": 0.9911173445535297,
      "grad_norm": 24.013477325439453,
      "learning_rate": 4.475677164966774e-08,
      "loss": 3.721,
      "step": 530
    },
    {
      "epoch": 0.9929873772791024,
      "grad_norm": 29.331228256225586,
      "learning_rate": 2.86451031496604e-08,
      "loss": 3.3834,
      "step": 531
    },
    {
      "epoch": 0.9948574100046751,
      "grad_norm": 20.945201873779297,
      "learning_rate": 1.6113207094181626e-08,
      "loss": 2.846,
      "step": 532
    },
    {
      "epoch": 0.9967274427302478,
      "grad_norm": 22.861347198486328,
      "learning_rate": 7.161532225130607e-09,
      "loss": 2.9253,
      "step": 533
    },
    {
      "epoch": 0.9985974754558204,
      "grad_norm": 30.596744537353516,
      "learning_rate": 1.7903990839229779e-09,
      "loss": 4.0554,
      "step": 534
    },
    {
      "epoch": 1.0014025245441796,
      "grad_norm": 14.026424407958984,
      "learning_rate": 0.0,
      "loss": 2.7318,
      "step": 535
    }
  ],
  "logging_steps": 1,
  "max_steps": 535,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 134,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.8753566709645312e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}