{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3328340822100183, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006656681644200366, "grad_norm": NaN, "learning_rate": 0.0, "loss": 1.9001, "step": 1 }, { "epoch": 0.0013313363288400732, "grad_norm": NaN, "learning_rate": 0.0, "loss": 1.1836, "step": 2 }, { "epoch": 0.00199700449326011, "grad_norm": Infinity, "learning_rate": 0.0, "loss": 2.0432, "step": 3 }, { "epoch": 0.0026626726576801465, "grad_norm": Infinity, "learning_rate": 0.0, "loss": 2.7364, "step": 4 }, { "epoch": 0.003328340822100183, "grad_norm": 305.05010986328125, "learning_rate": 0.0001, "loss": 2.7278, "step": 5 }, { "epoch": 0.00399400898652022, "grad_norm": 195.2991180419922, "learning_rate": 0.0002, "loss": 1.5935, "step": 6 }, { "epoch": 0.004659677150940256, "grad_norm": 135.4779815673828, "learning_rate": 0.00019995559502664298, "loss": 1.558, "step": 7 }, { "epoch": 0.005325345315360293, "grad_norm": 175.23486328125, "learning_rate": 0.00019991119005328598, "loss": 1.8278, "step": 8 }, { "epoch": 0.005991013479780329, "grad_norm": 70.18768310546875, "learning_rate": 0.00019986678507992895, "loss": 1.2821, "step": 9 }, { "epoch": 0.006656681644200366, "grad_norm": 91.9547348022461, "learning_rate": 0.00019982238010657195, "loss": 1.3009, "step": 10 }, { "epoch": 0.007322349808620403, "grad_norm": 140.80667114257812, "learning_rate": 0.00019977797513321492, "loss": 1.4739, "step": 11 }, { "epoch": 0.00798801797304044, "grad_norm": 117.13510131835938, "learning_rate": 0.00019973357015985792, "loss": 1.0922, "step": 12 }, { "epoch": 0.008653686137460476, "grad_norm": 87.54082489013672, "learning_rate": 0.0001996891651865009, "loss": 1.6633, "step": 13 }, { "epoch": 0.009319354301880512, "grad_norm": 178.7975616455078, "learning_rate": 0.0001996447602131439, "loss": 1.7614, "step": 14 }, { "epoch": 
0.00998502246630055, "grad_norm": 199.108154296875, "learning_rate": 0.00019960035523978686, "loss": 1.4873, "step": 15 }, { "epoch": 0.010650690630720586, "grad_norm": 97.36341857910156, "learning_rate": 0.00019955595026642986, "loss": 1.2921, "step": 16 }, { "epoch": 0.011316358795140622, "grad_norm": 145.5021209716797, "learning_rate": 0.00019951154529307283, "loss": 1.4667, "step": 17 }, { "epoch": 0.011982026959560658, "grad_norm": 163.09628295898438, "learning_rate": 0.00019946714031971583, "loss": 1.4959, "step": 18 }, { "epoch": 0.012647695123980696, "grad_norm": 181.53123474121094, "learning_rate": 0.0001994227353463588, "loss": 1.4931, "step": 19 }, { "epoch": 0.013313363288400732, "grad_norm": 134.01087951660156, "learning_rate": 0.00019937833037300177, "loss": 1.1797, "step": 20 }, { "epoch": 0.013979031452820768, "grad_norm": 138.4213409423828, "learning_rate": 0.00019933392539964477, "loss": 1.3021, "step": 21 }, { "epoch": 0.014644699617240806, "grad_norm": 94.12645721435547, "learning_rate": 0.00019928952042628774, "loss": 1.2099, "step": 22 }, { "epoch": 0.015310367781660842, "grad_norm": 68.36018371582031, "learning_rate": 0.00019924511545293074, "loss": 1.0593, "step": 23 }, { "epoch": 0.01597603594608088, "grad_norm": 83.23944854736328, "learning_rate": 0.0001992007104795737, "loss": 1.1152, "step": 24 }, { "epoch": 0.016641704110500914, "grad_norm": 145.29953002929688, "learning_rate": 0.0001991563055062167, "loss": 1.194, "step": 25 }, { "epoch": 0.017307372274920952, "grad_norm": 71.01487731933594, "learning_rate": 0.00019911190053285968, "loss": 1.0201, "step": 26 }, { "epoch": 0.01797304043934099, "grad_norm": 82.99989318847656, "learning_rate": 0.00019906749555950268, "loss": 1.3319, "step": 27 }, { "epoch": 0.018638708603761024, "grad_norm": 51.65462112426758, "learning_rate": 0.00019902309058614565, "loss": 0.8622, "step": 28 }, { "epoch": 0.019304376768181062, "grad_norm": 106.69747161865234, "learning_rate": 0.00019897868561278865, 
"loss": 1.0051, "step": 29 }, { "epoch": 0.0199700449326011, "grad_norm": 85.53784942626953, "learning_rate": 0.00019893428063943162, "loss": 0.7687, "step": 30 }, { "epoch": 0.020635713097021134, "grad_norm": 141.33482360839844, "learning_rate": 0.00019888987566607462, "loss": 0.9697, "step": 31 }, { "epoch": 0.021301381261441172, "grad_norm": 110.15571594238281, "learning_rate": 0.0001988454706927176, "loss": 0.6955, "step": 32 }, { "epoch": 0.02196704942586121, "grad_norm": 123.28990173339844, "learning_rate": 0.00019880106571936056, "loss": 0.9405, "step": 33 }, { "epoch": 0.022632717590281244, "grad_norm": 104.15376281738281, "learning_rate": 0.00019875666074600356, "loss": 1.1353, "step": 34 }, { "epoch": 0.023298385754701282, "grad_norm": 122.83297729492188, "learning_rate": 0.00019871225577264653, "loss": 0.9912, "step": 35 }, { "epoch": 0.023964053919121316, "grad_norm": 137.1550750732422, "learning_rate": 0.00019866785079928953, "loss": 1.0156, "step": 36 }, { "epoch": 0.024629722083541354, "grad_norm": 95.7591781616211, "learning_rate": 0.0001986234458259325, "loss": 0.9295, "step": 37 }, { "epoch": 0.025295390247961392, "grad_norm": 163.35743713378906, "learning_rate": 0.0001985790408525755, "loss": 0.9697, "step": 38 }, { "epoch": 0.025961058412381426, "grad_norm": 42.57448959350586, "learning_rate": 0.00019853463587921847, "loss": 0.8329, "step": 39 }, { "epoch": 0.026626726576801464, "grad_norm": 161.05844116210938, "learning_rate": 0.00019849023090586147, "loss": 0.8588, "step": 40 }, { "epoch": 0.027292394741221502, "grad_norm": 123.78165435791016, "learning_rate": 0.00019844582593250444, "loss": 0.8082, "step": 41 }, { "epoch": 0.027958062905641536, "grad_norm": 117.62382507324219, "learning_rate": 0.00019840142095914744, "loss": 1.0692, "step": 42 }, { "epoch": 0.028623731070061574, "grad_norm": NaN, "learning_rate": 0.00019840142095914744, "loss": 1.316, "step": 43 }, { "epoch": 0.029289399234481612, "grad_norm": 63.35919952392578, 
"learning_rate": 0.0001983570159857904, "loss": 0.6028, "step": 44 }, { "epoch": 0.029955067398901646, "grad_norm": 277.1460266113281, "learning_rate": 0.00019831261101243338, "loss": 1.4349, "step": 45 }, { "epoch": 0.030620735563321684, "grad_norm": NaN, "learning_rate": 0.00019831261101243338, "loss": 1.816, "step": 46 }, { "epoch": 0.03128640372774172, "grad_norm": 206.0849151611328, "learning_rate": 0.00019826820603907638, "loss": 1.7035, "step": 47 }, { "epoch": 0.03195207189216176, "grad_norm": 154.10183715820312, "learning_rate": 0.00019822380106571935, "loss": 0.7897, "step": 48 }, { "epoch": 0.032617740056581794, "grad_norm": 182.32650756835938, "learning_rate": 0.00019817939609236235, "loss": 1.2611, "step": 49 }, { "epoch": 0.03328340822100183, "grad_norm": 100.30379486083984, "learning_rate": 0.00019813499111900532, "loss": 0.8746, "step": 50 }, { "epoch": 0.03394907638542187, "grad_norm": 99.05520629882812, "learning_rate": 0.00019809058614564832, "loss": 0.6544, "step": 51 }, { "epoch": 0.034614744549841904, "grad_norm": 75.6047592163086, "learning_rate": 0.0001980461811722913, "loss": 0.5308, "step": 52 }, { "epoch": 0.03528041271426194, "grad_norm": 81.81549835205078, "learning_rate": 0.0001980017761989343, "loss": 1.0103, "step": 53 }, { "epoch": 0.03594608087868198, "grad_norm": 305.5655517578125, "learning_rate": 0.00019795737122557726, "loss": 2.0811, "step": 54 }, { "epoch": 0.036611749043102014, "grad_norm": 254.49461364746094, "learning_rate": 0.00019791296625222026, "loss": 1.5316, "step": 55 }, { "epoch": 0.03727741720752205, "grad_norm": 249.2161407470703, "learning_rate": 0.00019786856127886323, "loss": 1.4851, "step": 56 }, { "epoch": 0.03794308537194209, "grad_norm": 187.69805908203125, "learning_rate": 0.00019782415630550623, "loss": 1.0635, "step": 57 }, { "epoch": 0.038608753536362124, "grad_norm": 135.42141723632812, "learning_rate": 0.0001977797513321492, "loss": 0.8954, "step": 58 }, { "epoch": 0.03927442170078216, "grad_norm": 
198.7463836669922, "learning_rate": 0.0001977353463587922, "loss": 1.2765, "step": 59 }, { "epoch": 0.0399400898652022, "grad_norm": 256.67279052734375, "learning_rate": 0.00019769094138543517, "loss": 1.4927, "step": 60 }, { "epoch": 0.040605758029622234, "grad_norm": 64.07623291015625, "learning_rate": 0.00019764653641207817, "loss": 0.7874, "step": 61 }, { "epoch": 0.04127142619404227, "grad_norm": 147.7261962890625, "learning_rate": 0.00019760213143872114, "loss": 0.7924, "step": 62 }, { "epoch": 0.04193709435846231, "grad_norm": 122.39730072021484, "learning_rate": 0.00019755772646536413, "loss": 0.7656, "step": 63 }, { "epoch": 0.042602762522882344, "grad_norm": 94.13729095458984, "learning_rate": 0.0001975133214920071, "loss": 0.9051, "step": 64 }, { "epoch": 0.04326843068730238, "grad_norm": 48.220184326171875, "learning_rate": 0.0001974689165186501, "loss": 0.6262, "step": 65 }, { "epoch": 0.04393409885172242, "grad_norm": 32.33949661254883, "learning_rate": 0.00019742451154529308, "loss": 0.544, "step": 66 }, { "epoch": 0.044599767016142454, "grad_norm": 133.78097534179688, "learning_rate": 0.00019738010657193607, "loss": 0.6472, "step": 67 }, { "epoch": 0.04526543518056249, "grad_norm": 105.52882385253906, "learning_rate": 0.00019733570159857907, "loss": 0.7272, "step": 68 }, { "epoch": 0.04593110334498253, "grad_norm": 40.91939926147461, "learning_rate": 0.00019729129662522204, "loss": 0.6232, "step": 69 }, { "epoch": 0.046596771509402564, "grad_norm": 48.693763732910156, "learning_rate": 0.00019724689165186504, "loss": 0.3698, "step": 70 }, { "epoch": 0.0472624396738226, "grad_norm": 98.95121765136719, "learning_rate": 0.00019720248667850801, "loss": 0.4914, "step": 71 }, { "epoch": 0.04792810783824263, "grad_norm": 217.00401306152344, "learning_rate": 0.00019715808170515098, "loss": 1.3837, "step": 72 }, { "epoch": 0.048593776002662674, "grad_norm": 97.31861877441406, "learning_rate": 0.00019711367673179398, "loss": 0.7037, "step": 73 }, { "epoch": 
0.04925944416708271, "grad_norm": 101.28430938720703, "learning_rate": 0.00019706927175843695, "loss": 1.3999, "step": 74 }, { "epoch": 0.04992511233150274, "grad_norm": 86.35800170898438, "learning_rate": 0.00019702486678507995, "loss": 0.8045, "step": 75 }, { "epoch": 0.050590780495922784, "grad_norm": 102.23155212402344, "learning_rate": 0.00019698046181172292, "loss": 0.7521, "step": 76 }, { "epoch": 0.05125644866034282, "grad_norm": 125.80261993408203, "learning_rate": 0.00019693605683836592, "loss": 0.7933, "step": 77 }, { "epoch": 0.05192211682476285, "grad_norm": 160.28123474121094, "learning_rate": 0.0001968916518650089, "loss": 0.6355, "step": 78 }, { "epoch": 0.052587784989182894, "grad_norm": 42.919464111328125, "learning_rate": 0.0001968472468916519, "loss": 0.2693, "step": 79 }, { "epoch": 0.05325345315360293, "grad_norm": 56.61810302734375, "learning_rate": 0.00019680284191829486, "loss": 0.3141, "step": 80 }, { "epoch": 0.05391912131802296, "grad_norm": 299.3166198730469, "learning_rate": 0.00019675843694493786, "loss": 1.7119, "step": 81 }, { "epoch": 0.054584789482443004, "grad_norm": 133.17459106445312, "learning_rate": 0.00019671403197158083, "loss": 1.2885, "step": 82 }, { "epoch": 0.05525045764686304, "grad_norm": 131.6356964111328, "learning_rate": 0.00019666962699822383, "loss": 0.7976, "step": 83 }, { "epoch": 0.05591612581128307, "grad_norm": 122.43846130371094, "learning_rate": 0.0001966252220248668, "loss": 0.7031, "step": 84 }, { "epoch": 0.056581793975703114, "grad_norm": 51.365901947021484, "learning_rate": 0.00019658081705150977, "loss": 0.6684, "step": 85 }, { "epoch": 0.05724746214012315, "grad_norm": 251.70664978027344, "learning_rate": 0.00019653641207815277, "loss": 1.2329, "step": 86 }, { "epoch": 0.05791313030454318, "grad_norm": 94.94196319580078, "learning_rate": 0.00019649200710479574, "loss": 0.746, "step": 87 }, { "epoch": 0.058578798468963224, "grad_norm": 106.89917755126953, "learning_rate": 0.00019644760213143874, 
"loss": 0.8735, "step": 88 }, { "epoch": 0.05924446663338326, "grad_norm": 130.65577697753906, "learning_rate": 0.00019640319715808171, "loss": 1.0905, "step": 89 }, { "epoch": 0.05991013479780329, "grad_norm": 37.6524543762207, "learning_rate": 0.0001963587921847247, "loss": 0.447, "step": 90 }, { "epoch": 0.060575802962223334, "grad_norm": 55.81675720214844, "learning_rate": 0.00019631438721136768, "loss": 0.8305, "step": 91 }, { "epoch": 0.06124147112664337, "grad_norm": 56.32381057739258, "learning_rate": 0.00019626998223801068, "loss": 0.6437, "step": 92 }, { "epoch": 0.0619071392910634, "grad_norm": 107.72248840332031, "learning_rate": 0.00019622557726465365, "loss": 0.7331, "step": 93 }, { "epoch": 0.06257280745548344, "grad_norm": 60.965084075927734, "learning_rate": 0.00019618117229129665, "loss": 0.1975, "step": 94 }, { "epoch": 0.06323847561990348, "grad_norm": 236.83071899414062, "learning_rate": 0.00019613676731793962, "loss": 1.3465, "step": 95 }, { "epoch": 0.06390414378432352, "grad_norm": 32.12284851074219, "learning_rate": 0.0001960923623445826, "loss": 0.4818, "step": 96 }, { "epoch": 0.06456981194874355, "grad_norm": 139.45631408691406, "learning_rate": 0.0001960479573712256, "loss": 0.7907, "step": 97 }, { "epoch": 0.06523548011316359, "grad_norm": 85.41361999511719, "learning_rate": 0.00019600355239786856, "loss": 0.5648, "step": 98 }, { "epoch": 0.06590114827758363, "grad_norm": 79.85189056396484, "learning_rate": 0.00019595914742451156, "loss": 0.7735, "step": 99 }, { "epoch": 0.06656681644200366, "grad_norm": 64.68962860107422, "learning_rate": 0.00019591474245115453, "loss": 0.4128, "step": 100 }, { "epoch": 0.0672324846064237, "grad_norm": 58.707054138183594, "learning_rate": 0.00019587033747779753, "loss": 0.8143, "step": 101 }, { "epoch": 0.06789815277084374, "grad_norm": 107.83698272705078, "learning_rate": 0.0001958259325044405, "loss": 0.8662, "step": 102 }, { "epoch": 0.06856382093526377, "grad_norm": 221.64857482910156, 
"learning_rate": 0.0001957815275310835, "loss": 1.2667, "step": 103 }, { "epoch": 0.06922948909968381, "grad_norm": 75.43016815185547, "learning_rate": 0.00019573712255772647, "loss": 0.6056, "step": 104 }, { "epoch": 0.06989515726410385, "grad_norm": 81.48358917236328, "learning_rate": 0.00019569271758436947, "loss": 0.4954, "step": 105 }, { "epoch": 0.07056082542852388, "grad_norm": 100.88373565673828, "learning_rate": 0.00019564831261101244, "loss": 0.8129, "step": 106 }, { "epoch": 0.07122649359294392, "grad_norm": 56.06926727294922, "learning_rate": 0.00019560390763765544, "loss": 0.5694, "step": 107 }, { "epoch": 0.07189216175736396, "grad_norm": 119.89936828613281, "learning_rate": 0.0001955595026642984, "loss": 0.8492, "step": 108 }, { "epoch": 0.07255782992178399, "grad_norm": 226.14344787597656, "learning_rate": 0.00019551509769094138, "loss": 1.5915, "step": 109 }, { "epoch": 0.07322349808620403, "grad_norm": 167.6849365234375, "learning_rate": 0.00019547069271758438, "loss": 0.8734, "step": 110 }, { "epoch": 0.07388916625062407, "grad_norm": 191.29388427734375, "learning_rate": 0.00019542628774422735, "loss": 1.1915, "step": 111 }, { "epoch": 0.0745548344150441, "grad_norm": 74.29916381835938, "learning_rate": 0.00019538188277087035, "loss": 0.399, "step": 112 }, { "epoch": 0.07522050257946414, "grad_norm": 193.20899963378906, "learning_rate": 0.00019533747779751332, "loss": 1.0812, "step": 113 }, { "epoch": 0.07588617074388418, "grad_norm": 104.37834930419922, "learning_rate": 0.00019529307282415632, "loss": 0.7821, "step": 114 }, { "epoch": 0.0765518389083042, "grad_norm": 63.434146881103516, "learning_rate": 0.0001952486678507993, "loss": 0.5636, "step": 115 }, { "epoch": 0.07721750707272425, "grad_norm": 163.28578186035156, "learning_rate": 0.0001952042628774423, "loss": 0.8423, "step": 116 }, { "epoch": 0.07788317523714429, "grad_norm": 94.21780395507812, "learning_rate": 0.00019515985790408526, "loss": 0.6704, "step": 117 }, { "epoch": 
0.07854884340156432, "grad_norm": 97.0186767578125, "learning_rate": 0.00019511545293072826, "loss": 0.4906, "step": 118 }, { "epoch": 0.07921451156598436, "grad_norm": 67.47974395751953, "learning_rate": 0.00019507104795737123, "loss": 0.5328, "step": 119 }, { "epoch": 0.0798801797304044, "grad_norm": 113.68756103515625, "learning_rate": 0.00019502664298401423, "loss": 0.7459, "step": 120 }, { "epoch": 0.08054584789482443, "grad_norm": 71.1500244140625, "learning_rate": 0.0001949822380106572, "loss": 0.6177, "step": 121 }, { "epoch": 0.08121151605924447, "grad_norm": 131.37168884277344, "learning_rate": 0.00019493783303730017, "loss": 0.7955, "step": 122 }, { "epoch": 0.08187718422366451, "grad_norm": 72.7484359741211, "learning_rate": 0.00019489342806394317, "loss": 0.7026, "step": 123 }, { "epoch": 0.08254285238808454, "grad_norm": 34.95873260498047, "learning_rate": 0.00019484902309058614, "loss": 0.3607, "step": 124 }, { "epoch": 0.08320852055250458, "grad_norm": 23.322694778442383, "learning_rate": 0.00019480461811722914, "loss": 0.4757, "step": 125 }, { "epoch": 0.08387418871692462, "grad_norm": 102.11334228515625, "learning_rate": 0.0001947602131438721, "loss": 0.9689, "step": 126 }, { "epoch": 0.08453985688134465, "grad_norm": 70.20091247558594, "learning_rate": 0.0001947158081705151, "loss": 0.5523, "step": 127 }, { "epoch": 0.08520552504576469, "grad_norm": 108.84575653076172, "learning_rate": 0.00019467140319715808, "loss": 0.7221, "step": 128 }, { "epoch": 0.08587119321018473, "grad_norm": 74.28465270996094, "learning_rate": 0.00019462699822380108, "loss": 0.3408, "step": 129 }, { "epoch": 0.08653686137460476, "grad_norm": 45.54611587524414, "learning_rate": 0.00019458259325044405, "loss": 0.3064, "step": 130 }, { "epoch": 0.0872025295390248, "grad_norm": 107.5720443725586, "learning_rate": 0.00019453818827708705, "loss": 0.4285, "step": 131 }, { "epoch": 0.08786819770344484, "grad_norm": 113.14404296875, "learning_rate": 0.00019449378330373002, 
"loss": 0.7893, "step": 132 }, { "epoch": 0.08853386586786487, "grad_norm": 41.36758041381836, "learning_rate": 0.00019444937833037302, "loss": 0.2135, "step": 133 }, { "epoch": 0.08919953403228491, "grad_norm": 111.0009765625, "learning_rate": 0.000194404973357016, "loss": 0.4909, "step": 134 }, { "epoch": 0.08986520219670495, "grad_norm": 119.50690460205078, "learning_rate": 0.00019436056838365896, "loss": 0.8744, "step": 135 }, { "epoch": 0.09053087036112498, "grad_norm": 83.16583251953125, "learning_rate": 0.00019431616341030196, "loss": 0.5229, "step": 136 }, { "epoch": 0.09119653852554502, "grad_norm": 96.94984436035156, "learning_rate": 0.00019427175843694493, "loss": 0.6746, "step": 137 }, { "epoch": 0.09186220668996506, "grad_norm": 65.90393829345703, "learning_rate": 0.00019422735346358793, "loss": 0.3112, "step": 138 }, { "epoch": 0.09252787485438509, "grad_norm": 92.0575180053711, "learning_rate": 0.0001941829484902309, "loss": 0.5264, "step": 139 }, { "epoch": 0.09319354301880513, "grad_norm": 63.846744537353516, "learning_rate": 0.0001941385435168739, "loss": 0.8869, "step": 140 }, { "epoch": 0.09385921118322517, "grad_norm": 129.04086303710938, "learning_rate": 0.00019409413854351687, "loss": 0.7077, "step": 141 }, { "epoch": 0.0945248793476452, "grad_norm": 38.49999237060547, "learning_rate": 0.00019404973357015987, "loss": 0.1854, "step": 142 }, { "epoch": 0.09519054751206524, "grad_norm": 142.83619689941406, "learning_rate": 0.00019400532859680284, "loss": 1.3549, "step": 143 }, { "epoch": 0.09585621567648527, "grad_norm": 58.420223236083984, "learning_rate": 0.00019396092362344584, "loss": 0.3978, "step": 144 }, { "epoch": 0.0965218838409053, "grad_norm": 82.57951354980469, "learning_rate": 0.0001939165186500888, "loss": 0.9771, "step": 145 }, { "epoch": 0.09718755200532535, "grad_norm": 52.69244384765625, "learning_rate": 0.00019387211367673178, "loss": 0.389, "step": 146 }, { "epoch": 0.09785322016974538, "grad_norm": 99.57772827148438, 
"learning_rate": 0.00019382770870337478, "loss": 0.938, "step": 147 }, { "epoch": 0.09851888833416542, "grad_norm": 66.55644226074219, "learning_rate": 0.00019378330373001775, "loss": 0.4984, "step": 148 }, { "epoch": 0.09918455649858546, "grad_norm": 64.8500747680664, "learning_rate": 0.00019373889875666075, "loss": 0.4189, "step": 149 }, { "epoch": 0.09985022466300549, "grad_norm": 113.6292495727539, "learning_rate": 0.00019369449378330372, "loss": 0.486, "step": 150 }, { "epoch": 0.10051589282742553, "grad_norm": 83.64993286132812, "learning_rate": 0.00019365008880994672, "loss": 0.3288, "step": 151 }, { "epoch": 0.10118156099184557, "grad_norm": 70.0993423461914, "learning_rate": 0.0001936056838365897, "loss": 0.7637, "step": 152 }, { "epoch": 0.1018472291562656, "grad_norm": 68.05122375488281, "learning_rate": 0.0001935612788632327, "loss": 0.3255, "step": 153 }, { "epoch": 0.10251289732068564, "grad_norm": 56.31193923950195, "learning_rate": 0.00019351687388987566, "loss": 0.1978, "step": 154 }, { "epoch": 0.10317856548510568, "grad_norm": 131.5350799560547, "learning_rate": 0.00019347246891651866, "loss": 0.7034, "step": 155 }, { "epoch": 0.1038442336495257, "grad_norm": 143.9272003173828, "learning_rate": 0.00019342806394316163, "loss": 0.6122, "step": 156 }, { "epoch": 0.10450990181394575, "grad_norm": 56.10287094116211, "learning_rate": 0.00019338365896980463, "loss": 0.5982, "step": 157 }, { "epoch": 0.10517556997836579, "grad_norm": 113.03327941894531, "learning_rate": 0.0001933392539964476, "loss": 0.5578, "step": 158 }, { "epoch": 0.10584123814278582, "grad_norm": 156.54730224609375, "learning_rate": 0.0001932948490230906, "loss": 0.9944, "step": 159 }, { "epoch": 0.10650690630720586, "grad_norm": 117.35420989990234, "learning_rate": 0.00019325044404973357, "loss": 0.9612, "step": 160 }, { "epoch": 0.1071725744716259, "grad_norm": 137.27517700195312, "learning_rate": 0.00019320603907637657, "loss": 0.8892, "step": 161 }, { "epoch": 
0.10783824263604593, "grad_norm": 121.9662094116211, "learning_rate": 0.00019316163410301954, "loss": 0.8541, "step": 162 }, { "epoch": 0.10850391080046597, "grad_norm": 50.96897506713867, "learning_rate": 0.00019311722912966254, "loss": 0.4626, "step": 163 }, { "epoch": 0.10916957896488601, "grad_norm": 86.26678466796875, "learning_rate": 0.0001930728241563055, "loss": 0.4941, "step": 164 }, { "epoch": 0.10983524712930604, "grad_norm": 92.44398498535156, "learning_rate": 0.0001930284191829485, "loss": 0.6016, "step": 165 }, { "epoch": 0.11050091529372608, "grad_norm": 76.84557342529297, "learning_rate": 0.00019298401420959148, "loss": 0.465, "step": 166 }, { "epoch": 0.11116658345814612, "grad_norm": 120.3193588256836, "learning_rate": 0.00019293960923623448, "loss": 0.7597, "step": 167 }, { "epoch": 0.11183225162256615, "grad_norm": 98.53234100341797, "learning_rate": 0.00019289520426287745, "loss": 0.4161, "step": 168 }, { "epoch": 0.11249791978698619, "grad_norm": 136.8874053955078, "learning_rate": 0.00019285079928952045, "loss": 1.0431, "step": 169 }, { "epoch": 0.11316358795140623, "grad_norm": 145.96075439453125, "learning_rate": 0.00019280639431616342, "loss": 0.8223, "step": 170 }, { "epoch": 0.11382925611582626, "grad_norm": 95.09919738769531, "learning_rate": 0.00019276198934280642, "loss": 0.4007, "step": 171 }, { "epoch": 0.1144949242802463, "grad_norm": 95.6208267211914, "learning_rate": 0.0001927175843694494, "loss": 0.4496, "step": 172 }, { "epoch": 0.11516059244466634, "grad_norm": 131.4598846435547, "learning_rate": 0.00019267317939609239, "loss": 0.7203, "step": 173 }, { "epoch": 0.11582626060908637, "grad_norm": 183.2200469970703, "learning_rate": 0.00019262877442273536, "loss": 0.8879, "step": 174 }, { "epoch": 0.1164919287735064, "grad_norm": 65.30191802978516, "learning_rate": 0.00019258436944937836, "loss": 0.2161, "step": 175 }, { "epoch": 0.11715759693792645, "grad_norm": 100.97784423828125, "learning_rate": 0.00019253996447602133, 
"loss": 0.8095, "step": 176 }, { "epoch": 0.11782326510234647, "grad_norm": 57.705963134765625, "learning_rate": 0.00019249555950266432, "loss": 0.3389, "step": 177 }, { "epoch": 0.11848893326676652, "grad_norm": 119.22857666015625, "learning_rate": 0.0001924511545293073, "loss": 0.5287, "step": 178 }, { "epoch": 0.11915460143118656, "grad_norm": 97.59465026855469, "learning_rate": 0.0001924067495559503, "loss": 0.4567, "step": 179 }, { "epoch": 0.11982026959560658, "grad_norm": 47.80794906616211, "learning_rate": 0.00019236234458259327, "loss": 0.3795, "step": 180 }, { "epoch": 0.12048593776002663, "grad_norm": 93.19955444335938, "learning_rate": 0.00019231793960923626, "loss": 1.0376, "step": 181 }, { "epoch": 0.12115160592444667, "grad_norm": 57.24619674682617, "learning_rate": 0.00019227353463587924, "loss": 0.3167, "step": 182 }, { "epoch": 0.1218172740888667, "grad_norm": 71.7004165649414, "learning_rate": 0.00019222912966252223, "loss": 0.3515, "step": 183 }, { "epoch": 0.12248294225328674, "grad_norm": 42.93682098388672, "learning_rate": 0.0001921847246891652, "loss": 0.3272, "step": 184 }, { "epoch": 0.12314861041770678, "grad_norm": 88.22802734375, "learning_rate": 0.00019214031971580818, "loss": 0.5751, "step": 185 }, { "epoch": 0.1238142785821268, "grad_norm": 88.80204772949219, "learning_rate": 0.00019209591474245117, "loss": 0.5306, "step": 186 }, { "epoch": 0.12447994674654685, "grad_norm": 92.46751403808594, "learning_rate": 0.00019205150976909415, "loss": 0.4262, "step": 187 }, { "epoch": 0.12514561491096687, "grad_norm": 55.72347640991211, "learning_rate": 0.00019200710479573714, "loss": 0.3939, "step": 188 }, { "epoch": 0.12581128307538691, "grad_norm": 72.36625671386719, "learning_rate": 0.00019196269982238012, "loss": 0.3739, "step": 189 }, { "epoch": 0.12647695123980696, "grad_norm": 96.56300354003906, "learning_rate": 0.00019191829484902311, "loss": 0.4574, "step": 190 }, { "epoch": 0.127142619404227, "grad_norm": 122.78588104248047, 
"learning_rate": 0.00019187388987566609, "loss": 0.7646, "step": 191 }, { "epoch": 0.12780828756864704, "grad_norm": 74.9114990234375, "learning_rate": 0.00019182948490230908, "loss": 0.2156, "step": 192 }, { "epoch": 0.12847395573306705, "grad_norm": 61.17184829711914, "learning_rate": 0.00019178507992895206, "loss": 0.7067, "step": 193 }, { "epoch": 0.1291396238974871, "grad_norm": 120.3477783203125, "learning_rate": 0.00019174067495559505, "loss": 0.7562, "step": 194 }, { "epoch": 0.12980529206190713, "grad_norm": 204.0528564453125, "learning_rate": 0.00019169626998223802, "loss": 1.2893, "step": 195 }, { "epoch": 0.13047096022632718, "grad_norm": 91.55975341796875, "learning_rate": 0.000191651865008881, "loss": 1.025, "step": 196 }, { "epoch": 0.13113662839074722, "grad_norm": 111.15852355957031, "learning_rate": 0.000191607460035524, "loss": 0.4063, "step": 197 }, { "epoch": 0.13180229655516726, "grad_norm": 90.34394073486328, "learning_rate": 0.00019156305506216697, "loss": 0.6954, "step": 198 }, { "epoch": 0.13246796471958727, "grad_norm": 95.49404907226562, "learning_rate": 0.00019151865008880996, "loss": 0.6267, "step": 199 }, { "epoch": 0.1331336328840073, "grad_norm": 41.02389144897461, "learning_rate": 0.00019147424511545294, "loss": 0.2385, "step": 200 }, { "epoch": 0.13379930104842735, "grad_norm": 91.21604919433594, "learning_rate": 0.00019142984014209593, "loss": 0.4531, "step": 201 }, { "epoch": 0.1344649692128474, "grad_norm": 49.900184631347656, "learning_rate": 0.0001913854351687389, "loss": 0.1657, "step": 202 }, { "epoch": 0.13513063737726744, "grad_norm": 144.78623962402344, "learning_rate": 0.0001913410301953819, "loss": 0.7326, "step": 203 }, { "epoch": 0.13579630554168748, "grad_norm": 143.92132568359375, "learning_rate": 0.00019129662522202487, "loss": 1.0342, "step": 204 }, { "epoch": 0.1364619737061075, "grad_norm": 107.93486022949219, "learning_rate": 0.00019125222024866787, "loss": 0.3643, "step": 205 }, { "epoch": 
0.13712764187052753, "grad_norm": 69.4767074584961, "learning_rate": 0.00019120781527531084, "loss": 0.4302, "step": 206 }, { "epoch": 0.13779331003494757, "grad_norm": 102.22624206542969, "learning_rate": 0.00019116341030195384, "loss": 0.7252, "step": 207 }, { "epoch": 0.13845897819936762, "grad_norm": 39.15114974975586, "learning_rate": 0.00019111900532859681, "loss": 0.376, "step": 208 }, { "epoch": 0.13912464636378766, "grad_norm": 65.12492370605469, "learning_rate": 0.00019107460035523979, "loss": 0.6005, "step": 209 }, { "epoch": 0.1397903145282077, "grad_norm": 95.04318237304688, "learning_rate": 0.00019103019538188278, "loss": 0.8371, "step": 210 }, { "epoch": 0.1404559826926277, "grad_norm": 119.12702941894531, "learning_rate": 0.00019098579040852576, "loss": 0.5701, "step": 211 }, { "epoch": 0.14112165085704775, "grad_norm": 56.38259506225586, "learning_rate": 0.00019094138543516875, "loss": 0.6285, "step": 212 }, { "epoch": 0.1417873190214678, "grad_norm": 29.10639762878418, "learning_rate": 0.00019089698046181172, "loss": 0.4639, "step": 213 }, { "epoch": 0.14245298718588784, "grad_norm": 60.445865631103516, "learning_rate": 0.00019085257548845472, "loss": 0.6099, "step": 214 }, { "epoch": 0.14311865535030788, "grad_norm": 83.33357238769531, "learning_rate": 0.0001908081705150977, "loss": 0.6992, "step": 215 }, { "epoch": 0.14378432351472792, "grad_norm": 50.539451599121094, "learning_rate": 0.0001907637655417407, "loss": 0.673, "step": 216 }, { "epoch": 0.14444999167914793, "grad_norm": 108.4279556274414, "learning_rate": 0.00019071936056838366, "loss": 0.7867, "step": 217 }, { "epoch": 0.14511565984356797, "grad_norm": 71.25292205810547, "learning_rate": 0.00019067495559502666, "loss": 0.2414, "step": 218 }, { "epoch": 0.14578132800798801, "grad_norm": 96.37142944335938, "learning_rate": 0.00019063055062166963, "loss": 0.4556, "step": 219 }, { "epoch": 0.14644699617240806, "grad_norm": 105.84346008300781, "learning_rate": 0.00019058614564831263, 
"loss": 0.5288, "step": 220 }, { "epoch": 0.1471126643368281, "grad_norm": 53.66656494140625, "learning_rate": 0.0001905417406749556, "loss": 0.2665, "step": 221 }, { "epoch": 0.14777833250124814, "grad_norm": 73.43135070800781, "learning_rate": 0.00019049733570159857, "loss": 0.465, "step": 222 }, { "epoch": 0.14844400066566815, "grad_norm": 108.94127655029297, "learning_rate": 0.00019045293072824157, "loss": 0.6516, "step": 223 }, { "epoch": 0.1491096688300882, "grad_norm": 23.10163688659668, "learning_rate": 0.00019040852575488454, "loss": 0.1401, "step": 224 }, { "epoch": 0.14977533699450823, "grad_norm": 182.62322998046875, "learning_rate": 0.00019036412078152754, "loss": 0.7953, "step": 225 }, { "epoch": 0.15044100515892828, "grad_norm": 104.31300354003906, "learning_rate": 0.00019031971580817051, "loss": 0.5117, "step": 226 }, { "epoch": 0.15110667332334832, "grad_norm": 122.88026428222656, "learning_rate": 0.0001902753108348135, "loss": 0.8029, "step": 227 }, { "epoch": 0.15177234148776836, "grad_norm": 72.17605590820312, "learning_rate": 0.00019023090586145648, "loss": 0.3403, "step": 228 }, { "epoch": 0.15243800965218837, "grad_norm": 55.70381164550781, "learning_rate": 0.00019018650088809948, "loss": 0.2811, "step": 229 }, { "epoch": 0.1531036778166084, "grad_norm": 80.8816146850586, "learning_rate": 0.00019014209591474245, "loss": 1.0904, "step": 230 }, { "epoch": 0.15376934598102845, "grad_norm": 77.0992431640625, "learning_rate": 0.00019009769094138545, "loss": 0.8617, "step": 231 }, { "epoch": 0.1544350141454485, "grad_norm": 47.77476119995117, "learning_rate": 0.00019005328596802842, "loss": 0.3185, "step": 232 }, { "epoch": 0.15510068230986854, "grad_norm": 77.98711395263672, "learning_rate": 0.00019000888099467142, "loss": 0.2655, "step": 233 }, { "epoch": 0.15576635047428858, "grad_norm": 63.54255294799805, "learning_rate": 0.0001899644760213144, "loss": 0.3955, "step": 234 }, { "epoch": 0.1564320186387086, "grad_norm": 190.26271057128906, 
"learning_rate": 0.00018992007104795736, "loss": 1.2886, "step": 235 }, { "epoch": 0.15709768680312863, "grad_norm": 117.44766998291016, "learning_rate": 0.00018987566607460036, "loss": 0.7142, "step": 236 }, { "epoch": 0.15776335496754867, "grad_norm": 133.50717163085938, "learning_rate": 0.00018983126110124333, "loss": 0.8898, "step": 237 }, { "epoch": 0.15842902313196872, "grad_norm": 62.771507263183594, "learning_rate": 0.00018978685612788633, "loss": 0.3062, "step": 238 }, { "epoch": 0.15909469129638876, "grad_norm": 74.36737060546875, "learning_rate": 0.0001897424511545293, "loss": 0.5336, "step": 239 }, { "epoch": 0.1597603594608088, "grad_norm": 137.45458984375, "learning_rate": 0.0001896980461811723, "loss": 0.7025, "step": 240 }, { "epoch": 0.1604260276252288, "grad_norm": 54.84712600708008, "learning_rate": 0.00018965364120781527, "loss": 0.338, "step": 241 }, { "epoch": 0.16109169578964885, "grad_norm": 127.16575622558594, "learning_rate": 0.00018960923623445827, "loss": 1.0382, "step": 242 }, { "epoch": 0.1617573639540689, "grad_norm": 93.72176361083984, "learning_rate": 0.00018956483126110124, "loss": 1.0084, "step": 243 }, { "epoch": 0.16242303211848894, "grad_norm": 56.54580307006836, "learning_rate": 0.00018952042628774424, "loss": 0.5353, "step": 244 }, { "epoch": 0.16308870028290898, "grad_norm": 76.96385955810547, "learning_rate": 0.0001894760213143872, "loss": 0.397, "step": 245 }, { "epoch": 0.16375436844732902, "grad_norm": 91.15630340576172, "learning_rate": 0.00018943161634103018, "loss": 0.4533, "step": 246 }, { "epoch": 0.16442003661174903, "grad_norm": 103.4432373046875, "learning_rate": 0.00018938721136767318, "loss": 0.6514, "step": 247 }, { "epoch": 0.16508570477616907, "grad_norm": 60.81359100341797, "learning_rate": 0.00018934280639431615, "loss": 0.4884, "step": 248 }, { "epoch": 0.16575137294058911, "grad_norm": 46.649139404296875, "learning_rate": 0.00018929840142095915, "loss": 0.271, "step": 249 }, { "epoch": 
0.16641704110500916, "grad_norm": 58.7072868347168, "learning_rate": 0.00018925399644760212, "loss": 0.27, "step": 250 }, { "epoch": 0.1670827092694292, "grad_norm": 79.08338928222656, "learning_rate": 0.00018920959147424512, "loss": 0.4671, "step": 251 }, { "epoch": 0.16774837743384924, "grad_norm": 98.57723236083984, "learning_rate": 0.0001891651865008881, "loss": 0.3235, "step": 252 }, { "epoch": 0.16841404559826925, "grad_norm": 131.836181640625, "learning_rate": 0.0001891207815275311, "loss": 0.4688, "step": 253 }, { "epoch": 0.1690797137626893, "grad_norm": 67.51493835449219, "learning_rate": 0.00018907637655417406, "loss": 0.6911, "step": 254 }, { "epoch": 0.16974538192710933, "grad_norm": 51.76738357543945, "learning_rate": 0.00018903197158081706, "loss": 0.1837, "step": 255 }, { "epoch": 0.17041105009152938, "grad_norm": 52.23995590209961, "learning_rate": 0.00018898756660746003, "loss": 0.71, "step": 256 }, { "epoch": 0.17107671825594942, "grad_norm": 49.984336853027344, "learning_rate": 0.00018894316163410303, "loss": 0.3221, "step": 257 }, { "epoch": 0.17174238642036946, "grad_norm": 48.08108901977539, "learning_rate": 0.000188898756660746, "loss": 0.3334, "step": 258 }, { "epoch": 0.17240805458478947, "grad_norm": 24.143003463745117, "learning_rate": 0.00018885435168738897, "loss": 0.2096, "step": 259 }, { "epoch": 0.1730737227492095, "grad_norm": 116.61510467529297, "learning_rate": 0.00018880994671403197, "loss": 0.8852, "step": 260 }, { "epoch": 0.17373939091362955, "grad_norm": 41.168052673339844, "learning_rate": 0.00018876554174067494, "loss": 0.2541, "step": 261 }, { "epoch": 0.1744050590780496, "grad_norm": 19.256343841552734, "learning_rate": 0.00018872113676731794, "loss": 0.2278, "step": 262 }, { "epoch": 0.17507072724246964, "grad_norm": 51.543418884277344, "learning_rate": 0.0001886767317939609, "loss": 0.2494, "step": 263 }, { "epoch": 0.17573639540688968, "grad_norm": 44.67826461791992, "learning_rate": 0.0001886323268206039, "loss": 
0.3112, "step": 264 }, { "epoch": 0.1764020635713097, "grad_norm": 41.30339431762695, "learning_rate": 0.00018858792184724688, "loss": 0.5114, "step": 265 }, { "epoch": 0.17706773173572973, "grad_norm": 12.575431823730469, "learning_rate": 0.00018854351687388988, "loss": 0.0375, "step": 266 }, { "epoch": 0.17773339990014977, "grad_norm": 71.91178894042969, "learning_rate": 0.00018849911190053285, "loss": 0.5989, "step": 267 }, { "epoch": 0.17839906806456982, "grad_norm": 83.31620788574219, "learning_rate": 0.00018845470692717585, "loss": 0.51, "step": 268 }, { "epoch": 0.17906473622898986, "grad_norm": 83.13764190673828, "learning_rate": 0.00018841030195381885, "loss": 1.1146, "step": 269 }, { "epoch": 0.1797304043934099, "grad_norm": 151.9849853515625, "learning_rate": 0.00018836589698046182, "loss": 1.1219, "step": 270 }, { "epoch": 0.1803960725578299, "grad_norm": 121.70146179199219, "learning_rate": 0.00018832149200710482, "loss": 0.9397, "step": 271 }, { "epoch": 0.18106174072224995, "grad_norm": 51.62641906738281, "learning_rate": 0.0001882770870337478, "loss": 0.2968, "step": 272 }, { "epoch": 0.18172740888667, "grad_norm": 66.82881164550781, "learning_rate": 0.0001882326820603908, "loss": 0.5365, "step": 273 }, { "epoch": 0.18239307705109004, "grad_norm": 78.71537017822266, "learning_rate": 0.00018818827708703376, "loss": 0.8149, "step": 274 }, { "epoch": 0.18305874521551008, "grad_norm": 137.4581756591797, "learning_rate": 0.00018814387211367676, "loss": 0.6777, "step": 275 }, { "epoch": 0.18372441337993012, "grad_norm": 64.81407165527344, "learning_rate": 0.00018809946714031973, "loss": 0.7677, "step": 276 }, { "epoch": 0.18439008154435013, "grad_norm": 96.37416076660156, "learning_rate": 0.00018805506216696273, "loss": 1.332, "step": 277 }, { "epoch": 0.18505574970877017, "grad_norm": 47.11737823486328, "learning_rate": 0.0001880106571936057, "loss": 0.29, "step": 278 }, { "epoch": 0.18572141787319021, "grad_norm": 111.5100326538086, "learning_rate": 
0.0001879662522202487, "loss": 0.5466, "step": 279 }, { "epoch": 0.18638708603761026, "grad_norm": 123.91747283935547, "learning_rate": 0.00018792184724689167, "loss": 0.6733, "step": 280 }, { "epoch": 0.1870527542020303, "grad_norm": 107.9887924194336, "learning_rate": 0.00018787744227353467, "loss": 0.4885, "step": 281 }, { "epoch": 0.18771842236645034, "grad_norm": 76.66865539550781, "learning_rate": 0.00018783303730017764, "loss": 0.6393, "step": 282 }, { "epoch": 0.18838409053087035, "grad_norm": 67.36553192138672, "learning_rate": 0.00018778863232682064, "loss": 0.4087, "step": 283 }, { "epoch": 0.1890497586952904, "grad_norm": 50.0120849609375, "learning_rate": 0.0001877442273534636, "loss": 0.3911, "step": 284 }, { "epoch": 0.18971542685971043, "grad_norm": 109.88864135742188, "learning_rate": 0.00018769982238010658, "loss": 0.6727, "step": 285 }, { "epoch": 0.19038109502413048, "grad_norm": 114.64707946777344, "learning_rate": 0.00018765541740674958, "loss": 0.8044, "step": 286 }, { "epoch": 0.19104676318855052, "grad_norm": 130.69219970703125, "learning_rate": 0.00018761101243339255, "loss": 0.6119, "step": 287 }, { "epoch": 0.19171243135297053, "grad_norm": 50.73808670043945, "learning_rate": 0.00018756660746003555, "loss": 0.4887, "step": 288 }, { "epoch": 0.19237809951739057, "grad_norm": 143.2826385498047, "learning_rate": 0.00018752220248667852, "loss": 0.8228, "step": 289 }, { "epoch": 0.1930437676818106, "grad_norm": 132.3501739501953, "learning_rate": 0.00018747779751332152, "loss": 0.808, "step": 290 }, { "epoch": 0.19370943584623065, "grad_norm": 74.29531860351562, "learning_rate": 0.0001874333925399645, "loss": 0.7815, "step": 291 }, { "epoch": 0.1943751040106507, "grad_norm": 81.61593627929688, "learning_rate": 0.00018738898756660749, "loss": 0.5732, "step": 292 }, { "epoch": 0.19504077217507074, "grad_norm": 86.1830825805664, "learning_rate": 0.00018734458259325046, "loss": 0.5226, "step": 293 }, { "epoch": 0.19570644033949075, "grad_norm": 
23.495864868164062, "learning_rate": 0.00018730017761989346, "loss": 0.1476, "step": 294 }, { "epoch": 0.1963721085039108, "grad_norm": 45.22040939331055, "learning_rate": 0.00018725577264653643, "loss": 0.4335, "step": 295 }, { "epoch": 0.19703777666833083, "grad_norm": 102.33810424804688, "learning_rate": 0.0001872113676731794, "loss": 0.6778, "step": 296 }, { "epoch": 0.19770344483275087, "grad_norm": 90.49777221679688, "learning_rate": 0.0001871669626998224, "loss": 0.7652, "step": 297 }, { "epoch": 0.19836911299717092, "grad_norm": 14.38054084777832, "learning_rate": 0.00018712255772646537, "loss": 0.0598, "step": 298 }, { "epoch": 0.19903478116159096, "grad_norm": 24.415199279785156, "learning_rate": 0.00018707815275310837, "loss": 0.3666, "step": 299 }, { "epoch": 0.19970044932601097, "grad_norm": 37.869869232177734, "learning_rate": 0.00018703374777975134, "loss": 0.1511, "step": 300 }, { "epoch": 0.200366117490431, "grad_norm": 115.4464111328125, "learning_rate": 0.00018698934280639434, "loss": 0.7441, "step": 301 }, { "epoch": 0.20103178565485105, "grad_norm": 124.49742889404297, "learning_rate": 0.0001869449378330373, "loss": 0.6898, "step": 302 }, { "epoch": 0.2016974538192711, "grad_norm": 45.529197692871094, "learning_rate": 0.0001869005328596803, "loss": 0.2891, "step": 303 }, { "epoch": 0.20236312198369114, "grad_norm": 107.60458374023438, "learning_rate": 0.00018685612788632328, "loss": 0.6037, "step": 304 }, { "epoch": 0.20302879014811118, "grad_norm": 72.92047119140625, "learning_rate": 0.00018681172291296628, "loss": 0.8364, "step": 305 }, { "epoch": 0.2036944583125312, "grad_norm": 80.35264587402344, "learning_rate": 0.00018676731793960925, "loss": 0.6218, "step": 306 }, { "epoch": 0.20436012647695123, "grad_norm": 181.0091552734375, "learning_rate": 0.00018672291296625225, "loss": 0.8943, "step": 307 }, { "epoch": 0.20502579464137127, "grad_norm": 54.508758544921875, "learning_rate": 0.00018667850799289522, "loss": 0.3581, "step": 308 }, { 
"epoch": 0.20569146280579131, "grad_norm": 22.76518440246582, "learning_rate": 0.0001866341030195382, "loss": 0.0707, "step": 309 }, { "epoch": 0.20635713097021136, "grad_norm": 144.45138549804688, "learning_rate": 0.00018658969804618119, "loss": 1.199, "step": 310 }, { "epoch": 0.2070227991346314, "grad_norm": 51.15176010131836, "learning_rate": 0.00018654529307282416, "loss": 0.2966, "step": 311 }, { "epoch": 0.2076884672990514, "grad_norm": 84.0188980102539, "learning_rate": 0.00018650088809946716, "loss": 0.3478, "step": 312 }, { "epoch": 0.20835413546347145, "grad_norm": 74.65904235839844, "learning_rate": 0.00018645648312611013, "loss": 0.5906, "step": 313 }, { "epoch": 0.2090198036278915, "grad_norm": 76.40025329589844, "learning_rate": 0.00018641207815275313, "loss": 0.5313, "step": 314 }, { "epoch": 0.20968547179231153, "grad_norm": 67.46442413330078, "learning_rate": 0.0001863676731793961, "loss": 0.5431, "step": 315 }, { "epoch": 0.21035113995673158, "grad_norm": 153.51947021484375, "learning_rate": 0.0001863232682060391, "loss": 1.4613, "step": 316 }, { "epoch": 0.21101680812115162, "grad_norm": 141.46417236328125, "learning_rate": 0.00018627886323268207, "loss": 0.6173, "step": 317 }, { "epoch": 0.21168247628557163, "grad_norm": 117.94355773925781, "learning_rate": 0.00018623445825932506, "loss": 0.703, "step": 318 }, { "epoch": 0.21234814444999167, "grad_norm": 72.00027465820312, "learning_rate": 0.00018619005328596804, "loss": 0.3786, "step": 319 }, { "epoch": 0.2130138126144117, "grad_norm": 54.18503189086914, "learning_rate": 0.00018614564831261103, "loss": 0.3633, "step": 320 }, { "epoch": 0.21367948077883175, "grad_norm": 121.49830627441406, "learning_rate": 0.000186101243339254, "loss": 0.7459, "step": 321 }, { "epoch": 0.2143451489432518, "grad_norm": 176.50872802734375, "learning_rate": 0.00018605683836589698, "loss": 0.8166, "step": 322 }, { "epoch": 0.21501081710767184, "grad_norm": 151.5113067626953, "learning_rate": 0.00018601243339253998, 
"loss": 0.7919, "step": 323 }, { "epoch": 0.21567648527209185, "grad_norm": 84.63894653320312, "learning_rate": 0.00018596802841918295, "loss": 0.7106, "step": 324 }, { "epoch": 0.2163421534365119, "grad_norm": 102.90968322753906, "learning_rate": 0.00018592362344582595, "loss": 0.689, "step": 325 }, { "epoch": 0.21700782160093193, "grad_norm": 61.89885330200195, "learning_rate": 0.00018587921847246892, "loss": 0.3393, "step": 326 }, { "epoch": 0.21767348976535197, "grad_norm": 106.49290466308594, "learning_rate": 0.00018583481349911191, "loss": 0.3928, "step": 327 }, { "epoch": 0.21833915792977202, "grad_norm": 43.59565353393555, "learning_rate": 0.00018579040852575489, "loss": 0.3696, "step": 328 }, { "epoch": 0.21900482609419206, "grad_norm": 67.05834197998047, "learning_rate": 0.00018574600355239788, "loss": 0.8861, "step": 329 }, { "epoch": 0.21967049425861207, "grad_norm": 108.7596435546875, "learning_rate": 0.00018570159857904086, "loss": 1.1147, "step": 330 }, { "epoch": 0.2203361624230321, "grad_norm": 45.871212005615234, "learning_rate": 0.00018565719360568385, "loss": 0.2634, "step": 331 }, { "epoch": 0.22100183058745215, "grad_norm": 97.05673217773438, "learning_rate": 0.00018561278863232683, "loss": 0.7176, "step": 332 }, { "epoch": 0.2216674987518722, "grad_norm": 37.64607238769531, "learning_rate": 0.00018556838365896982, "loss": 0.405, "step": 333 }, { "epoch": 0.22233316691629224, "grad_norm": 82.15957641601562, "learning_rate": 0.0001855239786856128, "loss": 0.3589, "step": 334 }, { "epoch": 0.22299883508071228, "grad_norm": 48.32238006591797, "learning_rate": 0.00018547957371225577, "loss": 0.2946, "step": 335 }, { "epoch": 0.2236645032451323, "grad_norm": 33.3898811340332, "learning_rate": 0.00018543516873889876, "loss": 0.388, "step": 336 }, { "epoch": 0.22433017140955233, "grad_norm": 42.87260818481445, "learning_rate": 0.00018539076376554174, "loss": 0.2302, "step": 337 }, { "epoch": 0.22499583957397237, "grad_norm": 58.45392608642578, 
"learning_rate": 0.00018534635879218473, "loss": 0.6616, "step": 338 }, { "epoch": 0.22566150773839241, "grad_norm": 28.114885330200195, "learning_rate": 0.0001853019538188277, "loss": 0.1042, "step": 339 }, { "epoch": 0.22632717590281246, "grad_norm": 51.35744857788086, "learning_rate": 0.0001852575488454707, "loss": 0.4782, "step": 340 }, { "epoch": 0.2269928440672325, "grad_norm": 82.15337371826172, "learning_rate": 0.00018521314387211368, "loss": 0.3746, "step": 341 }, { "epoch": 0.2276585122316525, "grad_norm": 109.32012176513672, "learning_rate": 0.00018516873889875667, "loss": 0.6589, "step": 342 }, { "epoch": 0.22832418039607255, "grad_norm": 155.4207000732422, "learning_rate": 0.00018512433392539965, "loss": 1.1968, "step": 343 }, { "epoch": 0.2289898485604926, "grad_norm": 33.03417205810547, "learning_rate": 0.00018507992895204264, "loss": 0.3845, "step": 344 }, { "epoch": 0.22965551672491263, "grad_norm": 37.060585021972656, "learning_rate": 0.00018503552397868561, "loss": 0.1341, "step": 345 }, { "epoch": 0.23032118488933268, "grad_norm": 106.52214050292969, "learning_rate": 0.00018499111900532859, "loss": 0.7823, "step": 346 }, { "epoch": 0.23098685305375272, "grad_norm": 48.00297164916992, "learning_rate": 0.00018494671403197158, "loss": 0.6375, "step": 347 }, { "epoch": 0.23165252121817273, "grad_norm": 51.31806564331055, "learning_rate": 0.00018490230905861456, "loss": 0.4176, "step": 348 }, { "epoch": 0.23231818938259277, "grad_norm": 57.65751647949219, "learning_rate": 0.00018485790408525755, "loss": 0.2368, "step": 349 }, { "epoch": 0.2329838575470128, "grad_norm": 46.041908264160156, "learning_rate": 0.00018481349911190053, "loss": 0.4219, "step": 350 }, { "epoch": 0.23364952571143285, "grad_norm": 97.23726654052734, "learning_rate": 0.00018476909413854352, "loss": 0.8618, "step": 351 }, { "epoch": 0.2343151938758529, "grad_norm": 39.60933303833008, "learning_rate": 0.0001847246891651865, "loss": 0.2446, "step": 352 }, { "epoch": 
0.23498086204027294, "grad_norm": 83.39789581298828, "learning_rate": 0.0001846802841918295, "loss": 0.5283, "step": 353 }, { "epoch": 0.23564653020469295, "grad_norm": 23.28626823425293, "learning_rate": 0.00018463587921847246, "loss": 0.2269, "step": 354 }, { "epoch": 0.236312198369113, "grad_norm": 75.43800354003906, "learning_rate": 0.00018459147424511546, "loss": 0.5959, "step": 355 }, { "epoch": 0.23697786653353303, "grad_norm": 33.72787094116211, "learning_rate": 0.00018454706927175843, "loss": 0.2795, "step": 356 }, { "epoch": 0.23764353469795307, "grad_norm": 56.4429817199707, "learning_rate": 0.00018450266429840143, "loss": 0.6191, "step": 357 }, { "epoch": 0.23830920286237312, "grad_norm": 97.20005798339844, "learning_rate": 0.0001844582593250444, "loss": 0.6252, "step": 358 }, { "epoch": 0.23897487102679316, "grad_norm": null, "learning_rate": 0.0001844582593250444, "loss": 1.1109, "step": 359 }, { "epoch": 0.23964053919121317, "grad_norm": 103.0977783203125, "learning_rate": 0.00018441385435168738, "loss": 0.5326, "step": 360 }, { "epoch": 0.2403062073556332, "grad_norm": 33.12345504760742, "learning_rate": 0.00018436944937833037, "loss": 0.4465, "step": 361 }, { "epoch": 0.24097187552005325, "grad_norm": 29.757606506347656, "learning_rate": 0.00018432504440497335, "loss": 0.3853, "step": 362 }, { "epoch": 0.2416375436844733, "grad_norm": 52.76167678833008, "learning_rate": 0.00018428063943161634, "loss": 0.3521, "step": 363 }, { "epoch": 0.24230321184889334, "grad_norm": 80.51199340820312, "learning_rate": 0.00018423623445825931, "loss": 0.4551, "step": 364 }, { "epoch": 0.24296888001331338, "grad_norm": 47.3682861328125, "learning_rate": 0.0001841918294849023, "loss": 0.2393, "step": 365 }, { "epoch": 0.2436345481777334, "grad_norm": 80.18375396728516, "learning_rate": 0.00018414742451154528, "loss": 0.9122, "step": 366 }, { "epoch": 0.24430021634215343, "grad_norm": 65.6572265625, "learning_rate": 0.00018410301953818828, "loss": 0.3345, "step": 367
}, { "epoch": 0.24496588450657347, "grad_norm": 108.21941375732422, "learning_rate": 0.00018405861456483125, "loss": 0.5405, "step": 368 }, { "epoch": 0.2456315526709935, "grad_norm": 57.63541030883789, "learning_rate": 0.00018401420959147425, "loss": 0.2502, "step": 369 }, { "epoch": 0.24629722083541356, "grad_norm": 35.54652786254883, "learning_rate": 0.00018396980461811722, "loss": 0.5846, "step": 370 }, { "epoch": 0.2469628889998336, "grad_norm": 55.58504867553711, "learning_rate": 0.00018392539964476022, "loss": 0.2154, "step": 371 }, { "epoch": 0.2476285571642536, "grad_norm": 55.62077331542969, "learning_rate": 0.0001838809946714032, "loss": 0.4984, "step": 372 }, { "epoch": 0.24829422532867365, "grad_norm": 87.00872039794922, "learning_rate": 0.0001838365896980462, "loss": 0.4538, "step": 373 }, { "epoch": 0.2489598934930937, "grad_norm": 46.597068786621094, "learning_rate": 0.00018379218472468916, "loss": 0.648, "step": 374 }, { "epoch": 0.24962556165751373, "grad_norm": 99.84630584716797, "learning_rate": 0.00018374777975133216, "loss": 0.7931, "step": 375 }, { "epoch": 0.25029122982193375, "grad_norm": 54.02254104614258, "learning_rate": 0.00018370337477797513, "loss": 0.2499, "step": 376 }, { "epoch": 0.2509568979863538, "grad_norm": 29.319000244140625, "learning_rate": 0.00018365896980461813, "loss": 0.4007, "step": 377 }, { "epoch": 0.25162256615077383, "grad_norm": 113.27352905273438, "learning_rate": 0.0001836145648312611, "loss": 0.7379, "step": 378 }, { "epoch": 0.25228823431519387, "grad_norm": 112.12903594970703, "learning_rate": 0.0001835701598579041, "loss": 0.6326, "step": 379 }, { "epoch": 0.2529539024796139, "grad_norm": 123.96865844726562, "learning_rate": 0.00018352575488454707, "loss": 0.5521, "step": 380 }, { "epoch": 0.25361957064403395, "grad_norm": 42.07058334350586, "learning_rate": 0.00018348134991119007, "loss": 0.4259, "step": 381 }, { "epoch": 0.254285238808454, "grad_norm": 55.002960205078125, "learning_rate": 
0.00018343694493783304, "loss": 0.4797, "step": 382 }, { "epoch": 0.25495090697287404, "grad_norm": 33.028038024902344, "learning_rate": 0.00018339253996447604, "loss": 0.2893, "step": 383 }, { "epoch": 0.2556165751372941, "grad_norm": 54.003570556640625, "learning_rate": 0.000183348134991119, "loss": 0.4897, "step": 384 }, { "epoch": 0.2562822433017141, "grad_norm": 80.12110137939453, "learning_rate": 0.000183303730017762, "loss": 0.2902, "step": 385 }, { "epoch": 0.2569479114661341, "grad_norm": 46.586753845214844, "learning_rate": 0.00018325932504440498, "loss": 0.3721, "step": 386 }, { "epoch": 0.25761357963055415, "grad_norm": 45.596553802490234, "learning_rate": 0.00018321492007104798, "loss": 0.2719, "step": 387 }, { "epoch": 0.2582792477949742, "grad_norm": 90.06937408447266, "learning_rate": 0.00018317051509769095, "loss": 0.6424, "step": 388 }, { "epoch": 0.25894491595939423, "grad_norm": 44.34123611450195, "learning_rate": 0.00018312611012433395, "loss": 0.2926, "step": 389 }, { "epoch": 0.25961058412381427, "grad_norm": 123.05774688720703, "learning_rate": 0.00018308170515097692, "loss": 0.8703, "step": 390 }, { "epoch": 0.2602762522882343, "grad_norm": 132.67784118652344, "learning_rate": 0.00018303730017761992, "loss": 0.492, "step": 391 }, { "epoch": 0.26094192045265435, "grad_norm": 65.75527954101562, "learning_rate": 0.0001829928952042629, "loss": 0.2371, "step": 392 }, { "epoch": 0.2616075886170744, "grad_norm": 70.54750061035156, "learning_rate": 0.0001829484902309059, "loss": 0.5192, "step": 393 }, { "epoch": 0.26227325678149443, "grad_norm": 85.00897216796875, "learning_rate": 0.00018290408525754886, "loss": 0.7114, "step": 394 }, { "epoch": 0.2629389249459145, "grad_norm": 96.78849792480469, "learning_rate": 0.00018285968028419186, "loss": 0.4515, "step": 395 }, { "epoch": 0.2636045931103345, "grad_norm": 21.13523292541504, "learning_rate": 0.00018281527531083483, "loss": 0.3526, "step": 396 }, { "epoch": 0.26427026127475456, "grad_norm": 
29.27351188659668, "learning_rate": 0.0001827708703374778, "loss": 0.5151, "step": 397 }, { "epoch": 0.26493592943917454, "grad_norm": 35.63692855834961, "learning_rate": 0.0001827264653641208, "loss": 0.2374, "step": 398 }, { "epoch": 0.2656015976035946, "grad_norm": 66.08139038085938, "learning_rate": 0.00018268206039076377, "loss": 0.4133, "step": 399 }, { "epoch": 0.2662672657680146, "grad_norm": 34.3465461730957, "learning_rate": 0.00018263765541740677, "loss": 0.3082, "step": 400 }, { "epoch": 0.26693293393243467, "grad_norm": 100.62875366210938, "learning_rate": 0.00018259325044404974, "loss": 0.6884, "step": 401 }, { "epoch": 0.2675986020968547, "grad_norm": 139.31932067871094, "learning_rate": 0.00018254884547069274, "loss": 1.0101, "step": 402 }, { "epoch": 0.26826427026127475, "grad_norm": 118.92940521240234, "learning_rate": 0.0001825044404973357, "loss": 0.9706, "step": 403 }, { "epoch": 0.2689299384256948, "grad_norm": 151.63421630859375, "learning_rate": 0.0001824600355239787, "loss": 0.961, "step": 404 }, { "epoch": 0.26959560659011483, "grad_norm": 27.14082145690918, "learning_rate": 0.00018241563055062168, "loss": 0.227, "step": 405 }, { "epoch": 0.2702612747545349, "grad_norm": 80.59782409667969, "learning_rate": 0.00018237122557726468, "loss": 0.6828, "step": 406 }, { "epoch": 0.2709269429189549, "grad_norm": 49.46958541870117, "learning_rate": 0.00018232682060390765, "loss": 0.3184, "step": 407 }, { "epoch": 0.27159261108337496, "grad_norm": 73.98738098144531, "learning_rate": 0.00018228241563055065, "loss": 0.6858, "step": 408 }, { "epoch": 0.272258279247795, "grad_norm": 86.27637481689453, "learning_rate": 0.00018223801065719362, "loss": 0.5358, "step": 409 }, { "epoch": 0.272923947412215, "grad_norm": 145.42340087890625, "learning_rate": 0.0001821936056838366, "loss": 1.0992, "step": 410 }, { "epoch": 0.273589615576635, "grad_norm": 41.17599105834961, "learning_rate": 0.0001821492007104796, "loss": 0.369, "step": 411 }, { "epoch": 
0.27425528374105507, "grad_norm": 55.49694061279297, "learning_rate": 0.00018210479573712256, "loss": 0.2842, "step": 412 }, { "epoch": 0.2749209519054751, "grad_norm": 47.01858139038086, "learning_rate": 0.00018206039076376556, "loss": 0.3368, "step": 413 }, { "epoch": 0.27558662006989515, "grad_norm": 40.550880432128906, "learning_rate": 0.00018201598579040853, "loss": 0.2099, "step": 414 }, { "epoch": 0.2762522882343152, "grad_norm": 38.81869125366211, "learning_rate": 0.00018197158081705153, "loss": 0.28, "step": 415 }, { "epoch": 0.27691795639873523, "grad_norm": 65.78675842285156, "learning_rate": 0.0001819271758436945, "loss": 0.521, "step": 416 }, { "epoch": 0.2775836245631553, "grad_norm": 73.13219451904297, "learning_rate": 0.0001818827708703375, "loss": 0.7937, "step": 417 }, { "epoch": 0.2782492927275753, "grad_norm": 65.54695129394531, "learning_rate": 0.00018183836589698047, "loss": 0.4079, "step": 418 }, { "epoch": 0.27891496089199536, "grad_norm": 34.30622863769531, "learning_rate": 0.00018179396092362347, "loss": 0.0964, "step": 419 }, { "epoch": 0.2795806290564154, "grad_norm": 67.12702941894531, "learning_rate": 0.00018174955595026644, "loss": 1.4374, "step": 420 }, { "epoch": 0.28024629722083544, "grad_norm": 33.8336181640625, "learning_rate": 0.00018170515097690944, "loss": 0.1509, "step": 421 }, { "epoch": 0.2809119653852554, "grad_norm": 66.7548599243164, "learning_rate": 0.0001816607460035524, "loss": 0.2623, "step": 422 }, { "epoch": 0.28157763354967547, "grad_norm": 57.463253021240234, "learning_rate": 0.00018161634103019538, "loss": 0.3713, "step": 423 }, { "epoch": 0.2822433017140955, "grad_norm": 32.50680923461914, "learning_rate": 0.00018157193605683838, "loss": 0.1945, "step": 424 }, { "epoch": 0.28290896987851555, "grad_norm": 72.0036849975586, "learning_rate": 0.00018152753108348135, "loss": 0.3324, "step": 425 }, { "epoch": 0.2835746380429356, "grad_norm": 38.52781295776367, "learning_rate": 0.00018148312611012435, "loss": 0.185, 
"step": 426 }, { "epoch": 0.28424030620735563, "grad_norm": 74.13616943359375, "learning_rate": 0.00018143872113676732, "loss": 0.2684, "step": 427 }, { "epoch": 0.28490597437177567, "grad_norm": 76.92218017578125, "learning_rate": 0.00018139431616341032, "loss": 0.1456, "step": 428 }, { "epoch": 0.2855716425361957, "grad_norm": 131.1004638671875, "learning_rate": 0.0001813499111900533, "loss": 0.6252, "step": 429 }, { "epoch": 0.28623731070061575, "grad_norm": 112.32926940917969, "learning_rate": 0.0001813055062166963, "loss": 0.8337, "step": 430 }, { "epoch": 0.2869029788650358, "grad_norm": 125.7912368774414, "learning_rate": 0.00018126110124333926, "loss": 0.3741, "step": 431 }, { "epoch": 0.28756864702945584, "grad_norm": 164.3785858154297, "learning_rate": 0.00018121669626998226, "loss": 0.9616, "step": 432 }, { "epoch": 0.2882343151938759, "grad_norm": 191.42367553710938, "learning_rate": 0.00018117229129662523, "loss": 0.8257, "step": 433 }, { "epoch": 0.28889998335829586, "grad_norm": 79.08208465576172, "learning_rate": 0.00018112788632326823, "loss": 0.5202, "step": 434 }, { "epoch": 0.2895656515227159, "grad_norm": 109.46979522705078, "learning_rate": 0.0001810834813499112, "loss": 0.941, "step": 435 }, { "epoch": 0.29023131968713595, "grad_norm": 74.53866577148438, "learning_rate": 0.00018103907637655417, "loss": 0.7279, "step": 436 }, { "epoch": 0.290896987851556, "grad_norm": 137.80848693847656, "learning_rate": 0.00018099467140319717, "loss": 1.4923, "step": 437 }, { "epoch": 0.29156265601597603, "grad_norm": 126.38627624511719, "learning_rate": 0.00018095026642984014, "loss": 0.721, "step": 438 }, { "epoch": 0.29222832418039607, "grad_norm": 98.57434844970703, "learning_rate": 0.00018090586145648314, "loss": 0.5846, "step": 439 }, { "epoch": 0.2928939923448161, "grad_norm": 33.60892868041992, "learning_rate": 0.0001808614564831261, "loss": 0.4007, "step": 440 }, { "epoch": 0.29355966050923615, "grad_norm": 84.4782943725586, "learning_rate": 
0.0001808170515097691, "loss": 0.6742, "step": 441 }, { "epoch": 0.2942253286736562, "grad_norm": 78.77970886230469, "learning_rate": 0.00018077264653641208, "loss": 0.5823, "step": 442 }, { "epoch": 0.29489099683807624, "grad_norm": 157.4933319091797, "learning_rate": 0.00018072824156305508, "loss": 0.7795, "step": 443 }, { "epoch": 0.2955566650024963, "grad_norm": 18.716060638427734, "learning_rate": 0.00018068383658969805, "loss": 0.3715, "step": 444 }, { "epoch": 0.2962223331669163, "grad_norm": 85.15080261230469, "learning_rate": 0.00018063943161634105, "loss": 0.606, "step": 445 }, { "epoch": 0.2968880013313363, "grad_norm": 83.07491302490234, "learning_rate": 0.00018059502664298402, "loss": 0.4614, "step": 446 }, { "epoch": 0.29755366949575635, "grad_norm": 52.511085510253906, "learning_rate": 0.000180550621669627, "loss": 0.4219, "step": 447 }, { "epoch": 0.2982193376601764, "grad_norm": 126.2627944946289, "learning_rate": 0.00018050621669627, "loss": 0.6889, "step": 448 }, { "epoch": 0.29888500582459643, "grad_norm": 73.53312683105469, "learning_rate": 0.00018046181172291296, "loss": 0.577, "step": 449 }, { "epoch": 0.29955067398901647, "grad_norm": 103.77356719970703, "learning_rate": 0.00018041740674955596, "loss": 0.7924, "step": 450 }, { "epoch": 0.3002163421534365, "grad_norm": 89.16803741455078, "learning_rate": 0.00018037300177619893, "loss": 0.5117, "step": 451 }, { "epoch": 0.30088201031785655, "grad_norm": 116.20187377929688, "learning_rate": 0.00018032859680284193, "loss": 0.6893, "step": 452 }, { "epoch": 0.3015476784822766, "grad_norm": 130.5649871826172, "learning_rate": 0.0001802841918294849, "loss": 0.6782, "step": 453 }, { "epoch": 0.30221334664669663, "grad_norm": 71.50609588623047, "learning_rate": 0.0001802397868561279, "loss": 0.4608, "step": 454 }, { "epoch": 0.3028790148111167, "grad_norm": 101.877197265625, "learning_rate": 0.00018019538188277087, "loss": 0.5512, "step": 455 }, { "epoch": 0.3035446829755367, "grad_norm": 
135.80545043945312, "learning_rate": 0.00018015097690941387, "loss": 0.6684, "step": 456 }, { "epoch": 0.30421035113995676, "grad_norm": 83.38865661621094, "learning_rate": 0.00018010657193605684, "loss": 0.4582, "step": 457 }, { "epoch": 0.30487601930437674, "grad_norm": 93.57701110839844, "learning_rate": 0.00018006216696269984, "loss": 0.7277, "step": 458 }, { "epoch": 0.3055416874687968, "grad_norm": 72.99111938476562, "learning_rate": 0.0001800177619893428, "loss": 0.5737, "step": 459 }, { "epoch": 0.3062073556332168, "grad_norm": 64.9205551147461, "learning_rate": 0.00017997335701598578, "loss": 0.3831, "step": 460 }, { "epoch": 0.30687302379763687, "grad_norm": 36.749935150146484, "learning_rate": 0.00017992895204262878, "loss": 0.4547, "step": 461 }, { "epoch": 0.3075386919620569, "grad_norm": 77.0653076171875, "learning_rate": 0.00017988454706927175, "loss": 0.8629, "step": 462 }, { "epoch": 0.30820436012647695, "grad_norm": 48.22278594970703, "learning_rate": 0.00017984014209591475, "loss": 0.3634, "step": 463 }, { "epoch": 0.308870028290897, "grad_norm": 39.0225715637207, "learning_rate": 0.00017979573712255772, "loss": 0.317, "step": 464 }, { "epoch": 0.30953569645531703, "grad_norm": 68.747314453125, "learning_rate": 0.00017975133214920072, "loss": 0.3624, "step": 465 }, { "epoch": 0.3102013646197371, "grad_norm": 71.55371856689453, "learning_rate": 0.0001797069271758437, "loss": 0.3992, "step": 466 }, { "epoch": 0.3108670327841571, "grad_norm": 36.87495422363281, "learning_rate": 0.00017966252220248669, "loss": 0.2098, "step": 467 }, { "epoch": 0.31153270094857716, "grad_norm": 10.664894104003906, "learning_rate": 0.00017961811722912966, "loss": 0.0542, "step": 468 }, { "epoch": 0.3121983691129972, "grad_norm": 34.32514953613281, "learning_rate": 0.00017957371225577265, "loss": 0.3147, "step": 469 }, { "epoch": 0.3128640372774172, "grad_norm": 24.026748657226562, "learning_rate": 0.00017952930728241563, "loss": 0.0797, "step": 470 }, { "epoch": 
0.3135297054418372, "grad_norm": 23.764530181884766, "learning_rate": 0.00017948490230905862, "loss": 0.0965, "step": 471 }, { "epoch": 0.31419537360625727, "grad_norm": 5.237178325653076, "learning_rate": 0.0001794404973357016, "loss": 0.0106, "step": 472 }, { "epoch": 0.3148610417706773, "grad_norm": 20.264196395874023, "learning_rate": 0.0001793960923623446, "loss": 0.0546, "step": 473 }, { "epoch": 0.31552670993509735, "grad_norm": 79.56272888183594, "learning_rate": 0.00017935168738898757, "loss": 0.5564, "step": 474 }, { "epoch": 0.3161923780995174, "grad_norm": 37.59955596923828, "learning_rate": 0.00017930728241563056, "loss": 0.3443, "step": 475 }, { "epoch": 0.31685804626393743, "grad_norm": 49.22467803955078, "learning_rate": 0.00017926287744227354, "loss": 0.3749, "step": 476 }, { "epoch": 0.3175237144283575, "grad_norm": 175.10850524902344, "learning_rate": 0.00017921847246891653, "loss": 1.7856, "step": 477 }, { "epoch": 0.3181893825927775, "grad_norm": 47.71009826660156, "learning_rate": 0.0001791740674955595, "loss": 0.7082, "step": 478 }, { "epoch": 0.31885505075719756, "grad_norm": 82.27182006835938, "learning_rate": 0.0001791296625222025, "loss": 0.9351, "step": 479 }, { "epoch": 0.3195207189216176, "grad_norm": 79.33952331542969, "learning_rate": 0.00017908525754884547, "loss": 0.3707, "step": 480 }, { "epoch": 0.3201863870860376, "grad_norm": 88.69058227539062, "learning_rate": 0.00017904085257548847, "loss": 0.85, "step": 481 }, { "epoch": 0.3208520552504576, "grad_norm": 110.95427703857422, "learning_rate": 0.00017899644760213144, "loss": 0.3507, "step": 482 }, { "epoch": 0.32151772341487767, "grad_norm": 101.72648620605469, "learning_rate": 0.00017895204262877444, "loss": 0.8123, "step": 483 }, { "epoch": 0.3221833915792977, "grad_norm": 172.9845733642578, "learning_rate": 0.00017890763765541741, "loss": 1.1193, "step": 484 }, { "epoch": 0.32284905974371775, "grad_norm": 40.887081146240234, "learning_rate": 0.0001788632326820604, "loss": 
0.1791, "step": 485 }, { "epoch": 0.3235147279081378, "grad_norm": 89.45999145507812, "learning_rate": 0.00017881882770870338, "loss": 0.6104, "step": 486 }, { "epoch": 0.32418039607255783, "grad_norm": 101.5306167602539, "learning_rate": 0.00017877442273534638, "loss": 0.5197, "step": 487 }, { "epoch": 0.32484606423697787, "grad_norm": 25.401044845581055, "learning_rate": 0.00017873001776198935, "loss": 0.2328, "step": 488 }, { "epoch": 0.3255117324013979, "grad_norm": 122.61588287353516, "learning_rate": 0.00017868561278863235, "loss": 0.9357, "step": 489 }, { "epoch": 0.32617740056581795, "grad_norm": 51.50986099243164, "learning_rate": 0.00017864120781527532, "loss": 0.506, "step": 490 }, { "epoch": 0.326843068730238, "grad_norm": 64.19971466064453, "learning_rate": 0.00017859680284191832, "loss": 0.2713, "step": 491 }, { "epoch": 0.32750873689465804, "grad_norm": 19.61275863647461, "learning_rate": 0.0001785523978685613, "loss": 0.1013, "step": 492 }, { "epoch": 0.328174405059078, "grad_norm": 66.36785125732422, "learning_rate": 0.0001785079928952043, "loss": 0.3726, "step": 493 }, { "epoch": 0.32884007322349806, "grad_norm": 87.43013763427734, "learning_rate": 0.00017846358792184726, "loss": 0.4851, "step": 494 }, { "epoch": 0.3295057413879181, "grad_norm": 63.96004867553711, "learning_rate": 0.00017841918294849026, "loss": 0.2111, "step": 495 }, { "epoch": 0.33017140955233815, "grad_norm": 81.01036834716797, "learning_rate": 0.00017837477797513323, "loss": 0.6494, "step": 496 }, { "epoch": 0.3308370777167582, "grad_norm": 63.75148010253906, "learning_rate": 0.0001783303730017762, "loss": 0.3041, "step": 497 }, { "epoch": 0.33150274588117823, "grad_norm": 91.487060546875, "learning_rate": 0.0001782859680284192, "loss": 0.689, "step": 498 }, { "epoch": 0.33216841404559827, "grad_norm": 58.02032470703125, "learning_rate": 0.00017824156305506217, "loss": 0.5346, "step": 499 }, { "epoch": 0.3328340822100183, "grad_norm": 73.02722930908203, "learning_rate": 
0.00017819715808170517, "loss": 0.541, "step": 500 } ], "logging_steps": 1, "max_steps": 4506, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.22820987830272e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }