{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200, "global_step": 2666, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007501875468867217, "grad_norm": 0.06649772077798843, "learning_rate": 3.7453183520599253e-07, "loss": 0.1491, "step": 1 }, { "epoch": 0.0015003750937734434, "grad_norm": 0.1297842562198639, "learning_rate": 7.490636704119851e-07, "loss": 0.2357, "step": 2 }, { "epoch": 0.002250562640660165, "grad_norm": 0.0687314048409462, "learning_rate": 1.1235955056179775e-06, "loss": 0.114, "step": 3 }, { "epoch": 0.003000750187546887, "grad_norm": 0.1251819133758545, "learning_rate": 1.4981273408239701e-06, "loss": 0.2133, "step": 4 }, { "epoch": 0.0037509377344336083, "grad_norm": 0.060200925916433334, "learning_rate": 1.8726591760299627e-06, "loss": 0.113, "step": 5 }, { "epoch": 0.00450112528132033, "grad_norm": 0.12672510743141174, "learning_rate": 2.247191011235955e-06, "loss": 0.2232, "step": 6 }, { "epoch": 0.005251312828207052, "grad_norm": 0.10037848353385925, "learning_rate": 2.621722846441948e-06, "loss": 0.1297, "step": 7 }, { "epoch": 0.006001500375093774, "grad_norm": 0.07913671433925629, "learning_rate": 2.9962546816479402e-06, "loss": 0.1174, "step": 8 }, { "epoch": 0.006751687921980495, "grad_norm": 0.08461928367614746, "learning_rate": 3.3707865168539327e-06, "loss": 0.131, "step": 9 }, { "epoch": 0.007501875468867217, "grad_norm": 0.09129065275192261, "learning_rate": 3.7453183520599255e-06, "loss": 0.1834, "step": 10 }, { "epoch": 0.008252063015753939, "grad_norm": 0.0547911673784256, "learning_rate": 4.1198501872659175e-06, "loss": 0.078, "step": 11 }, { "epoch": 0.00900225056264066, "grad_norm": 0.11671412736177444, "learning_rate": 4.49438202247191e-06, "loss": 0.1937, "step": 12 }, { "epoch": 0.009752438109527382, "grad_norm": 0.11381004750728607, "learning_rate": 4.868913857677903e-06, "loss": 0.1882, "step": 13 }, { "epoch": 0.010502625656414103, "grad_norm": 0.4827083945274353, "learning_rate": 5.243445692883896e-06, "loss": 0.3005, "step": 14 }, { "epoch": 0.011252813203300824, "grad_norm": 0.0948605015873909, "learning_rate": 5.617977528089888e-06, "loss": 0.1403, "step": 15 }, { "epoch": 0.012003000750187547, "grad_norm": 0.17905683815479279, "learning_rate": 5.9925093632958805e-06, "loss": 0.1826, "step": 16 }, { "epoch": 0.012753188297074268, "grad_norm": 0.04930970072746277, "learning_rate": 6.367041198501873e-06, "loss": 0.0794, "step": 17 }, { "epoch": 0.01350337584396099, "grad_norm": 0.05358840152621269, "learning_rate": 6.741573033707865e-06, "loss": 0.0867, "step": 18 }, { "epoch": 0.014253563390847712, "grad_norm": 0.058819886296987534, "learning_rate": 7.116104868913858e-06, "loss": 0.1035, "step": 19 }, { "epoch": 0.015003750937734433, "grad_norm": 0.31954073905944824, "learning_rate": 7.490636704119851e-06, "loss": 0.2974, "step": 20 }, { "epoch": 0.015753938484621154, "grad_norm": 0.12035220861434937, "learning_rate": 7.865168539325843e-06, "loss": 0.142, "step": 21 }, { "epoch": 0.016504126031507877, "grad_norm": 0.1415967345237732, "learning_rate": 8.239700374531835e-06, "loss": 0.2324, "step": 22 }, { "epoch": 0.0172543135783946, "grad_norm": 0.11801585555076599, "learning_rate": 8.614232209737828e-06, "loss": 0.1633, "step": 23 }, { "epoch": 0.01800450112528132, "grad_norm": 0.1823718100786209, "learning_rate": 8.98876404494382e-06, "loss": 0.1321, "step": 24 }, { "epoch": 0.018754688672168042, "grad_norm": 0.16873396933078766, "learning_rate": 9.363295880149813e-06, "loss": 0.1453, "step": 25 }, { "epoch": 0.019504876219054765, "grad_norm": 0.12931793928146362, "learning_rate": 9.737827715355806e-06, "loss": 0.169, "step": 26 }, { "epoch": 0.020255063765941484, "grad_norm": 0.13040228188037872, "learning_rate": 1.0112359550561798e-05, "loss": 0.1103, "step": 27 }, { "epoch": 0.021005251312828207, "grad_norm": 0.21055294573307037, "learning_rate": 1.0486891385767791e-05, "loss": 0.2282, "step": 28 }, { "epoch": 0.02175543885971493, "grad_norm": 0.09372907131910324, "learning_rate": 1.0861423220973783e-05, "loss": 0.1398, "step": 29 }, { "epoch": 0.02250562640660165, "grad_norm": 0.15932485461235046, "learning_rate": 1.1235955056179776e-05, "loss": 0.1226, "step": 30 }, { "epoch": 0.023255813953488372, "grad_norm": 0.09856501966714859, "learning_rate": 1.161048689138577e-05, "loss": 0.1182, "step": 31 }, { "epoch": 0.024006001500375095, "grad_norm": 0.0559878945350647, "learning_rate": 1.1985018726591761e-05, "loss": 0.0681, "step": 32 }, { "epoch": 0.024756189047261814, "grad_norm": 0.1646910458803177, "learning_rate": 1.2359550561797752e-05, "loss": 0.1503, "step": 33 }, { "epoch": 0.025506376594148537, "grad_norm": 0.09920473396778107, "learning_rate": 1.2734082397003746e-05, "loss": 0.1493, "step": 34 }, { "epoch": 0.02625656414103526, "grad_norm": 0.21710704267024994, "learning_rate": 1.3108614232209737e-05, "loss": 0.2232, "step": 35 }, { "epoch": 0.02700675168792198, "grad_norm": 0.06066899746656418, "learning_rate": 1.348314606741573e-05, "loss": 0.0856, "step": 36 }, { "epoch": 0.0277569392348087, "grad_norm": 0.12835277616977692, "learning_rate": 1.3857677902621724e-05, "loss": 0.146, "step": 37 }, { "epoch": 0.028507126781695424, "grad_norm": 0.10923058539628983, "learning_rate": 1.4232209737827715e-05, "loss": 0.1094, "step": 38 }, { "epoch": 0.029257314328582147, "grad_norm": 0.24330253899097443, "learning_rate": 1.4606741573033709e-05, "loss": 0.2436, "step": 39 }, { "epoch": 0.030007501875468866, "grad_norm": 0.10736766457557678, "learning_rate": 1.4981273408239702e-05, "loss": 0.1085, "step": 40 }, { "epoch": 0.03075768942235559, "grad_norm": 0.05089711770415306, "learning_rate": 1.5355805243445692e-05, "loss": 0.0871, "step": 41 }, { "epoch": 0.03150787696924231, "grad_norm": 0.13075260818004608, "learning_rate": 1.5730337078651687e-05, "loss": 0.1558, "step": 42 }, { "epoch": 0.03225806451612903, "grad_norm": 0.09256450831890106, "learning_rate": 1.610486891385768e-05, "loss": 0.1214, "step": 43 }, { "epoch": 0.033008252063015754, "grad_norm": 0.13702747225761414, "learning_rate": 1.647940074906367e-05, "loss": 0.1186, "step": 44 }, { "epoch": 0.03375843960990248, "grad_norm": 0.1761491447687149, "learning_rate": 1.6853932584269665e-05, "loss": 0.1627, "step": 45 }, { "epoch": 0.0345086271567892, "grad_norm": 0.12984786927700043, "learning_rate": 1.7228464419475657e-05, "loss": 0.1335, "step": 46 }, { "epoch": 0.035258814703675916, "grad_norm": 0.09949901700019836, "learning_rate": 1.760299625468165e-05, "loss": 0.1188, "step": 47 }, { "epoch": 0.03600900225056264, "grad_norm": 0.2773364782333374, "learning_rate": 1.797752808988764e-05, "loss": 0.2703, "step": 48 }, { "epoch": 0.03675918979744936, "grad_norm": 0.3009333610534668, "learning_rate": 1.8352059925093635e-05, "loss": 0.2168, "step": 49 }, { "epoch": 0.037509377344336084, "grad_norm": 0.08068252354860306, "learning_rate": 1.8726591760299626e-05, "loss": 0.1113, "step": 50 }, { "epoch": 0.03825956489122281, "grad_norm": 0.08510925620794296, "learning_rate": 1.9101123595505618e-05, "loss": 0.0675, "step": 51 }, { "epoch": 0.03900975243810953, "grad_norm": 0.16944704949855804, "learning_rate": 1.9475655430711613e-05, "loss": 0.1452, "step": 52 }, { "epoch": 0.03975993998499625, "grad_norm": 0.21001164615154266, "learning_rate": 1.9850187265917604e-05, "loss": 0.2077, "step": 53 }, { "epoch": 0.04051012753188297, "grad_norm": 0.21587851643562317, "learning_rate": 2.0224719101123596e-05, "loss": 0.14, "step": 54 }, { "epoch": 0.04126031507876969, "grad_norm": 0.19861550629138947, "learning_rate": 2.059925093632959e-05, "loss": 0.1491, "step": 55 }, { "epoch": 0.042010502625656414, "grad_norm": 0.05874146148562431, "learning_rate": 2.0973782771535582e-05, "loss": 0.0653, "step": 56 }, { "epoch": 0.04276069017254314, "grad_norm": 0.2626660466194153, "learning_rate": 2.1348314606741574e-05, "loss": 0.1116, "step": 57 }, { "epoch": 0.04351087771942986, "grad_norm": 0.3618119955062866, "learning_rate": 2.1722846441947566e-05, "loss": 0.2157, "step": 58 }, { "epoch": 0.04426106526631658, "grad_norm": 0.12022845447063446, "learning_rate": 2.209737827715356e-05, "loss": 0.0955, "step": 59 }, { "epoch": 0.0450112528132033, "grad_norm": 0.07745017111301422, "learning_rate": 2.2471910112359552e-05, "loss": 0.0777, "step": 60 }, { "epoch": 0.04576144036009002, "grad_norm": 0.12869754433631897, "learning_rate": 2.2846441947565544e-05, "loss": 0.1211, "step": 61 }, { "epoch": 0.046511627906976744, "grad_norm": 0.13848523795604706, "learning_rate": 2.322097378277154e-05, "loss": 0.0762, "step": 62 }, { "epoch": 0.047261815453863466, "grad_norm": 0.1735474169254303, "learning_rate": 2.359550561797753e-05, "loss": 0.1488, "step": 63 }, { "epoch": 0.04801200300075019, "grad_norm": 0.1833038032054901, "learning_rate": 2.3970037453183522e-05, "loss": 0.1407, "step": 64 }, { "epoch": 0.04876219054763691, "grad_norm": 0.08675671368837357, "learning_rate": 2.4344569288389517e-05, "loss": 0.0663, "step": 65 }, { "epoch": 0.04951237809452363, "grad_norm": 0.4732707738876343, "learning_rate": 2.4719101123595505e-05, "loss": 0.0857, "step": 66 }, { "epoch": 0.05026256564141035, "grad_norm": 0.07791633903980255, "learning_rate": 2.50936329588015e-05, "loss": 0.0585, "step": 67 }, { "epoch": 0.05101275318829707, "grad_norm": 0.11928839981555939, "learning_rate": 2.546816479400749e-05, "loss": 0.0748, "step": 68 }, { "epoch": 0.051762940735183796, "grad_norm": 0.089127317070961, "learning_rate": 2.5842696629213486e-05, "loss": 0.09, "step": 69 }, { "epoch": 0.05251312828207052, "grad_norm": 0.10768590867519379, "learning_rate": 2.6217228464419475e-05, "loss": 0.0804, "step": 70 }, { "epoch": 0.05326331582895724, "grad_norm": 0.21062488853931427, "learning_rate": 2.6591760299625466e-05, "loss": 0.1271, "step": 71 }, { "epoch": 0.05401350337584396, "grad_norm": 0.11834652721881866, "learning_rate": 2.696629213483146e-05, "loss": 0.0821, "step": 72 }, { "epoch": 0.05476369092273068, "grad_norm": 0.09663186967372894, "learning_rate": 2.7340823970037456e-05, "loss": 0.0593, "step": 73 }, { "epoch": 0.0555138784696174, "grad_norm": 0.22409404814243317, "learning_rate": 2.7715355805243448e-05, "loss": 0.1147, "step": 74 }, { "epoch": 0.056264066016504126, "grad_norm": 0.13819625973701477, "learning_rate": 2.8089887640449443e-05, "loss": 0.1101, "step": 75 }, { "epoch": 0.05701425356339085, "grad_norm": 0.25516584515571594, "learning_rate": 2.846441947565543e-05, "loss": 0.1276, "step": 76 }, { "epoch": 0.05776444111027757, "grad_norm": 0.0935330018401146, "learning_rate": 2.8838951310861422e-05, "loss": 0.0749, "step": 77 }, { "epoch": 0.058514628657164294, "grad_norm": 0.11693164706230164, "learning_rate": 2.9213483146067417e-05, "loss": 0.0756, "step": 78 }, { "epoch": 0.05926481620405101, "grad_norm": 0.1455622911453247, "learning_rate": 2.958801498127341e-05, "loss": 0.0847, "step": 79 }, { "epoch": 0.06001500375093773, "grad_norm": 0.09133189916610718, "learning_rate": 2.9962546816479404e-05, "loss": 0.056, "step": 80 }, { "epoch": 0.060765191297824456, "grad_norm": 0.12661296129226685, "learning_rate": 3.0337078651685396e-05, "loss": 0.0823, "step": 81 }, { "epoch": 0.06151537884471118, "grad_norm": 0.13131161034107208, "learning_rate": 3.0711610486891384e-05, "loss": 0.0772, "step": 82 }, { "epoch": 0.0622655663915979, "grad_norm": 0.2745862305164337, "learning_rate": 3.108614232209738e-05, "loss": 0.1155, "step": 83 }, { "epoch": 0.06301575393848462, "grad_norm": 0.11527538299560547, "learning_rate": 3.1460674157303374e-05, "loss": 0.0655, "step": 84 }, { "epoch": 0.06376594148537135, "grad_norm": 0.12666447460651398, "learning_rate": 3.183520599250936e-05, "loss": 0.0656, "step": 85 }, { "epoch": 0.06451612903225806, "grad_norm": 0.09897023439407349, "learning_rate": 3.220973782771536e-05, "loss": 0.0659, "step": 86 }, { "epoch": 0.06526631657914479, "grad_norm": 0.15928857028484344, "learning_rate": 3.258426966292135e-05, "loss": 0.0857, "step": 87 }, { "epoch": 0.06601650412603151, "grad_norm": 0.14418122172355652, "learning_rate": 3.295880149812734e-05, "loss": 0.0903, "step": 88 }, { "epoch": 0.06676669167291822, "grad_norm": 0.15921813249588013, "learning_rate": 3.3333333333333335e-05, "loss": 0.0811, "step": 89 }, { "epoch": 0.06751687921980495, "grad_norm": 0.18934333324432373, "learning_rate": 3.370786516853933e-05, "loss": 0.0836, "step": 90 }, { "epoch": 0.06826706676669167, "grad_norm": 0.10202574729919434, "learning_rate": 3.408239700374532e-05, "loss": 0.0464, "step": 91 }, { "epoch": 0.0690172543135784, "grad_norm": 0.12126179784536362, "learning_rate": 3.445692883895131e-05, "loss": 0.052, "step": 92 }, { "epoch": 0.06976744186046512, "grad_norm": 0.06997974961996078, "learning_rate": 3.483146067415731e-05, "loss": 0.0423, "step": 93 }, { "epoch": 0.07051762940735183, "grad_norm": 0.1367831528186798, "learning_rate": 3.52059925093633e-05, "loss": 0.054, "step": 94 }, { "epoch": 0.07126781695423856, "grad_norm": 0.1674944907426834, "learning_rate": 3.558052434456929e-05, "loss": 0.0635, "step": 95 }, { "epoch": 0.07201800450112528, "grad_norm": 0.1916777491569519, "learning_rate": 3.595505617977528e-05, "loss": 0.0476, "step": 96 }, { "epoch": 0.072768192048012, "grad_norm": 0.5634555220603943, "learning_rate": 3.6329588014981274e-05, "loss": 0.0509, "step": 97 }, { "epoch": 0.07351837959489872, "grad_norm": 0.24589844048023224, "learning_rate": 3.670411985018727e-05, "loss": 0.0702, "step": 98 }, { "epoch": 0.07426856714178545, "grad_norm": 0.19461844861507416, "learning_rate": 3.7078651685393264e-05, "loss": 0.0684, "step": 99 }, { "epoch": 0.07501875468867217, "grad_norm": 0.17358846962451935, "learning_rate": 3.745318352059925e-05, "loss": 0.072, "step": 100 }, { "epoch": 0.07576894223555888, "grad_norm": 0.1436762511730194, "learning_rate": 3.782771535580524e-05, "loss": 0.0573, "step": 101 }, { "epoch": 0.07651912978244561, "grad_norm": 0.17795026302337646, "learning_rate": 3.8202247191011236e-05, "loss": 0.0662, "step": 102 }, { "epoch": 0.07726931732933233, "grad_norm": 0.1983191967010498, "learning_rate": 3.857677902621723e-05, "loss": 0.0559, "step": 103 }, { "epoch": 0.07801950487621906, "grad_norm": 0.24022872745990753, "learning_rate": 3.8951310861423226e-05, "loss": 0.0724, "step": 104 }, { "epoch": 0.07876969242310577, "grad_norm": 0.16081267595291138, "learning_rate": 3.9325842696629214e-05, "loss": 0.0762, "step": 105 }, { "epoch": 0.0795198799699925, "grad_norm": 0.31475764513015747, "learning_rate": 3.970037453183521e-05, "loss": 0.0581, "step": 106 }, { "epoch": 0.08027006751687922, "grad_norm": 0.23797044157981873, "learning_rate": 4.00749063670412e-05, "loss": 0.0546, "step": 107 }, { "epoch": 0.08102025506376594, "grad_norm": 0.18666911125183105, "learning_rate": 4.044943820224719e-05, "loss": 0.0604, "step": 108 }, { "epoch": 0.08177044261065267, "grad_norm": 0.10255415737628937, "learning_rate": 4.082397003745319e-05, "loss": 0.0404, "step": 109 }, { "epoch": 0.08252063015753938, "grad_norm": 0.2886727750301361, "learning_rate": 4.119850187265918e-05, "loss": 0.058, "step": 110 }, { "epoch": 0.08327081770442611, "grad_norm": 0.1744941771030426, "learning_rate": 4.157303370786517e-05, "loss": 0.0696, "step": 111 }, { "epoch": 0.08402100525131283, "grad_norm": 0.2490309327840805, "learning_rate": 4.1947565543071165e-05, "loss": 0.0479, "step": 112 }, { "epoch": 0.08477119279819954, "grad_norm": 0.2167392075061798, "learning_rate": 4.232209737827715e-05, "loss": 0.0539, "step": 113 }, { "epoch": 0.08552138034508627, "grad_norm": 0.16126973927021027, "learning_rate": 4.269662921348315e-05, "loss": 0.044, "step": 114 }, { "epoch": 0.08627156789197299, "grad_norm": 0.17505522072315216, "learning_rate": 4.307116104868914e-05, "loss": 0.0548, "step": 115 }, { "epoch": 0.08702175543885972, "grad_norm": 0.27888697385787964, "learning_rate": 4.344569288389513e-05, "loss": 0.0681, "step": 116 }, { "epoch": 0.08777194298574643, "grad_norm": 0.13575726747512817, "learning_rate": 4.3820224719101126e-05, "loss": 0.0385, "step": 117 }, { "epoch": 0.08852213053263316, "grad_norm": 0.143665611743927, "learning_rate": 4.419475655430712e-05, "loss": 0.0442, "step": 118 }, { "epoch": 0.08927231807951988, "grad_norm": 0.3198046386241913, "learning_rate": 4.456928838951311e-05, "loss": 0.0796, "step": 119 }, { "epoch": 0.0900225056264066, "grad_norm": 0.11711857467889786, "learning_rate": 4.4943820224719104e-05, "loss": 0.0325, "step": 120 }, { "epoch": 0.09077269317329333, "grad_norm": 0.19504939019680023, "learning_rate": 4.531835205992509e-05, "loss": 0.0454, "step": 121 }, { "epoch": 0.09152288072018004, "grad_norm": 0.16343529522418976, "learning_rate": 4.569288389513109e-05, "loss": 0.0423, "step": 122 }, { "epoch": 0.09227306826706677, "grad_norm": 0.0722413882613182, "learning_rate": 4.606741573033708e-05, "loss": 0.0364, "step": 123 }, { "epoch": 0.09302325581395349, "grad_norm": 0.20592856407165527, "learning_rate": 4.644194756554308e-05, "loss": 0.0628, "step": 124 }, { "epoch": 0.09377344336084022, "grad_norm": 0.19177493453025818, "learning_rate": 4.6816479400749066e-05, "loss": 0.0543, "step": 125 }, { "epoch": 0.09452363090772693, "grad_norm": 0.24182191491127014, "learning_rate": 4.719101123595506e-05, "loss": 0.0493, "step": 126 }, { "epoch": 0.09527381845461365, "grad_norm": 0.18809586763381958, "learning_rate": 4.756554307116105e-05, "loss": 0.0531, "step": 127 }, { "epoch": 0.09602400600150038, "grad_norm": 0.16542655229568481, "learning_rate": 4.7940074906367044e-05, "loss": 0.0345, "step": 128 }, { "epoch": 0.0967741935483871, "grad_norm": 0.12689034640789032, "learning_rate": 4.831460674157304e-05, "loss": 0.0255, "step": 129 }, { "epoch": 0.09752438109527382, "grad_norm": 0.14506380259990692, "learning_rate": 4.8689138576779034e-05, "loss": 0.0432, "step": 130 }, { "epoch": 0.09827456864216054, "grad_norm": 0.24236801266670227, "learning_rate": 4.906367041198502e-05, "loss": 0.0869, "step": 131 }, { "epoch": 0.09902475618904726, "grad_norm": 0.23130767047405243, "learning_rate": 4.943820224719101e-05, "loss": 0.0498, "step": 132 }, { "epoch": 0.09977494373593399, "grad_norm": 0.06554227322340012, "learning_rate": 4.9812734082397005e-05, "loss": 0.0256, "step": 133 }, { "epoch": 0.1005251312828207, "grad_norm": 0.14324985444545746, "learning_rate": 5.0187265917603e-05, "loss": 0.0467, "step": 134 }, { "epoch": 0.10127531882970743, "grad_norm": 0.37844982743263245, "learning_rate": 5.0561797752808995e-05, "loss": 0.0656, "step": 135 }, { "epoch": 0.10202550637659415, "grad_norm": 0.20668180286884308, "learning_rate": 5.093632958801498e-05, "loss": 0.0444, "step": 136 }, { "epoch": 0.10277569392348088, "grad_norm": 0.10723793506622314, "learning_rate": 5.131086142322098e-05, "loss": 0.0214, "step": 137 }, { "epoch": 0.10352588147036759, "grad_norm": 0.19435639679431915, "learning_rate": 5.168539325842697e-05, "loss": 0.0547, "step": 138 }, { "epoch": 0.10427606901725431, "grad_norm": 0.19808021187782288, "learning_rate": 5.205992509363297e-05, "loss": 0.0372, "step": 139 }, { "epoch": 0.10502625656414104, "grad_norm": 0.17958632111549377, "learning_rate": 5.243445692883895e-05, "loss": 0.0416, "step": 140 }, { "epoch": 0.10577644411102775, "grad_norm": 0.09122506529092789, "learning_rate": 5.2808988764044944e-05, "loss": 0.0324, "step": 141 }, { "epoch": 0.10652663165791448, "grad_norm": 0.13046078383922577, "learning_rate": 5.318352059925093e-05, "loss": 0.0304, "step": 142 }, { "epoch": 0.1072768192048012, "grad_norm": 0.2404167801141739, "learning_rate": 5.355805243445693e-05, "loss": 0.0572, "step": 143 }, { "epoch": 0.10802700675168792, "grad_norm": 0.2153376042842865, "learning_rate": 5.393258426966292e-05, "loss": 0.0443, "step": 144 }, { "epoch": 0.10877719429857464, "grad_norm": 0.2301207333803177, "learning_rate": 5.430711610486892e-05, "loss": 0.0619, "step": 145 }, { "epoch": 0.10952738184546136, "grad_norm": 0.2225511372089386, "learning_rate": 5.468164794007491e-05, "loss": 0.041, "step": 146 }, { "epoch": 0.11027756939234809, "grad_norm": 0.3045608699321747, "learning_rate": 5.50561797752809e-05, "loss": 0.0608, "step": 147 }, { "epoch": 0.1110277569392348, "grad_norm": 0.47876784205436707, "learning_rate": 5.5430711610486895e-05, "loss": 0.1024, "step": 148 }, { "epoch": 0.11177794448612154, "grad_norm": 0.1713646650314331, "learning_rate": 5.580524344569289e-05, "loss": 0.0512, "step": 149 }, { "epoch": 0.11252813203300825, "grad_norm": 0.1879701018333435, "learning_rate": 5.6179775280898885e-05, "loss": 0.0342, "step": 150 }, { "epoch": 0.11327831957989497, "grad_norm": 0.2024771124124527, "learning_rate": 5.6554307116104874e-05, "loss": 0.0421, "step": 151 }, { "epoch": 0.1140285071267817, "grad_norm": 0.18217679858207703, "learning_rate": 5.692883895131086e-05, "loss": 0.0344, "step": 152 }, { "epoch": 0.11477869467366841, "grad_norm": 0.13079820573329926, "learning_rate": 5.730337078651685e-05, "loss": 0.0328, "step": 153 }, { "epoch": 0.11552888222055514, "grad_norm": 0.15242129564285278, "learning_rate": 5.7677902621722845e-05, "loss": 0.0392, "step": 154 }, { "epoch": 0.11627906976744186, "grad_norm": 0.08799269795417786, "learning_rate": 5.805243445692884e-05, "loss": 0.0217, "step": 155 }, { "epoch": 0.11702925731432859, "grad_norm": 0.15637950599193573, "learning_rate": 5.8426966292134835e-05, "loss": 0.0356, "step": 156 }, { "epoch": 0.1177794448612153, "grad_norm": 0.11212990432977676, "learning_rate": 5.880149812734082e-05, "loss": 0.0298, "step": 157 }, { "epoch": 0.11852963240810202, "grad_norm": 0.1721002459526062, "learning_rate": 5.917602996254682e-05, "loss": 0.0511, "step": 158 }, { "epoch": 0.11927981995498875, "grad_norm": 0.23305867612361908, "learning_rate": 5.955056179775281e-05, "loss": 0.0543, "step": 159 }, { "epoch": 0.12003000750187547, "grad_norm": 0.24314911663532257, "learning_rate": 5.992509363295881e-05, "loss": 0.0542, "step": 160 }, { "epoch": 0.1207801950487622, "grad_norm": 0.12653297185897827, "learning_rate": 6.02996254681648e-05, "loss": 0.0361, "step": 161 }, { "epoch": 0.12153038259564891, "grad_norm": 0.1558983474969864, "learning_rate": 6.067415730337079e-05, "loss": 0.0554, "step": 162 }, { "epoch": 0.12228057014253563, "grad_norm": 0.15075267851352692, "learning_rate": 6.104868913857679e-05, "loss": 0.0477, "step": 163 }, { "epoch": 0.12303075768942236, "grad_norm": 0.14883558452129364, "learning_rate": 6.142322097378277e-05, "loss": 0.041, "step": 164 }, { "epoch": 0.12378094523630907, "grad_norm": 0.09038081020116806, "learning_rate": 6.179775280898876e-05, "loss": 0.0316, "step": 165 }, { "epoch": 0.1245311327831958, "grad_norm": 0.07693412154912949, "learning_rate": 6.217228464419476e-05, "loss": 0.0223, "step": 166 }, { "epoch": 0.12528132033008252, "grad_norm": 0.10217053443193436, "learning_rate": 6.254681647940075e-05, "loss": 0.0251, "step": 167 }, { "epoch": 0.12603150787696923, "grad_norm": 0.12066909670829773, "learning_rate": 6.292134831460675e-05, "loss": 0.0319, "step": 168 }, { "epoch": 0.12678169542385595, "grad_norm": 0.15891261398792267, "learning_rate": 6.329588014981274e-05, "loss": 0.0384, "step": 169 }, { "epoch": 0.1275318829707427, "grad_norm": 0.16246218979358673, "learning_rate": 6.367041198501872e-05, "loss": 0.0364, "step": 170 }, { "epoch": 0.1282820705176294, "grad_norm": 0.16500765085220337, "learning_rate": 6.404494382022472e-05, "loss": 0.0316, "step": 171 }, { "epoch": 0.12903225806451613, "grad_norm": 0.15567168593406677, "learning_rate": 6.441947565543071e-05, "loss": 0.0369, "step": 172 }, { "epoch": 0.12978244561140284, "grad_norm": 0.2111051082611084, "learning_rate": 6.479400749063671e-05, "loss": 0.0425, "step": 173 }, { "epoch": 0.13053263315828958, "grad_norm": 0.20215699076652527, "learning_rate": 6.51685393258427e-05, "loss": 0.0392, "step": 174 }, { "epoch": 0.1312828207051763, "grad_norm": 0.1345527321100235, "learning_rate": 6.55430711610487e-05, "loss": 0.0267, "step": 175 }, { "epoch": 0.13203300825206302, "grad_norm": 0.14389894902706146, "learning_rate": 6.591760299625468e-05, "loss": 0.023, "step": 176 }, { "epoch": 0.13278319579894973, "grad_norm": 0.23610113561153412, "learning_rate": 6.629213483146067e-05, "loss": 0.0596, "step": 177 }, { "epoch": 0.13353338334583645, "grad_norm": 0.20613926649093628, "learning_rate": 6.666666666666667e-05, "loss": 0.0343, "step": 178 }, { "epoch": 0.1342835708927232, "grad_norm": 0.28699466586112976, "learning_rate": 6.704119850187266e-05, "loss": 0.0434, "step": 179 }, { "epoch": 0.1350337584396099, "grad_norm": 0.2657482326030731, "learning_rate": 6.741573033707866e-05, "loss": 0.0388, "step": 180 }, { "epoch": 0.13578394598649662, "grad_norm": 0.3304208219051361, "learning_rate": 6.779026217228464e-05, "loss": 0.0564, "step": 181 }, { "epoch": 0.13653413353338334, "grad_norm": 0.20125621557235718, "learning_rate": 6.816479400749064e-05, "loss": 0.031, "step": 182 }, { "epoch": 0.13728432108027006, "grad_norm": 0.15342605113983154, "learning_rate": 6.853932584269663e-05, "loss": 0.0332, "step": 183 }, { "epoch": 0.1380345086271568, "grad_norm": 0.2653309106826782, "learning_rate": 6.891385767790263e-05, "loss": 0.0589, "step": 184 }, { "epoch": 0.13878469617404351, "grad_norm": 0.2295987457036972, "learning_rate": 6.928838951310862e-05, "loss": 0.039, "step": 185 }, { "epoch": 0.13953488372093023, "grad_norm": 0.16181237995624542, "learning_rate": 6.966292134831462e-05, "loss": 0.0294, "step": 186 }, { "epoch": 0.14028507126781695, "grad_norm": 0.08588402718305588, "learning_rate": 7.003745318352061e-05, "loss": 0.0288, "step": 187 }, { "epoch": 0.14103525881470366, "grad_norm": 0.15989354252815247, "learning_rate": 7.04119850187266e-05, "loss": 0.0533, "step": 188 }, { "epoch": 0.1417854463615904, "grad_norm": 0.21237890422344208, "learning_rate": 7.078651685393259e-05, "loss": 0.0543, "step": 189 }, { "epoch": 0.14253563390847712, "grad_norm": 0.23640727996826172, "learning_rate": 7.116104868913858e-05, "loss": 0.0511, "step": 190 }, { "epoch": 0.14328582145536384, "grad_norm": 0.22190897166728973, "learning_rate": 7.153558052434456e-05, "loss": 0.0518, "step": 191 }, { "epoch": 0.14403600900225055, "grad_norm": 0.22951455414295197, "learning_rate": 7.191011235955056e-05, "loss": 0.0448, "step": 192 }, { "epoch": 0.1447861965491373, "grad_norm": 0.15378020703792572, "learning_rate": 7.228464419475655e-05, "loss": 0.0351, "step": 193 }, { "epoch": 0.145536384096024, "grad_norm": 0.21683332324028015, "learning_rate": 7.265917602996255e-05, "loss": 0.0571, "step": 194 }, { "epoch": 0.14628657164291073, "grad_norm": 0.15624549984931946, "learning_rate": 7.303370786516854e-05, "loss": 0.0418, "step": 195 }, { "epoch": 0.14703675918979744, "grad_norm": 0.14242206513881683, "learning_rate": 7.340823970037454e-05, "loss": 0.0407, "step": 196 }, { "epoch": 0.14778694673668416, "grad_norm": 0.10074103623628616, "learning_rate": 7.378277153558053e-05, "loss": 0.0312, "step": 197 }, { "epoch": 0.1485371342835709, "grad_norm": 0.24729660153388977, "learning_rate": 7.415730337078653e-05, "loss": 0.0394, "step": 198 }, { "epoch": 0.14928732183045762, "grad_norm": 0.16826917231082916, "learning_rate": 7.453183520599252e-05, "loss": 0.0345, "step": 199 }, { "epoch": 0.15003750937734434, "grad_norm": 0.2644681930541992, "learning_rate": 7.49063670411985e-05, "loss": 0.0528, "step": 200 }, { "epoch": 0.15003750937734434, "eval_loss": 0.049570418894290924, "eval_runtime": 5.1436, "eval_samples_per_second": 10.498, "eval_steps_per_second": 2.722, "step": 200 }, { "epoch": 0.15078769692423105, "grad_norm": 0.17306017875671387, "learning_rate": 7.52808988764045e-05, "loss": 0.0529, "step": 201 }, { "epoch": 0.15153788447111777, "grad_norm": 0.13111412525177002, "learning_rate": 7.565543071161048e-05, "loss": 0.0404, "step": 202 }, { "epoch": 0.1522880720180045, "grad_norm": 0.12245412915945053, "learning_rate": 7.602996254681648e-05, "loss": 0.0325, "step": 203 }, { "epoch": 0.15303825956489123, "grad_norm": 0.15576554834842682, "learning_rate": 7.640449438202247e-05, "loss": 0.0393, "step": 204 }, { "epoch": 0.15378844711177794, "grad_norm": 0.22586987912654877, "learning_rate": 7.677902621722847e-05, "loss": 0.036, "step": 205 }, { "epoch": 0.15453863465866466, "grad_norm": 0.19062842428684235, "learning_rate": 7.715355805243446e-05, "loss": 0.0454, "step": 206 }, { "epoch": 0.15528882220555137, "grad_norm": 0.18759313225746155, "learning_rate": 7.752808988764046e-05, "loss": 0.0362, "step": 207 }, { "epoch": 0.15603900975243812, "grad_norm": 0.33924275636672974, "learning_rate": 7.790262172284645e-05, "loss": 0.0645, "step": 208 }, { "epoch": 0.15678919729932483, "grad_norm": 0.124433733522892, "learning_rate": 7.827715355805245e-05, "loss": 0.0331, "step": 209 }, { "epoch": 0.15753938484621155, "grad_norm": 0.10698409378528595, "learning_rate": 7.865168539325843e-05, "loss": 0.0341, "step": 210 }, { "epoch": 0.15828957239309827, "grad_norm": 0.1937510073184967, "learning_rate": 7.902621722846442e-05, "loss": 0.0535, "step": 211 }, { "epoch": 0.159039759939985, "grad_norm": 0.25406792759895325, "learning_rate": 7.940074906367042e-05, "loss": 0.0622, "step": 212 }, { "epoch": 0.15978994748687173, "grad_norm": 0.16593150794506073, "learning_rate": 7.97752808988764e-05, "loss": 0.0371, "step": 213 }, { "epoch": 0.16054013503375844, "grad_norm": 0.17848573625087738, "learning_rate": 8.01498127340824e-05, "loss": 0.0414, "step": 214 }, { "epoch": 0.16129032258064516, "grad_norm": 0.1251983791589737, "learning_rate": 8.052434456928839e-05, "loss": 0.0355, "step": 215 }, { "epoch": 0.16204051012753187, "grad_norm": 0.2389572262763977, "learning_rate": 8.089887640449438e-05, "loss": 0.0433, "step": 216 }, { "epoch": 0.16279069767441862, "grad_norm": 0.13656052947044373, "learning_rate": 8.127340823970038e-05, "loss": 0.0394, "step": 217 }, { "epoch": 0.16354088522130533, "grad_norm": 0.1780516654253006, "learning_rate": 8.164794007490637e-05, "loss": 0.0553, "step": 218 }, { "epoch": 0.16429107276819205, "grad_norm": 0.13755282759666443, "learning_rate": 8.202247191011237e-05, "loss": 0.0348, "step": 219 }, { "epoch": 0.16504126031507876, "grad_norm": 0.35480576753616333, "learning_rate": 8.239700374531836e-05, "loss": 0.0675, "step": 220 }, { "epoch": 0.16579144786196548, "grad_norm": 0.13914087414741516, "learning_rate": 8.277153558052434e-05, "loss": 0.0403, "step": 221 }, { "epoch": 0.16654163540885222, "grad_norm": 0.1394328624010086, "learning_rate": 8.314606741573034e-05, "loss": 0.0321, "step": 222 }, { "epoch": 0.16729182295573894, "grad_norm": 0.10997188836336136, "learning_rate": 8.352059925093633e-05, "loss": 0.0386, "step": 223 }, { "epoch": 0.16804201050262565, "grad_norm": 0.10793869197368622, "learning_rate": 8.389513108614233e-05, "loss": 0.0323, "step": 224 }, { "epoch": 0.16879219804951237, "grad_norm": 0.119425468146801, "learning_rate": 8.426966292134831e-05, "loss": 0.0337, "step": 225 }, { "epoch": 0.1695423855963991, "grad_norm": 0.21268026530742645, "learning_rate": 8.46441947565543e-05, "loss": 0.0314, "step": 226 }, { "epoch": 0.17029257314328583, "grad_norm": 0.1542912721633911, "learning_rate": 8.50187265917603e-05, "loss": 0.0429, "step": 227 }, { "epoch": 0.17104276069017255, "grad_norm": 0.1182723417878151, "learning_rate": 8.53932584269663e-05, "loss": 0.0344, "step": 228 }, { "epoch": 0.17179294823705926, "grad_norm": 0.14600324630737305, "learning_rate": 8.576779026217229e-05, "loss": 0.0291, "step": 229 }, { "epoch": 0.17254313578394598, "grad_norm": 0.10251737385988235, "learning_rate": 8.614232209737829e-05, "loss": 0.0358, "step": 230 }, { "epoch": 0.17329332333083272, "grad_norm": 0.1304892748594284, "learning_rate": 8.651685393258427e-05, "loss": 0.0427, "step": 231 }, { "epoch": 0.17404351087771944, "grad_norm": 0.2422615885734558, "learning_rate": 8.689138576779026e-05, "loss": 0.0751, "step": 232 }, { "epoch": 0.17479369842460615, "grad_norm": 0.24723097681999207, "learning_rate": 8.726591760299626e-05, "loss": 0.0543, "step": 233 }, { "epoch": 0.17554388597149287, "grad_norm": 0.15995217859745026, "learning_rate": 8.764044943820225e-05, "loss": 0.04, "step": 234 }, { "epoch": 0.17629407351837958, "grad_norm": 0.17250633239746094, "learning_rate": 8.801498127340825e-05, "loss": 0.0568, "step": 235 }, { "epoch": 0.17704426106526633, "grad_norm": 0.1270926296710968, "learning_rate": 8.838951310861424e-05, "loss": 0.0427, "step": 236 }, { "epoch": 0.17779444861215304, "grad_norm": 0.22896043956279755, "learning_rate": 8.876404494382022e-05, "loss": 0.0633, "step": 237 }, { "epoch": 0.17854463615903976, "grad_norm": 0.09588818997144699, "learning_rate": 8.913857677902622e-05, "loss": 0.0362, "step": 238 }, { "epoch": 0.17929482370592648, "grad_norm": 0.10814792662858963, "learning_rate": 8.951310861423221e-05, "loss": 0.0361, "step": 239 }, { "epoch": 0.1800450112528132, "grad_norm": 0.10366350412368774, "learning_rate": 8.988764044943821e-05, "loss": 0.0279, "step": 240 }, { "epoch": 0.18079519879969994, "grad_norm": 0.17119480669498444, "learning_rate": 9.02621722846442e-05, "loss": 0.047, "step": 241 }, { "epoch": 0.18154538634658665, "grad_norm": 0.13574019074440002, "learning_rate": 9.063670411985018e-05, "loss": 0.0395, "step": 242 }, { "epoch": 0.18229557389347337, "grad_norm": 0.16211435198783875, "learning_rate": 9.101123595505618e-05, "loss": 0.0473, "step": 243 }, { "epoch": 0.18304576144036008, "grad_norm": 0.21162639558315277, "learning_rate": 9.138576779026217e-05, "loss": 0.0666, "step": 244 }, { "epoch": 0.1837959489872468, "grad_norm": 0.18772201240062714, "learning_rate": 9.176029962546817e-05, "loss": 0.0421, "step": 245 }, { "epoch": 0.18454613653413354, "grad_norm": 0.11692129820585251, "learning_rate": 9.213483146067416e-05, "loss": 0.0393, "step": 246 }, { "epoch": 0.18529632408102026, "grad_norm": 0.17434294521808624, "learning_rate": 9.250936329588016e-05, "loss": 0.0417, "step": 247 }, { "epoch": 0.18604651162790697, "grad_norm": 0.10198646783828735, "learning_rate": 9.288389513108615e-05, "loss": 0.0236, "step": 248 }, { "epoch": 0.1867966991747937, "grad_norm": 0.13234616816043854, "learning_rate": 9.325842696629214e-05, "loss": 0.0229, "step": 249 }, { "epoch": 0.18754688672168043, "grad_norm": 0.08883968740701675, "learning_rate": 9.363295880149813e-05, "loss": 0.0274, "step": 250 }, { "epoch": 0.18829707426856715, "grad_norm": 0.16140736639499664, "learning_rate": 9.400749063670413e-05, "loss": 0.0357, "step": 251 }, { "epoch": 0.18904726181545387, "grad_norm": 0.1592225581407547, "learning_rate": 9.438202247191012e-05, "loss": 0.0338, "step": 252 }, { "epoch": 0.18979744936234058, "grad_norm": 0.21853357553482056, "learning_rate": 9.47565543071161e-05, "loss": 0.0502, "step": 253 }, { "epoch": 0.1905476369092273, "grad_norm": 0.11385892331600189, "learning_rate": 9.51310861423221e-05, "loss": 0.0283, "step": 254 }, { "epoch": 0.19129782445611404, "grad_norm": 0.12962888181209564, "learning_rate": 9.550561797752809e-05, "loss": 0.0355, "step": 255 }, { "epoch": 0.19204801200300076, "grad_norm": 0.15307216346263885, "learning_rate": 9.588014981273409e-05, "loss": 0.0412, "step": 256 }, { "epoch": 0.19279819954988747, "grad_norm": 0.20015685260295868, "learning_rate": 9.625468164794008e-05, "loss": 0.0371, "step": 257 }, { "epoch": 0.1935483870967742, "grad_norm": 0.2568797469139099, "learning_rate": 9.662921348314608e-05, "loss": 0.0465, "step": 258 }, { "epoch": 0.1942985746436609, "grad_norm": 0.09511401504278183, "learning_rate": 9.700374531835207e-05, "loss": 0.029, "step": 259 }, { "epoch": 0.19504876219054765, "grad_norm": 0.19397354125976562, "learning_rate": 9.737827715355807e-05, "loss": 0.0422, "step": 260 }, { "epoch": 0.19579894973743436, "grad_norm": 0.15164721012115479, "learning_rate": 9.775280898876405e-05, "loss": 0.0454, "step": 261 }, { "epoch": 0.19654913728432108, "grad_norm": 0.11148524284362793, "learning_rate": 9.812734082397004e-05, "loss": 0.0347, "step": 262 }, { "epoch": 0.1972993248312078, "grad_norm": 0.09554921835660934, "learning_rate": 9.850187265917602e-05, "loss": 0.0259, "step": 263 }, { "epoch": 0.1980495123780945, "grad_norm": 0.35856032371520996, "learning_rate": 9.887640449438202e-05, "loss": 0.0481, "step": 264 }, { "epoch": 0.19879969992498125, "grad_norm": 0.15284886956214905, "learning_rate": 9.925093632958801e-05, "loss": 0.0475, "step": 265 }, { "epoch": 0.19954988747186797, "grad_norm": 0.0865863636136055, "learning_rate": 9.962546816479401e-05, "loss": 0.0302, "step": 266 }, { "epoch": 0.2003000750187547, "grad_norm": 0.1828830987215042, "learning_rate": 0.0001, "loss": 0.0514, "step": 267 }, { "epoch": 0.2010502625656414, "grad_norm": 0.08779073506593704, "learning_rate": 9.99999571274618e-05, "loss": 0.0198, "step": 268 }, { "epoch": 0.20180045011252815, "grad_norm": 0.1077849492430687, "learning_rate": 9.999982850992069e-05, "loss": 0.0313, "step": 269 }, { "epoch": 0.20255063765941486, "grad_norm": 0.17151670157909393, "learning_rate": 9.999961414759727e-05, "loss": 0.0454, "step": 270 }, { "epoch": 0.20330082520630158, "grad_norm": 0.09756535291671753, "learning_rate": 9.999931404085912e-05, "loss": 0.0206, "step": 271 }, { "epoch": 0.2040510127531883, "grad_norm": 0.18158778548240662, "learning_rate": 9.999892819022092e-05, "loss": 0.0455, "step": 272 }, { "epoch": 0.204801200300075, "grad_norm": 0.10948951542377472, "learning_rate": 9.999845659634435e-05, "loss": 0.0337, "step": 273 }, { "epoch": 0.20555138784696175, "grad_norm": 0.13912096619606018, "learning_rate": 9.999789926003814e-05, "loss": 0.0423, "step": 274 }, { "epoch": 0.20630157539384847, "grad_norm": 0.08084128797054291, "learning_rate": 9.999725618225808e-05, "loss": 0.0336, "step": 275 }, { "epoch": 0.20705176294073518, "grad_norm": 0.1288767158985138, "learning_rate": 9.999652736410698e-05, "loss": 0.0317, "step": 276 }, { "epoch": 0.2078019504876219, "grad_norm": 0.14830724895000458, "learning_rate": 9.999571280683468e-05, "loss": 0.0465, "step": 277 }, { "epoch": 0.20855213803450862, "grad_norm": 0.14257365465164185, "learning_rate": 9.99948125118381e-05, "loss": 0.0442, "step": 278 }, { "epoch": 0.20930232558139536, "grad_norm": 0.09027249366044998, "learning_rate": 9.999382648066113e-05, "loss": 0.0319, "step": 279 }, { "epoch": 0.21005251312828208, "grad_norm": 0.10757441818714142, "learning_rate": 9.999275471499472e-05, "loss": 0.0397, "step": 280 }, { "epoch": 0.2108027006751688, "grad_norm": 0.12700489163398743, "learning_rate": 9.999159721667685e-05, "loss": 0.0374, "step": 281 }, { "epoch": 0.2115528882220555, "grad_norm": 0.10365627706050873, "learning_rate": 9.999035398769252e-05, "loss": 0.0346, "step": 282 }, { "epoch": 0.21230307576894222, "grad_norm": 0.14833049476146698, "learning_rate": 9.998902503017372e-05, "loss": 0.054, "step": 283 }, { "epoch": 0.21305326331582897, "grad_norm": 0.21913230419158936, "learning_rate": 9.99876103463995e-05, "loss": 0.0505, "step": 284 }, { "epoch": 0.21380345086271568, "grad_norm": 0.173373743891716, "learning_rate": 9.998610993879589e-05, "loss": 0.0362, "step": 285 }, { "epoch": 0.2145536384096024, "grad_norm": 0.14817701280117035, "learning_rate": 9.998452380993597e-05, "loss": 0.0339, "step": 286 }, { "epoch": 0.21530382595648911, "grad_norm": 0.14449955523014069, "learning_rate": 9.998285196253977e-05, "loss": 0.0427, "step": 287 }, { "epoch": 0.21605401350337583, "grad_norm": 0.11007577925920486, "learning_rate": 9.998109439947434e-05, "loss": 0.0241, "step": 288 }, { "epoch": 0.21680420105026257, "grad_norm": 0.10484791547060013, "learning_rate": 9.997925112375375e-05, "loss": 0.029, "step": 289 }, { "epoch": 0.2175543885971493, "grad_norm": 0.27984753251075745, "learning_rate": 9.997732213853902e-05, "loss": 0.0493, "step": 290 }, { "epoch": 0.218304576144036, "grad_norm": 0.4144909977912903, "learning_rate": 9.997530744713817e-05, "loss": 0.075, "step": 291 }, { "epoch": 0.21905476369092272, "grad_norm": 0.14818748831748962, "learning_rate": 9.997320705300621e-05, "loss": 0.0465, "step": 292 }, { "epoch": 0.21980495123780946, "grad_norm": 0.11120837181806564, "learning_rate": 9.997102095974508e-05, "loss": 0.0297, "step": 293 }, { "epoch": 0.22055513878469618, "grad_norm": 0.10603461414575577, "learning_rate": 9.996874917110378e-05, "loss": 0.0335, "step": 294 }, { "epoch": 0.2213053263315829, "grad_norm": 0.11525816470384598, "learning_rate": 9.996639169097811e-05, "loss": 0.0357, "step": 295 }, { "epoch": 0.2220555138784696, "grad_norm": 0.26272228360176086, "learning_rate": 9.996394852341098e-05, "loss": 0.0636, "step": 296 }, { "epoch": 0.22280570142535633, "grad_norm": 0.12792189419269562, "learning_rate": 9.996141967259218e-05, "loss": 0.0389, "step": 297 }, { "epoch": 0.22355588897224307, "grad_norm": 0.13640092313289642, "learning_rate": 9.995880514285841e-05, "loss": 0.043, "step": 298 }, { "epoch": 0.2243060765191298, "grad_norm": 0.08026789128780365, "learning_rate": 9.995610493869336e-05, "loss": 0.0333, "step": 299 }, { "epoch": 0.2250562640660165, "grad_norm": 0.12414755672216415, "learning_rate": 9.99533190647276e-05, "loss": 0.0428, "step": 300 }, { "epoch": 0.22580645161290322, "grad_norm": 0.1047615334391594, "learning_rate": 9.995044752573864e-05, "loss": 0.0301, "step": 301 }, { "epoch": 0.22655663915978994, "grad_norm": 0.09507095813751221, "learning_rate": 9.994749032665085e-05, "loss": 0.0299, "step": 302 }, { "epoch": 0.22730682670667668, "grad_norm": 0.1469980925321579, "learning_rate": 9.994444747253559e-05, "loss": 0.0408, "step": 303 }, { "epoch": 0.2280570142535634, "grad_norm": 0.06208149716258049, "learning_rate": 9.9941318968611e-05, "loss": 0.0199, "step": 304 }, { "epoch": 0.2288072018004501, "grad_norm": 0.12007199227809906, "learning_rate": 9.993810482024221e-05, "loss": 0.0442, "step": 305 }, { "epoch": 0.22955738934733683, "grad_norm": 0.10592085868120193, "learning_rate": 9.993480503294114e-05, "loss": 0.0375, "step": 306 }, { "epoch": 0.23030757689422354, "grad_norm": 0.16347835958003998, "learning_rate": 9.993141961236661e-05, "loss": 0.0341, "step": 307 }, { "epoch": 0.23105776444111029, "grad_norm": 0.10255320370197296, "learning_rate": 9.992794856432426e-05, "loss": 0.0312, "step": 308 }, { "epoch": 0.231807951987997, "grad_norm": 0.18392932415008545, "learning_rate": 9.992439189476661e-05, "loss": 0.0518, "step": 309 }, { "epoch": 0.23255813953488372, "grad_norm": 0.08901845663785934, "learning_rate": 9.992074960979301e-05, "loss": 0.0268, "step": 310 }, { "epoch": 0.23330832708177043, "grad_norm": 0.14844289422035217, "learning_rate": 9.991702171564961e-05, "loss": 0.0391, "step": 311 }, { "epoch": 0.23405851462865718, "grad_norm": 0.18847529590129852, "learning_rate": 9.991320821872939e-05, "loss": 0.0418, "step": 312 }, { "epoch": 0.2348087021755439, "grad_norm": 0.12273112684488297, "learning_rate": 9.990930912557209e-05, "loss": 0.0403, "step": 313 }, { "epoch": 0.2355588897224306, "grad_norm": 0.08602188527584076, "learning_rate": 9.990532444286431e-05, "loss": 0.0231, "step": 314 }, { "epoch": 0.23630907726931732, "grad_norm": 0.18314284086227417, "learning_rate": 9.990125417743937e-05, "loss": 0.0357, "step": 315 }, { "epoch": 0.23705926481620404, "grad_norm": 0.11995340138673782, "learning_rate": 9.989709833627736e-05, "loss": 0.0341, "step": 316 }, { "epoch": 0.23780945236309078, "grad_norm": 0.20460061728954315, "learning_rate": 9.989285692650518e-05, "loss": 0.0464, "step": 317 }, { "epoch": 0.2385596399099775, "grad_norm": 0.1889379769563675, "learning_rate": 9.98885299553964e-05, "loss": 0.0517, "step": 318 }, { "epoch": 0.23930982745686422, "grad_norm": 0.09578392654657364, "learning_rate": 9.988411743037134e-05, "loss": 0.027, "step": 319 }, { "epoch": 0.24006001500375093, "grad_norm": 0.12053310871124268, "learning_rate": 9.987961935899706e-05, "loss": 0.0319, "step": 320 }, { "epoch": 0.24081020255063765, "grad_norm": 0.21278013288974762, "learning_rate": 9.987503574898731e-05, "loss": 0.0468, "step": 321 }, { "epoch": 0.2415603900975244, "grad_norm": 0.08968649059534073, "learning_rate": 9.987036660820255e-05, "loss": 0.025, "step": 322 }, { "epoch": 0.2423105776444111, "grad_norm": 0.12514202296733856, "learning_rate": 9.986561194464985e-05, "loss": 0.0367, "step": 323 }, { "epoch": 0.24306076519129782, "grad_norm": 0.10158783197402954, "learning_rate": 9.986077176648303e-05, "loss": 0.0317, "step": 324 }, { "epoch": 0.24381095273818454, "grad_norm": 0.1138206347823143, "learning_rate": 9.985584608200251e-05, "loss": 0.0312, "step": 325 }, { "epoch": 0.24456114028507125, "grad_norm": 0.07617976516485214, "learning_rate": 9.985083489965534e-05, "loss": 0.0213, "step": 326 }, { "epoch": 0.245311327831958, "grad_norm": 0.07596145570278168, "learning_rate": 9.984573822803521e-05, "loss": 0.024, "step": 327 }, { "epoch": 0.24606151537884471, "grad_norm": 0.11473222076892853, "learning_rate": 9.984055607588242e-05, "loss": 0.0321, "step": 328 }, { "epoch": 0.24681170292573143, "grad_norm": 0.10172963887453079, "learning_rate": 9.983528845208384e-05, "loss": 0.0298, "step": 329 }, { "epoch": 0.24756189047261815, "grad_norm": 0.16156165301799774, "learning_rate": 9.982993536567293e-05, "loss": 0.0401, "step": 330 }, { "epoch": 0.2483120780195049, "grad_norm": 0.09021109342575073, "learning_rate": 9.98244968258297e-05, "loss": 0.0255, "step": 331 }, { "epoch": 0.2490622655663916, "grad_norm": 0.14055666327476501, "learning_rate": 9.981897284188073e-05, "loss": 0.0484, "step": 332 }, { "epoch": 0.24981245311327832, "grad_norm": 0.1804196536540985, "learning_rate": 9.981336342329909e-05, "loss": 0.0328, "step": 333 }, { "epoch": 0.25056264066016504, "grad_norm": 0.10542705655097961, "learning_rate": 9.980766857970438e-05, "loss": 0.0335, "step": 334 }, { "epoch": 0.25131282820705175, "grad_norm": 0.23658208549022675, "learning_rate": 9.98018883208627e-05, "loss": 0.0511, "step": 335 }, { "epoch": 0.25206301575393847, "grad_norm": 0.08560968190431595, "learning_rate": 9.979602265668664e-05, "loss": 0.023, "step": 336 }, { "epoch": 0.2528132033008252, "grad_norm": 0.1381518393754959, "learning_rate": 9.979007159723521e-05, "loss": 0.0399, "step": 337 }, { "epoch": 0.2535633908477119, "grad_norm": 0.12532010674476624, "learning_rate": 9.97840351527139e-05, "loss": 0.0321, "step": 338 }, { "epoch": 0.25431357839459867, "grad_norm": 0.1004038155078888, "learning_rate": 9.977791333347462e-05, "loss": 0.034, "step": 339 }, { "epoch": 0.2550637659414854, "grad_norm": 0.16208575665950775, "learning_rate": 9.97717061500157e-05, "loss": 0.0403, "step": 340 }, { "epoch": 0.2558139534883721, "grad_norm": 0.06728474050760269, "learning_rate": 9.976541361298184e-05, "loss": 0.0233, "step": 341 }, { "epoch": 0.2565641410352588, "grad_norm": 0.17869628965854645, "learning_rate": 9.97590357331641e-05, "loss": 0.0395, "step": 342 }, { "epoch": 0.25731432858214554, "grad_norm": 0.1622343361377716, "learning_rate": 9.975257252149994e-05, "loss": 0.0381, "step": 343 }, { "epoch": 0.25806451612903225, "grad_norm": 0.16202475130558014, "learning_rate": 9.974602398907313e-05, "loss": 0.0506, "step": 344 }, { "epoch": 0.25881470367591897, "grad_norm": 0.17167797684669495, "learning_rate": 9.973939014711375e-05, "loss": 0.0503, "step": 345 }, { "epoch": 0.2595648912228057, "grad_norm": 0.2033884972333908, "learning_rate": 9.973267100699819e-05, "loss": 0.0424, "step": 346 }, { "epoch": 0.2603150787696924, "grad_norm": 0.16165803372859955, "learning_rate": 9.972586658024911e-05, "loss": 0.0471, "step": 347 }, { "epoch": 0.26106526631657917, "grad_norm": 0.11009008437395096, "learning_rate": 9.971897687853544e-05, "loss": 0.0302, "step": 348 }, { "epoch": 0.2618154538634659, "grad_norm": 0.17024879157543182, "learning_rate": 9.971200191367234e-05, "loss": 0.0363, "step": 349 }, { "epoch": 0.2625656414103526, "grad_norm": 0.1899632215499878, "learning_rate": 9.970494169762117e-05, "loss": 0.051, "step": 350 }, { "epoch": 0.2633158289572393, "grad_norm": 0.18265698850154877, "learning_rate": 9.969779624248954e-05, "loss": 0.0434, "step": 351 }, { "epoch": 0.26406601650412603, "grad_norm": 0.21099521219730377, "learning_rate": 9.969056556053116e-05, "loss": 0.0552, "step": 352 }, { "epoch": 0.26481620405101275, "grad_norm": 0.12291638553142548, "learning_rate": 9.968324966414597e-05, "loss": 0.0413, "step": 353 }, { "epoch": 0.26556639159789946, "grad_norm": 0.1895735114812851, "learning_rate": 9.967584856588e-05, "loss": 0.0581, "step": 354 }, { "epoch": 0.2663165791447862, "grad_norm": 0.1934838891029358, "learning_rate": 9.966836227842538e-05, "loss": 0.0438, "step": 355 }, { "epoch": 0.2670667666916729, "grad_norm": 0.11946920305490494, "learning_rate": 9.96607908146204e-05, "loss": 0.0367, "step": 356 }, { "epoch": 0.2678169542385596, "grad_norm": 0.10245028138160706, "learning_rate": 9.965313418744935e-05, "loss": 0.0318, "step": 357 }, { "epoch": 0.2685671417854464, "grad_norm": 0.07515406608581543, "learning_rate": 9.964539241004261e-05, "loss": 0.0243, "step": 358 }, { "epoch": 0.2693173293323331, "grad_norm": 0.2560199797153473, "learning_rate": 9.963756549567654e-05, "loss": 0.0573, "step": 359 }, { "epoch": 0.2700675168792198, "grad_norm": 0.160162553191185, "learning_rate": 9.962965345777353e-05, "loss": 0.0478, "step": 360 }, { "epoch": 0.27081770442610653, "grad_norm": 0.13862910866737366, "learning_rate": 9.962165630990196e-05, "loss": 0.0525, "step": 361 }, { "epoch": 0.27156789197299325, "grad_norm": 0.1421460658311844, "learning_rate": 9.961357406577617e-05, "loss": 0.0388, "step": 362 }, { "epoch": 0.27231807951987996, "grad_norm": 0.07698662579059601, "learning_rate": 9.960540673925636e-05, "loss": 0.0265, "step": 363 }, { "epoch": 0.2730682670667667, "grad_norm": 0.12471983581781387, "learning_rate": 9.959715434434873e-05, "loss": 0.0497, "step": 364 }, { "epoch": 0.2738184546136534, "grad_norm": 0.10801269859075546, "learning_rate": 9.958881689520531e-05, "loss": 0.0328, "step": 365 }, { "epoch": 0.2745686421605401, "grad_norm": 0.12907591462135315, "learning_rate": 9.958039440612402e-05, "loss": 0.0411, "step": 366 }, { "epoch": 0.2753188297074269, "grad_norm": 0.15533341467380524, "learning_rate": 9.957188689154859e-05, "loss": 0.0404, "step": 367 }, { "epoch": 0.2760690172543136, "grad_norm": 0.11384189873933792, "learning_rate": 9.956329436606857e-05, "loss": 0.0385, "step": 368 }, { "epoch": 0.2768192048012003, "grad_norm": 0.11162270605564117, "learning_rate": 9.955461684441928e-05, "loss": 0.0416, "step": 369 }, { "epoch": 0.27756939234808703, "grad_norm": 0.19340857863426208, "learning_rate": 9.954585434148183e-05, "loss": 0.0458, "step": 370 }, { "epoch": 0.27831957989497375, "grad_norm": 0.10425236821174622, "learning_rate": 9.953700687228306e-05, "loss": 0.0336, "step": 371 }, { "epoch": 0.27906976744186046, "grad_norm": 0.1434914767742157, "learning_rate": 9.952807445199549e-05, "loss": 0.0354, "step": 372 }, { "epoch": 0.2798199549887472, "grad_norm": 0.08324921876192093, "learning_rate": 9.951905709593735e-05, "loss": 0.0241, "step": 373 }, { "epoch": 0.2805701425356339, "grad_norm": 0.1119835302233696, "learning_rate": 9.950995481957251e-05, "loss": 0.0345, "step": 374 }, { "epoch": 0.2813203300825206, "grad_norm": 0.09572914987802505, "learning_rate": 9.950076763851049e-05, "loss": 0.0305, "step": 375 }, { "epoch": 0.2820705176294073, "grad_norm": 0.10482971370220184, "learning_rate": 9.949149556850638e-05, "loss": 0.0358, "step": 376 }, { "epoch": 0.2828207051762941, "grad_norm": 0.14237084984779358, "learning_rate": 9.94821386254609e-05, "loss": 0.0338, "step": 377 }, { "epoch": 0.2835708927231808, "grad_norm": 0.10478667169809341, "learning_rate": 9.947269682542027e-05, "loss": 0.0249, "step": 378 }, { "epoch": 0.2843210802700675, "grad_norm": 0.11596731841564178, "learning_rate": 9.946317018457622e-05, "loss": 0.0286, "step": 379 }, { "epoch": 0.28507126781695424, "grad_norm": 0.11092496663331985, "learning_rate": 9.945355871926605e-05, "loss": 0.0333, "step": 380 }, { "epoch": 0.28582145536384096, "grad_norm": 0.14135751128196716, "learning_rate": 9.944386244597244e-05, "loss": 0.0348, "step": 381 }, { "epoch": 0.2865716429107277, "grad_norm": 0.1096055880188942, "learning_rate": 9.943408138132357e-05, "loss": 0.0382, "step": 382 }, { "epoch": 0.2873218304576144, "grad_norm": 0.07303545624017715, "learning_rate": 9.942421554209297e-05, "loss": 0.0237, "step": 383 }, { "epoch": 0.2880720180045011, "grad_norm": 0.1861746609210968, "learning_rate": 9.94142649451996e-05, "loss": 0.0462, "step": 384 }, { "epoch": 0.2888222055513878, "grad_norm": 0.11650864779949188, "learning_rate": 9.940422960770776e-05, "loss": 0.0374, "step": 385 }, { "epoch": 0.2895723930982746, "grad_norm": 0.13004302978515625, "learning_rate": 9.939410954682706e-05, "loss": 0.0275, "step": 386 }, { "epoch": 0.2903225806451613, "grad_norm": 0.13394056260585785, "learning_rate": 9.938390477991242e-05, "loss": 0.0402, "step": 387 }, { "epoch": 0.291072768192048, "grad_norm": 0.11594215780496597, "learning_rate": 9.937361532446399e-05, "loss": 0.0274, "step": 388 }, { "epoch": 0.29182295573893474, "grad_norm": 0.08473474532365799, "learning_rate": 9.936324119812719e-05, "loss": 0.0264, "step": 389 }, { "epoch": 0.29257314328582146, "grad_norm": 0.14290671050548553, "learning_rate": 9.93527824186926e-05, "loss": 0.0294, "step": 390 }, { "epoch": 0.2933233308327082, "grad_norm": 0.13471192121505737, "learning_rate": 9.934223900409603e-05, "loss": 0.029, "step": 391 }, { "epoch": 0.2940735183795949, "grad_norm": 0.11877185851335526, "learning_rate": 9.933161097241837e-05, "loss": 0.0295, "step": 392 }, { "epoch": 0.2948237059264816, "grad_norm": 0.14871439337730408, "learning_rate": 9.932089834188567e-05, "loss": 0.0354, "step": 393 }, { "epoch": 0.2955738934733683, "grad_norm": 0.12378165125846863, "learning_rate": 9.931010113086902e-05, "loss": 0.0348, "step": 394 }, { "epoch": 0.29632408102025504, "grad_norm": 0.15641850233078003, "learning_rate": 9.929921935788457e-05, "loss": 0.0414, "step": 395 }, { "epoch": 0.2970742685671418, "grad_norm": 0.07526502013206482, "learning_rate": 9.928825304159351e-05, "loss": 0.0277, "step": 396 }, { "epoch": 0.2978244561140285, "grad_norm": 0.13670268654823303, "learning_rate": 9.927720220080199e-05, "loss": 0.035, "step": 397 }, { "epoch": 0.29857464366091524, "grad_norm": 0.0850730836391449, "learning_rate": 9.926606685446109e-05, "loss": 0.0373, "step": 398 }, { "epoch": 0.29932483120780196, "grad_norm": 0.08862190693616867, "learning_rate": 9.925484702166686e-05, "loss": 0.0317, "step": 399 }, { "epoch": 0.30007501875468867, "grad_norm": 0.08291352540254593, "learning_rate": 9.924354272166017e-05, "loss": 0.0179, "step": 400 }, { "epoch": 0.30007501875468867, "eval_loss": 0.03630685806274414, "eval_runtime": 5.1209, "eval_samples_per_second": 10.545, "eval_steps_per_second": 2.734, "step": 400 }, { "epoch": 0.3008252063015754, "grad_norm": 0.1142362654209137, "learning_rate": 9.923215397382684e-05, "loss": 0.0294, "step": 401 }, { "epoch": 0.3015753938484621, "grad_norm": 0.08515766263008118, "learning_rate": 9.92206807976974e-05, "loss": 0.0208, "step": 402 }, { "epoch": 0.3023255813953488, "grad_norm": 0.1859070360660553, "learning_rate": 9.920912321294723e-05, "loss": 0.0405, "step": 403 }, { "epoch": 0.30307576894223553, "grad_norm": 0.0877511203289032, "learning_rate": 9.919748123939647e-05, "loss": 0.0276, "step": 404 }, { "epoch": 0.3038259564891223, "grad_norm": 0.14317767322063446, "learning_rate": 9.918575489700993e-05, "loss": 0.0405, "step": 405 }, { "epoch": 0.304576144036009, "grad_norm": 0.1826557219028473, "learning_rate": 9.917394420589716e-05, "loss": 0.034, "step": 406 }, { "epoch": 0.30532633158289574, "grad_norm": 0.08531735837459564, "learning_rate": 9.916204918631231e-05, "loss": 0.031, "step": 407 }, { "epoch": 0.30607651912978245, "grad_norm": 0.13493755459785461, "learning_rate": 9.915006985865416e-05, "loss": 0.0298, "step": 408 }, { "epoch": 0.30682670667666917, "grad_norm": 0.10673831403255463, "learning_rate": 9.913800624346612e-05, "loss": 0.0313, "step": 409 }, { "epoch": 0.3075768942235559, "grad_norm": 0.16792036592960358, "learning_rate": 9.912585836143606e-05, "loss": 0.0457, "step": 410 }, { "epoch": 0.3083270817704426, "grad_norm": 0.11888712644577026, "learning_rate": 9.911362623339642e-05, "loss": 0.0337, "step": 411 }, { "epoch": 0.3090772693173293, "grad_norm": 0.13570362329483032, "learning_rate": 9.91013098803241e-05, "loss": 0.0408, "step": 412 }, { "epoch": 0.30982745686421603, "grad_norm": 0.11306290328502655, "learning_rate": 9.908890932334042e-05, "loss": 0.0345, "step": 413 }, { "epoch": 0.31057764441110275, "grad_norm": 0.1427725851535797, "learning_rate": 9.907642458371111e-05, "loss": 0.0429, "step": 414 }, { "epoch": 0.3113278319579895, "grad_norm": 0.18175150454044342, "learning_rate": 9.906385568284629e-05, "loss": 0.0428, "step": 415 }, { "epoch": 0.31207801950487624, "grad_norm": 0.1440231204032898, "learning_rate": 9.905120264230036e-05, "loss": 0.0421, "step": 416 }, { "epoch": 0.31282820705176295, "grad_norm": 0.11851240694522858, "learning_rate": 9.903846548377206e-05, "loss": 0.0291, "step": 417 }, { "epoch": 0.31357839459864967, "grad_norm": 0.18592765927314758, "learning_rate": 9.902564422910436e-05, "loss": 0.0565, "step": 418 }, { "epoch": 0.3143285821455364, "grad_norm": 0.15560060739517212, "learning_rate": 9.901273890028444e-05, "loss": 0.0378, "step": 419 }, { "epoch": 0.3150787696924231, "grad_norm": 0.1025664433836937, "learning_rate": 9.899974951944367e-05, "loss": 0.0271, "step": 420 }, { "epoch": 0.3158289572393098, "grad_norm": 0.11080783605575562, "learning_rate": 9.898667610885757e-05, "loss": 0.0287, "step": 421 }, { "epoch": 0.31657914478619653, "grad_norm": 0.13097728788852692, "learning_rate": 9.897351869094573e-05, "loss": 0.0388, "step": 422 }, { "epoch": 0.31732933233308325, "grad_norm": 0.13140304386615753, "learning_rate": 9.896027728827185e-05, "loss": 0.0451, "step": 423 }, { "epoch": 0.31807951987997, "grad_norm": 0.12562090158462524, "learning_rate": 9.894695192354362e-05, "loss": 0.0295, "step": 424 }, { "epoch": 0.31882970742685673, "grad_norm": 0.25448498129844666, "learning_rate": 9.893354261961274e-05, "loss": 0.0596, "step": 425 }, { "epoch": 0.31957989497374345, "grad_norm": 0.15546709299087524, "learning_rate": 9.892004939947482e-05, "loss": 0.0326, "step": 426 }, { "epoch": 0.32033008252063017, "grad_norm": 0.17915265262126923, "learning_rate": 9.890647228626944e-05, "loss": 0.0502, "step": 427 }, { "epoch": 0.3210802700675169, "grad_norm": 0.17885051667690277, "learning_rate": 9.889281130327997e-05, "loss": 0.0376, "step": 428 }, { "epoch": 0.3218304576144036, "grad_norm": 0.1253713220357895, "learning_rate": 9.887906647393368e-05, "loss": 0.033, "step": 429 }, { "epoch": 0.3225806451612903, "grad_norm": 0.10489869117736816, "learning_rate": 9.88652378218016e-05, "loss": 0.0314, "step": 430 }, { "epoch": 0.32333083270817703, "grad_norm": 0.15992328524589539, "learning_rate": 9.885132537059849e-05, "loss": 0.0396, "step": 431 }, { "epoch": 0.32408102025506375, "grad_norm": 0.08546454459428787, "learning_rate": 9.883732914418285e-05, "loss": 0.027, "step": 432 }, { "epoch": 0.32483120780195046, "grad_norm": 0.13443313539028168, "learning_rate": 9.882324916655681e-05, "loss": 0.0361, "step": 433 }, { "epoch": 0.32558139534883723, "grad_norm": 0.21272841095924377, "learning_rate": 9.880908546186616e-05, "loss": 0.0636, "step": 434 }, { "epoch": 0.32633158289572395, "grad_norm": 0.13114754855632782, "learning_rate": 9.879483805440027e-05, "loss": 0.0352, "step": 435 }, { "epoch": 0.32708177044261066, "grad_norm": 0.11861104518175125, "learning_rate": 9.8780506968592e-05, "loss": 0.0294, "step": 436 }, { "epoch": 0.3278319579894974, "grad_norm": 0.07027596980333328, "learning_rate": 9.876609222901781e-05, "loss": 0.0179, "step": 437 }, { "epoch": 0.3285821455363841, "grad_norm": 0.1381312608718872, "learning_rate": 9.875159386039749e-05, "loss": 0.0468, "step": 438 }, { "epoch": 0.3293323330832708, "grad_norm": 0.12370853126049042, "learning_rate": 9.873701188759438e-05, "loss": 0.044, "step": 439 }, { "epoch": 0.3300825206301575, "grad_norm": 0.12424737215042114, "learning_rate": 9.872234633561509e-05, "loss": 0.0432, "step": 440 }, { "epoch": 0.33083270817704424, "grad_norm": 0.09881707280874252, "learning_rate": 9.87075972296096e-05, "loss": 0.0286, "step": 441 }, { "epoch": 0.33158289572393096, "grad_norm": 0.11166679114103317, "learning_rate": 9.86927645948712e-05, "loss": 0.0392, "step": 442 }, { "epoch": 0.33233308327081773, "grad_norm": 0.0945601686835289, "learning_rate": 9.867784845683637e-05, "loss": 0.0339, "step": 443 }, { "epoch": 0.33308327081770445, "grad_norm": 0.13213807344436646, "learning_rate": 9.866284884108481e-05, "loss": 0.0415, "step": 444 }, { "epoch": 0.33383345836459116, "grad_norm": 0.12910525500774384, "learning_rate": 9.864776577333941e-05, "loss": 0.0405, "step": 445 }, { "epoch": 0.3345836459114779, "grad_norm": 0.07470346242189407, "learning_rate": 9.863259927946613e-05, "loss": 0.0243, "step": 446 }, { "epoch": 0.3353338334583646, "grad_norm": 0.10762255638837814, "learning_rate": 9.861734938547405e-05, "loss": 0.0223, "step": 447 }, { "epoch": 0.3360840210052513, "grad_norm": 0.11334626376628876, "learning_rate": 9.860201611751518e-05, "loss": 0.0396, "step": 448 }, { "epoch": 0.336834208552138, "grad_norm": 0.1437179446220398, "learning_rate": 9.858659950188458e-05, "loss": 0.0364, "step": 449 }, { "epoch": 0.33758439609902474, "grad_norm": 0.12946391105651855, "learning_rate": 9.857109956502027e-05, "loss": 0.0362, "step": 450 }, { "epoch": 0.33833458364591146, "grad_norm": 0.14568856358528137, "learning_rate": 9.855551633350306e-05, "loss": 0.0421, "step": 451 }, { "epoch": 0.3390847711927982, "grad_norm": 0.14602363109588623, "learning_rate": 9.853984983405668e-05, "loss": 0.0438, "step": 452 }, { "epoch": 0.33983495873968494, "grad_norm": 0.11090512573719025, "learning_rate": 9.852410009354766e-05, "loss": 0.0285, "step": 453 }, { "epoch": 0.34058514628657166, "grad_norm": 0.23449061810970306, "learning_rate": 9.850826713898521e-05, "loss": 0.0442, "step": 454 }, { "epoch": 0.3413353338334584, "grad_norm": 0.10066236555576324, "learning_rate": 9.849235099752132e-05, "loss": 0.038, "step": 455 }, { "epoch": 0.3420855213803451, "grad_norm": 0.14898282289505005, "learning_rate": 9.847635169645058e-05, "loss": 0.0331, "step": 456 }, { "epoch": 0.3428357089272318, "grad_norm": 0.17724516987800598, "learning_rate": 9.846026926321024e-05, "loss": 0.0434, "step": 457 }, { "epoch": 0.3435858964741185, "grad_norm": 0.11125597357749939, "learning_rate": 9.844410372538006e-05, "loss": 0.0286, "step": 458 }, { "epoch": 0.34433608402100524, "grad_norm": 0.13614225387573242, "learning_rate": 9.842785511068239e-05, "loss": 0.0407, "step": 459 }, { "epoch": 0.34508627156789196, "grad_norm": 0.14966627955436707, "learning_rate": 9.841152344698197e-05, "loss": 0.0244, "step": 460 }, { "epoch": 0.34583645911477867, "grad_norm": 0.140168234705925, "learning_rate": 9.8395108762286e-05, "loss": 0.0316, "step": 461 }, { "epoch": 0.34658664666166544, "grad_norm": 0.10689567774534225, "learning_rate": 9.837861108474404e-05, "loss": 0.0305, "step": 462 }, { "epoch": 0.34733683420855216, "grad_norm": 0.12915848195552826, "learning_rate": 9.8362030442648e-05, "loss": 0.0201, "step": 463 }, { "epoch": 0.3480870217554389, "grad_norm": 0.12917649745941162, "learning_rate": 9.834536686443204e-05, "loss": 0.0289, "step": 464 }, { "epoch": 0.3488372093023256, "grad_norm": 0.07282043993473053, "learning_rate": 9.832862037867257e-05, "loss": 0.0236, "step": 465 }, { "epoch": 0.3495873968492123, "grad_norm": 0.2947641611099243, "learning_rate": 9.831179101408813e-05, "loss": 0.0541, "step": 466 }, { "epoch": 0.350337584396099, "grad_norm": 0.1508365124464035, "learning_rate": 9.829487879953946e-05, "loss": 0.0361, "step": 467 }, { "epoch": 0.35108777194298574, "grad_norm": 0.14020177721977234, "learning_rate": 9.827788376402932e-05, "loss": 0.0381, "step": 468 }, { "epoch": 0.35183795948987245, "grad_norm": 0.12377166002988815, "learning_rate": 9.826080593670253e-05, "loss": 0.029, "step": 469 }, { "epoch": 0.35258814703675917, "grad_norm": 0.32600975036621094, "learning_rate": 9.82436453468459e-05, "loss": 0.0506, "step": 470 }, { "epoch": 0.3533383345836459, "grad_norm": 0.22649474442005157, "learning_rate": 9.822640202388812e-05, "loss": 0.0376, "step": 471 }, { "epoch": 0.35408852213053266, "grad_norm": 0.11873234808444977, "learning_rate": 9.820907599739979e-05, "loss": 0.0351, "step": 472 }, { "epoch": 0.3548387096774194, "grad_norm": 0.1265113800764084, "learning_rate": 9.819166729709336e-05, "loss": 0.0415, "step": 473 }, { "epoch": 0.3555888972243061, "grad_norm": 0.15935584902763367, "learning_rate": 9.817417595282304e-05, "loss": 0.0452, "step": 474 }, { "epoch": 0.3563390847711928, "grad_norm": 0.16310295462608337, "learning_rate": 9.815660199458476e-05, "loss": 0.0297, "step": 475 }, { "epoch": 0.3570892723180795, "grad_norm": 0.1761324107646942, "learning_rate": 9.81389454525161e-05, "loss": 0.0462, "step": 476 }, { "epoch": 0.35783945986496624, "grad_norm": 0.13628017902374268, "learning_rate": 9.812120635689632e-05, "loss": 0.0442, "step": 477 }, { "epoch": 0.35858964741185295, "grad_norm": 0.17877016961574554, "learning_rate": 9.810338473814621e-05, "loss": 0.0537, "step": 478 }, { "epoch": 0.35933983495873967, "grad_norm": 0.10660474747419357, "learning_rate": 9.808548062682812e-05, "loss": 0.0316, "step": 479 }, { "epoch": 0.3600900225056264, "grad_norm": 0.11876129359006882, "learning_rate": 9.80674940536458e-05, "loss": 0.0343, "step": 480 }, { "epoch": 0.36084021005251316, "grad_norm": 0.08890905231237411, "learning_rate": 9.804942504944445e-05, "loss": 0.0267, "step": 481 }, { "epoch": 0.36159039759939987, "grad_norm": 0.12817738950252533, "learning_rate": 9.803127364521067e-05, "loss": 0.0354, "step": 482 }, { "epoch": 0.3623405851462866, "grad_norm": 0.06357493251562119, "learning_rate": 9.801303987207229e-05, "loss": 0.0236, "step": 483 }, { "epoch": 0.3630907726931733, "grad_norm": 0.22393284738063812, "learning_rate": 9.799472376129846e-05, "loss": 0.0551, "step": 484 }, { "epoch": 0.36384096024006, "grad_norm": 0.17032992839813232, "learning_rate": 9.79763253442995e-05, "loss": 0.0424, "step": 485 }, { "epoch": 0.36459114778694673, "grad_norm": 0.2818201780319214, "learning_rate": 9.795784465262689e-05, "loss": 0.0532, "step": 486 }, { "epoch": 0.36534133533383345, "grad_norm": 0.2822514772415161, "learning_rate": 9.79392817179732e-05, "loss": 0.0402, "step": 487 }, { "epoch": 0.36609152288072017, "grad_norm": 0.07287586480379105, "learning_rate": 9.792063657217201e-05, "loss": 0.0231, "step": 488 }, { "epoch": 0.3668417104276069, "grad_norm": 0.10715020447969437, "learning_rate": 9.790190924719793e-05, "loss": 0.0179, "step": 489 }, { "epoch": 0.3675918979744936, "grad_norm": 0.1892417073249817, "learning_rate": 9.788309977516648e-05, "loss": 0.0476, "step": 490 }, { "epoch": 0.36834208552138037, "grad_norm": 0.1349543333053589, "learning_rate": 9.786420818833404e-05, "loss": 0.0295, "step": 491 }, { "epoch": 0.3690922730682671, "grad_norm": 0.13802854716777802, "learning_rate": 9.784523451909782e-05, "loss": 0.0326, "step": 492 }, { "epoch": 0.3698424606151538, "grad_norm": 0.28222379088401794, "learning_rate": 9.78261787999958e-05, "loss": 0.0546, "step": 493 }, { "epoch": 0.3705926481620405, "grad_norm": 0.18687193095684052, "learning_rate": 9.780704106370667e-05, "loss": 0.0535, "step": 494 }, { "epoch": 0.37134283570892723, "grad_norm": 0.11292416602373123, "learning_rate": 9.778782134304976e-05, "loss": 0.0327, "step": 495 }, { "epoch": 0.37209302325581395, "grad_norm": 0.07824314385652542, "learning_rate": 9.776851967098499e-05, "loss": 0.0218, "step": 496 }, { "epoch": 0.37284321080270066, "grad_norm": 0.1915801614522934, "learning_rate": 9.774913608061282e-05, "loss": 0.0423, "step": 497 }, { "epoch": 0.3735933983495874, "grad_norm": 0.18689598143100739, "learning_rate": 9.772967060517421e-05, "loss": 0.0493, "step": 498 }, { "epoch": 0.3743435858964741, "grad_norm": 0.11407961696386337, "learning_rate": 9.771012327805055e-05, "loss": 0.0302, "step": 499 }, { "epoch": 0.37509377344336087, "grad_norm": 0.20327265560626984, "learning_rate": 9.769049413276355e-05, "loss": 0.0459, "step": 500 }, { "epoch": 0.3758439609902476, "grad_norm": 0.1307540237903595, "learning_rate": 9.767078320297528e-05, "loss": 0.0364, "step": 501 }, { "epoch": 0.3765941485371343, "grad_norm": 0.18840287625789642, "learning_rate": 9.765099052248805e-05, "loss": 0.0419, "step": 502 }, { "epoch": 0.377344336084021, "grad_norm": 0.1403331160545349, "learning_rate": 9.763111612524434e-05, "loss": 0.0356, "step": 503 }, { "epoch": 0.37809452363090773, "grad_norm": 0.09807176887989044, "learning_rate": 9.761116004532679e-05, "loss": 0.0265, "step": 504 }, { "epoch": 0.37884471117779445, "grad_norm": 0.16782943904399872, "learning_rate": 9.759112231695811e-05, "loss": 0.0425, "step": 505 }, { "epoch": 0.37959489872468116, "grad_norm": 0.14254771173000336, "learning_rate": 9.757100297450103e-05, "loss": 0.0527, "step": 506 }, { "epoch": 0.3803450862715679, "grad_norm": 0.12506116926670074, "learning_rate": 9.755080205245826e-05, "loss": 0.0248, "step": 507 }, { "epoch": 0.3810952738184546, "grad_norm": 0.12891031801700592, "learning_rate": 9.753051958547238e-05, "loss": 0.0268, "step": 508 }, { "epoch": 0.3818454613653413, "grad_norm": 0.11933682858943939, "learning_rate": 9.751015560832582e-05, "loss": 0.0384, "step": 509 }, { "epoch": 0.3825956489122281, "grad_norm": 0.14737223088741302, "learning_rate": 9.748971015594078e-05, "loss": 0.0355, "step": 510 }, { "epoch": 0.3833458364591148, "grad_norm": 0.1678527295589447, "learning_rate": 9.746918326337923e-05, "loss": 0.0389, "step": 511 }, { "epoch": 0.3840960240060015, "grad_norm": 0.16946767270565033, "learning_rate": 9.744857496584274e-05, "loss": 0.0455, "step": 512 }, { "epoch": 0.38484621155288823, "grad_norm": 0.19964195787906647, "learning_rate": 9.742788529867255e-05, "loss": 0.0417, "step": 513 }, { "epoch": 0.38559639909977494, "grad_norm": 0.10407441854476929, "learning_rate": 9.740711429734936e-05, "loss": 0.0335, "step": 514 }, { "epoch": 0.38634658664666166, "grad_norm": 0.19779853522777557, "learning_rate": 9.738626199749341e-05, "loss": 0.0372, "step": 515 }, { "epoch": 0.3870967741935484, "grad_norm": 0.22248423099517822, "learning_rate": 9.736532843486433e-05, "loss": 0.049, "step": 516 }, { "epoch": 0.3878469617404351, "grad_norm": 0.1619381308555603, "learning_rate": 9.734431364536114e-05, "loss": 0.0435, "step": 517 }, { "epoch": 0.3885971492873218, "grad_norm": 0.16517984867095947, "learning_rate": 9.732321766502213e-05, "loss": 0.0418, "step": 518 }, { "epoch": 0.3893473368342086, "grad_norm": 0.04582435265183449, "learning_rate": 9.730204053002481e-05, "loss": 0.0173, "step": 519 }, { "epoch": 0.3900975243810953, "grad_norm": 0.09578097611665726, "learning_rate": 9.728078227668588e-05, "loss": 0.0275, "step": 520 }, { "epoch": 0.390847711927982, "grad_norm": 0.11759772896766663, "learning_rate": 9.725944294146119e-05, "loss": 0.0419, "step": 521 }, { "epoch": 0.3915978994748687, "grad_norm": 0.11090448498725891, "learning_rate": 9.723802256094555e-05, "loss": 0.0372, "step": 522 }, { "epoch": 0.39234808702175544, "grad_norm": 0.16954998672008514, "learning_rate": 9.721652117187283e-05, "loss": 0.0348, "step": 523 }, { "epoch": 0.39309827456864216, "grad_norm": 0.08991193026304245, "learning_rate": 9.71949388111158e-05, "loss": 0.0333, "step": 524 }, { "epoch": 0.3938484621155289, "grad_norm": 0.144531711935997, "learning_rate": 9.717327551568608e-05, "loss": 0.0413, "step": 525 }, { "epoch": 0.3945986496624156, "grad_norm": 0.08531095832586288, "learning_rate": 9.715153132273407e-05, "loss": 0.0289, "step": 526 }, { "epoch": 0.3953488372093023, "grad_norm": 0.09487048536539078, "learning_rate": 9.712970626954893e-05, "loss": 0.0267, "step": 527 }, { "epoch": 0.396099024756189, "grad_norm": 0.08603309094905853, "learning_rate": 9.71078003935585e-05, "loss": 0.0286, "step": 528 }, { "epoch": 0.3968492123030758, "grad_norm": 0.10319013148546219, "learning_rate": 9.708581373232917e-05, "loss": 0.0334, "step": 529 }, { "epoch": 0.3975993998499625, "grad_norm": 0.15875476598739624, "learning_rate": 9.70637463235659e-05, "loss": 0.0403, "step": 530 }, { "epoch": 0.3983495873968492, "grad_norm": 0.1145065426826477, "learning_rate": 9.704159820511214e-05, "loss": 0.0393, "step": 531 }, { "epoch": 0.39909977494373594, "grad_norm": 0.11985963582992554, "learning_rate": 9.701936941494971e-05, "loss": 0.0285, "step": 532 }, { "epoch": 0.39984996249062266, "grad_norm": 0.18910159170627594, "learning_rate": 9.699705999119882e-05, "loss": 0.037, "step": 533 }, { "epoch": 0.4006001500375094, "grad_norm": 0.11701525002717972, "learning_rate": 9.697466997211793e-05, "loss": 0.0235, "step": 534 }, { "epoch": 0.4013503375843961, "grad_norm": 0.1602649986743927, "learning_rate": 9.69521993961037e-05, "loss": 0.0384, "step": 535 }, { "epoch": 0.4021005251312828, "grad_norm": 0.12196832150220871, "learning_rate": 9.692964830169098e-05, "loss": 0.0386, "step": 536 }, { "epoch": 0.4028507126781695, "grad_norm": 0.11243263632059097, "learning_rate": 9.690701672755266e-05, "loss": 0.0361, "step": 537 }, { "epoch": 0.4036009002250563, "grad_norm": 0.15681670606136322, "learning_rate": 9.688430471249967e-05, "loss": 0.0411, "step": 538 }, { "epoch": 0.404351087771943, "grad_norm": 0.09906280040740967, "learning_rate": 9.686151229548088e-05, "loss": 0.024, "step": 539 }, { "epoch": 0.4051012753188297, "grad_norm": 0.18042565882205963, "learning_rate": 9.683863951558301e-05, "loss": 0.0415, "step": 540 }, { "epoch": 0.40585146286571644, "grad_norm": 0.09993312507867813, "learning_rate": 9.681568641203068e-05, "loss": 0.0127, "step": 541 }, { "epoch": 0.40660165041260315, "grad_norm": 0.1347557008266449, "learning_rate": 9.679265302418615e-05, "loss": 0.0318, "step": 542 }, { "epoch": 0.40735183795948987, "grad_norm": 0.14245480298995972, "learning_rate": 9.676953939154945e-05, "loss": 0.0406, "step": 543 }, { "epoch": 0.4081020255063766, "grad_norm": 0.2006424218416214, "learning_rate": 9.674634555375817e-05, "loss": 0.0336, "step": 544 }, { "epoch": 0.4088522130532633, "grad_norm": 0.16163894534111023, "learning_rate": 9.672307155058744e-05, "loss": 0.048, "step": 545 }, { "epoch": 0.40960240060015, "grad_norm": 0.1653139442205429, "learning_rate": 9.669971742194992e-05, "loss": 0.0529, "step": 546 }, { "epoch": 0.41035258814703673, "grad_norm": 0.17910104990005493, "learning_rate": 9.667628320789562e-05, "loss": 0.0492, "step": 547 }, { "epoch": 0.4111027756939235, "grad_norm": 0.10363556444644928, "learning_rate": 9.665276894861188e-05, "loss": 0.0375, "step": 548 }, { "epoch": 0.4118529632408102, "grad_norm": 0.07300784438848495, "learning_rate": 9.66291746844234e-05, "loss": 0.0223, "step": 549 }, { "epoch": 0.41260315078769694, "grad_norm": 0.1361912339925766, "learning_rate": 9.660550045579199e-05, "loss": 0.0395, "step": 550 }, { "epoch": 0.41335333833458365, "grad_norm": 0.21163193881511688, "learning_rate": 9.65817463033166e-05, "loss": 0.0494, "step": 551 }, { "epoch": 0.41410352588147037, "grad_norm": 0.10939466208219528, "learning_rate": 9.655791226773331e-05, "loss": 0.0379, "step": 552 }, { "epoch": 0.4148537134283571, "grad_norm": 0.14424750208854675, "learning_rate": 9.65339983899151e-05, "loss": 0.0385, "step": 553 }, { "epoch": 0.4156039009752438, "grad_norm": 0.09869871288537979, "learning_rate": 9.651000471087193e-05, "loss": 0.0306, "step": 554 }, { "epoch": 0.4163540885221305, "grad_norm": 0.09752748161554337, "learning_rate": 9.64859312717506e-05, "loss": 0.0272, "step": 555 }, { "epoch": 0.41710427606901723, "grad_norm": 0.19783945381641388, "learning_rate": 9.64617781138347e-05, "loss": 0.0307, "step": 556 }, { "epoch": 0.41785446361590395, "grad_norm": 0.15972763299942017, "learning_rate": 9.643754527854451e-05, "loss": 0.0443, "step": 557 }, { "epoch": 0.4186046511627907, "grad_norm": 0.11784256994724274, "learning_rate": 9.641323280743693e-05, "loss": 0.0295, "step": 558 }, { "epoch": 0.41935483870967744, "grad_norm": 0.09688984602689743, "learning_rate": 9.638884074220548e-05, "loss": 0.0434, "step": 559 }, { "epoch": 0.42010502625656415, "grad_norm": 0.09953712671995163, "learning_rate": 9.636436912468015e-05, "loss": 0.0259, "step": 560 }, { "epoch": 0.42085521380345087, "grad_norm": 0.13692741096019745, "learning_rate": 9.633981799682735e-05, "loss": 0.0441, "step": 561 }, { "epoch": 0.4216054013503376, "grad_norm": 0.09490803629159927, "learning_rate": 9.631518740074985e-05, "loss": 0.0141, "step": 562 }, { "epoch": 0.4223555888972243, "grad_norm": 0.12184827029705048, "learning_rate": 9.629047737868669e-05, "loss": 0.0328, "step": 563 }, { "epoch": 0.423105776444111, "grad_norm": 0.1606912910938263, "learning_rate": 9.626568797301311e-05, "loss": 0.0454, "step": 564 }, { "epoch": 0.42385596399099773, "grad_norm": 0.09314081072807312, "learning_rate": 9.624081922624053e-05, "loss": 0.0251, "step": 565 }, { "epoch": 0.42460615153788445, "grad_norm": 0.1758616864681244, "learning_rate": 9.621587118101638e-05, "loss": 0.0317, "step": 566 }, { "epoch": 0.4253563390847712, "grad_norm": 0.12884719669818878, "learning_rate": 9.619084388012412e-05, "loss": 0.0356, "step": 567 }, { "epoch": 0.42610652663165793, "grad_norm": 0.14522381126880646, "learning_rate": 9.616573736648308e-05, "loss": 0.0344, "step": 568 }, { "epoch": 0.42685671417854465, "grad_norm": 0.11167056858539581, "learning_rate": 9.61405516831485e-05, "loss": 0.0292, "step": 569 }, { "epoch": 0.42760690172543137, "grad_norm": 0.1222648173570633, "learning_rate": 9.61152868733113e-05, "loss": 0.0246, "step": 570 }, { "epoch": 0.4283570892723181, "grad_norm": 0.11057785153388977, "learning_rate": 9.608994298029818e-05, "loss": 0.0351, "step": 571 }, { "epoch": 0.4291072768192048, "grad_norm": 0.1606517732143402, "learning_rate": 9.60645200475714e-05, "loss": 0.0384, "step": 572 }, { "epoch": 0.4298574643660915, "grad_norm": 0.1263444721698761, "learning_rate": 9.603901811872877e-05, "loss": 0.042, "step": 573 }, { "epoch": 0.43060765191297823, "grad_norm": 0.16173186898231506, "learning_rate": 9.601343723750363e-05, "loss": 0.0365, "step": 574 }, { "epoch": 0.43135783945986494, "grad_norm": 0.07905527204275131, "learning_rate": 9.598777744776464e-05, "loss": 0.0223, "step": 575 }, { "epoch": 0.43210802700675166, "grad_norm": 0.13041624426841736, "learning_rate": 9.596203879351582e-05, "loss": 0.0288, "step": 576 }, { "epoch": 0.43285821455363843, "grad_norm": 0.14066483080387115, "learning_rate": 9.593622131889643e-05, "loss": 0.0418, "step": 577 }, { "epoch": 0.43360840210052515, "grad_norm": 0.10811105370521545, "learning_rate": 9.591032506818089e-05, "loss": 0.0283, "step": 578 }, { "epoch": 0.43435858964741186, "grad_norm": 0.15928411483764648, "learning_rate": 9.588435008577873e-05, "loss": 0.0369, "step": 579 }, { "epoch": 0.4351087771942986, "grad_norm": 0.07709395885467529, "learning_rate": 9.585829641623448e-05, "loss": 0.02, "step": 580 }, { "epoch": 0.4358589647411853, "grad_norm": 0.13441017270088196, "learning_rate": 9.583216410422762e-05, "loss": 0.0369, "step": 581 }, { "epoch": 0.436609152288072, "grad_norm": 0.1272798478603363, "learning_rate": 9.580595319457249e-05, "loss": 0.0337, "step": 582 }, { "epoch": 0.4373593398349587, "grad_norm": 0.2848840653896332, "learning_rate": 9.577966373221823e-05, "loss": 0.0627, "step": 583 }, { "epoch": 0.43810952738184544, "grad_norm": 0.12250744551420212, "learning_rate": 9.575329576224868e-05, "loss": 0.0348, "step": 584 }, { "epoch": 0.43885971492873216, "grad_norm": 0.08176639676094055, "learning_rate": 9.572684932988227e-05, "loss": 0.0236, "step": 585 }, { "epoch": 0.43960990247561893, "grad_norm": 0.11825791746377945, "learning_rate": 9.570032448047208e-05, "loss": 0.0392, "step": 586 }, { "epoch": 0.44036009002250565, "grad_norm": 0.13997545838356018, "learning_rate": 9.567372125950559e-05, "loss": 0.0255, "step": 587 }, { "epoch": 0.44111027756939236, "grad_norm": 0.07865186035633087, "learning_rate": 9.564703971260472e-05, "loss": 0.0253, "step": 588 }, { "epoch": 0.4418604651162791, "grad_norm": 0.08131557703018188, "learning_rate": 9.562027988552567e-05, "loss": 0.0295, "step": 589 }, { "epoch": 0.4426106526631658, "grad_norm": 0.15946389734745026, "learning_rate": 9.559344182415891e-05, "loss": 0.0626, "step": 590 }, { "epoch": 0.4433608402100525, "grad_norm": 0.13182489573955536, "learning_rate": 9.55665255745291e-05, "loss": 0.0403, "step": 591 }, { "epoch": 0.4441110277569392, "grad_norm": 0.14255809783935547, "learning_rate": 9.553953118279496e-05, "loss": 0.0445, "step": 592 }, { "epoch": 0.44486121530382594, "grad_norm": 0.1584695428609848, "learning_rate": 9.551245869524916e-05, "loss": 0.0415, "step": 593 }, { "epoch": 0.44561140285071266, "grad_norm": 0.08315354585647583, "learning_rate": 9.54853081583184e-05, "loss": 0.0288, "step": 594 }, { "epoch": 0.4463615903975994, "grad_norm": 0.09172634780406952, "learning_rate": 9.545807961856317e-05, "loss": 0.0261, "step": 595 }, { "epoch": 0.44711177794448614, "grad_norm": 0.13752834498882294, "learning_rate": 9.543077312267773e-05, "loss": 0.0385, "step": 596 }, { "epoch": 0.44786196549137286, "grad_norm": 0.09433577209711075, "learning_rate": 9.540338871749002e-05, "loss": 0.0369, "step": 597 }, { "epoch": 0.4486121530382596, "grad_norm": 0.1977624148130417, "learning_rate": 9.537592644996162e-05, "loss": 0.0442, "step": 598 }, { "epoch": 0.4493623405851463, "grad_norm": 0.0797150656580925, "learning_rate": 9.534838636718759e-05, "loss": 0.0285, "step": 599 }, { "epoch": 0.450112528132033, "grad_norm": 0.1593594253063202, "learning_rate": 9.532076851639649e-05, "loss": 0.0451, "step": 600 }, { "epoch": 0.450112528132033, "eval_loss": 0.039720457047224045, "eval_runtime": 5.1368, "eval_samples_per_second": 10.512, "eval_steps_per_second": 2.725, "step": 600 }, { "epoch": 0.4508627156789197, "grad_norm": 0.05833355337381363, "learning_rate": 9.529307294495018e-05, "loss": 0.0185, "step": 601 }, { "epoch": 0.45161290322580644, "grad_norm": 0.1370234489440918, "learning_rate": 9.526529970034386e-05, "loss": 0.0461, "step": 602 }, { "epoch": 0.45236309077269315, "grad_norm": 0.11114072799682617, "learning_rate": 9.52374488302059e-05, "loss": 0.0355, "step": 603 }, { "epoch": 0.45311327831957987, "grad_norm": 0.11904539167881012, "learning_rate": 9.52095203822978e-05, "loss": 0.0278, "step": 604 }, { "epoch": 0.45386346586646664, "grad_norm": 0.14749003946781158, "learning_rate": 9.518151440451411e-05, "loss": 0.0367, "step": 605 }, { "epoch": 0.45461365341335336, "grad_norm": 0.12626592814922333, "learning_rate": 9.515343094488232e-05, "loss": 0.0326, "step": 606 }, { "epoch": 0.4553638409602401, "grad_norm": 0.1684754639863968, "learning_rate": 9.51252700515628e-05, "loss": 0.0349, "step": 607 }, { "epoch": 0.4561140285071268, "grad_norm": 0.1260501742362976, "learning_rate": 9.509703177284869e-05, "loss": 0.0336, "step": 608 }, { "epoch": 0.4568642160540135, "grad_norm": 0.21763825416564941, "learning_rate": 9.506871615716587e-05, "loss": 0.0622, "step": 609 }, { "epoch": 0.4576144036009002, "grad_norm": 0.12283841520547867, "learning_rate": 9.504032325307284e-05, "loss": 0.0307, "step": 610 }, { "epoch": 0.45836459114778694, "grad_norm": 0.17834042012691498, "learning_rate": 9.501185310926062e-05, "loss": 0.0459, "step": 611 }, { "epoch": 0.45911477869467365, "grad_norm": 0.20170065760612488, "learning_rate": 9.498330577455273e-05, "loss": 0.0412, "step": 612 }, { "epoch": 0.45986496624156037, "grad_norm": 0.1369791477918625, "learning_rate": 9.495468129790499e-05, "loss": 0.0353, "step": 613 }, { "epoch": 0.4606151537884471, "grad_norm": 0.10395601391792297, "learning_rate": 9.49259797284056e-05, "loss": 0.0361, "step": 614 }, { "epoch": 0.46136534133533386, "grad_norm": 0.12245897203683853, "learning_rate": 9.489720111527492e-05, "loss": 0.0541, "step": 615 }, { "epoch": 0.46211552888222057, "grad_norm": 0.11299577355384827, "learning_rate": 9.486834550786543e-05, "loss": 0.0309, "step": 616 }, { "epoch": 0.4628657164291073, "grad_norm": 0.08725673705339432, "learning_rate": 9.483941295566165e-05, "loss": 0.0261, "step": 617 }, { "epoch": 0.463615903975994, "grad_norm": 0.12845538556575775, "learning_rate": 9.481040350828006e-05, "loss": 0.0218, "step": 618 }, { "epoch": 0.4643660915228807, "grad_norm": 0.09690548479557037, "learning_rate": 9.4781317215469e-05, "loss": 0.0274, "step": 619 }, { "epoch": 0.46511627906976744, "grad_norm": 0.09320426732301712, "learning_rate": 9.475215412710864e-05, "loss": 0.0293, "step": 620 }, { "epoch": 0.46586646661665415, "grad_norm": 0.08421576768159866, "learning_rate": 9.472291429321075e-05, "loss": 0.0265, "step": 621 }, { "epoch": 0.46661665416354087, "grad_norm": 0.18840914964675903, "learning_rate": 9.469359776391879e-05, "loss": 0.0451, "step": 622 }, { "epoch": 0.4673668417104276, "grad_norm": 0.22850267589092255, "learning_rate": 9.466420458950773e-05, "loss": 0.0506, "step": 623 }, { "epoch": 0.46811702925731435, "grad_norm": 0.13049699366092682, "learning_rate": 9.463473482038395e-05, "loss": 0.035, "step": 624 }, { "epoch": 0.46886721680420107, "grad_norm": 0.1595403105020523, "learning_rate": 9.46051885070852e-05, "loss": 0.0391, "step": 625 }, { "epoch": 0.4696174043510878, "grad_norm": 0.149023175239563, "learning_rate": 9.457556570028052e-05, "loss": 0.0383, "step": 626 }, { "epoch": 0.4703675918979745, "grad_norm": 0.12804150581359863, "learning_rate": 9.454586645077011e-05, "loss": 0.0288, "step": 627 }, { "epoch": 0.4711177794448612, "grad_norm": 0.16897211968898773, "learning_rate": 9.451609080948522e-05, "loss": 0.055, "step": 628 }, { "epoch": 0.47186796699174793, "grad_norm": 0.10514383763074875, "learning_rate": 9.448623882748817e-05, "loss": 0.033, "step": 629 }, { "epoch": 0.47261815453863465, "grad_norm": 0.11677896976470947, "learning_rate": 9.445631055597217e-05, "loss": 0.0272, "step": 630 }, { "epoch": 0.47336834208552137, "grad_norm": 0.15391424298286438, "learning_rate": 9.442630604626126e-05, "loss": 0.0485, "step": 631 }, { "epoch": 0.4741185296324081, "grad_norm": 0.0809544175863266, "learning_rate": 9.43962253498102e-05, "loss": 0.0232, "step": 632 }, { "epoch": 0.4748687171792948, "grad_norm": 0.09526669234037399, "learning_rate": 9.436606851820444e-05, "loss": 0.0285, "step": 633 }, { "epoch": 0.47561890472618157, "grad_norm": 0.12024281173944473, "learning_rate": 9.433583560315999e-05, "loss": 0.0358, "step": 634 }, { "epoch": 0.4763690922730683, "grad_norm": 0.12453112006187439, "learning_rate": 9.430552665652328e-05, "loss": 0.0365, "step": 635 }, { "epoch": 0.477119279819955, "grad_norm": 0.10970939695835114, "learning_rate": 9.427514173027121e-05, "loss": 0.0293, "step": 636 }, { "epoch": 0.4778694673668417, "grad_norm": 0.10942260921001434, "learning_rate": 9.424468087651092e-05, "loss": 0.03, "step": 637 }, { "epoch": 0.47861965491372843, "grad_norm": 0.06871689856052399, "learning_rate": 9.421414414747978e-05, "loss": 0.019, "step": 638 }, { "epoch": 0.47936984246061515, "grad_norm": 0.08539148420095444, "learning_rate": 9.418353159554526e-05, "loss": 0.031, "step": 639 }, { "epoch": 0.48012003000750186, "grad_norm": 0.1259005069732666, "learning_rate": 9.415284327320489e-05, "loss": 0.0259, "step": 640 }, { "epoch": 0.4808702175543886, "grad_norm": 0.12335602939128876, "learning_rate": 9.41220792330861e-05, "loss": 0.0417, "step": 641 }, { "epoch": 0.4816204051012753, "grad_norm": 0.12800176441669464, "learning_rate": 9.40912395279462e-05, "loss": 0.0388, "step": 642 }, { "epoch": 0.48237059264816207, "grad_norm": 0.13285093009471893, "learning_rate": 9.406032421067224e-05, "loss": 0.0347, "step": 643 }, { "epoch": 0.4831207801950488, "grad_norm": 0.13197124004364014, "learning_rate": 9.402933333428097e-05, "loss": 0.0358, "step": 644 }, { "epoch": 0.4838709677419355, "grad_norm": 0.1399867832660675, "learning_rate": 9.399826695191868e-05, "loss": 0.0265, "step": 645 }, { "epoch": 0.4846211552888222, "grad_norm": 0.12037494033575058, "learning_rate": 9.396712511686114e-05, "loss": 0.0298, "step": 646 }, { "epoch": 0.48537134283570893, "grad_norm": 0.11193617433309555, "learning_rate": 9.393590788251354e-05, "loss": 0.0287, "step": 647 }, { "epoch": 0.48612153038259565, "grad_norm": 0.09210868179798126, "learning_rate": 9.390461530241037e-05, "loss": 0.0206, "step": 648 }, { "epoch": 0.48687171792948236, "grad_norm": 0.2255852371454239, "learning_rate": 9.38732474302153e-05, "loss": 0.0494, "step": 649 }, { "epoch": 0.4876219054763691, "grad_norm": 0.13794954121112823, "learning_rate": 9.384180431972119e-05, "loss": 0.0282, "step": 650 }, { "epoch": 0.4883720930232558, "grad_norm": 0.11250860244035721, "learning_rate": 9.381028602484984e-05, "loss": 0.0306, "step": 651 }, { "epoch": 0.4891222805701425, "grad_norm": 0.15982091426849365, "learning_rate": 9.377869259965202e-05, "loss": 0.0386, "step": 652 }, { "epoch": 0.4898724681170293, "grad_norm": 0.1598033607006073, "learning_rate": 9.374702409830736e-05, "loss": 0.0452, "step": 653 }, { "epoch": 0.490622655663916, "grad_norm": 0.20759710669517517, "learning_rate": 9.37152805751242e-05, "loss": 0.0378, "step": 654 }, { "epoch": 0.4913728432108027, "grad_norm": 0.130874902009964, "learning_rate": 9.36834620845396e-05, "loss": 0.0317, "step": 655 }, { "epoch": 0.49212303075768943, "grad_norm": 0.12012483179569244, "learning_rate": 9.365156868111908e-05, "loss": 0.0302, "step": 656 }, { "epoch": 0.49287321830457614, "grad_norm": 0.14452581107616425, "learning_rate": 9.361960041955672e-05, "loss": 0.0292, "step": 657 }, { "epoch": 0.49362340585146286, "grad_norm": 0.11331783980131149, "learning_rate": 9.358755735467494e-05, "loss": 0.0416, "step": 658 }, { "epoch": 0.4943735933983496, "grad_norm": 0.10472054779529572, "learning_rate": 9.355543954142446e-05, "loss": 0.0372, "step": 659 }, { "epoch": 0.4951237809452363, "grad_norm": 0.18476296961307526, "learning_rate": 9.352324703488412e-05, "loss": 0.0546, "step": 660 }, { "epoch": 0.495873968492123, "grad_norm": 0.15593048930168152, "learning_rate": 9.349097989026093e-05, "loss": 0.0359, "step": 661 }, { "epoch": 0.4966241560390098, "grad_norm": 0.1576404720544815, "learning_rate": 9.345863816288985e-05, "loss": 0.0433, "step": 662 }, { "epoch": 0.4973743435858965, "grad_norm": 0.10880573093891144, "learning_rate": 9.342622190823378e-05, "loss": 0.0277, "step": 663 }, { "epoch": 0.4981245311327832, "grad_norm": 0.1214156374335289, "learning_rate": 9.339373118188338e-05, "loss": 0.0298, "step": 664 }, { "epoch": 0.4988747186796699, "grad_norm": 0.07884982973337173, "learning_rate": 9.336116603955707e-05, "loss": 0.0221, "step": 665 }, { "epoch": 0.49962490622655664, "grad_norm": 0.09125591069459915, "learning_rate": 9.332852653710084e-05, "loss": 0.0289, "step": 666 }, { "epoch": 0.5003750937734434, "grad_norm": 0.18234345316886902, "learning_rate": 9.329581273048822e-05, "loss": 0.0373, "step": 667 }, { "epoch": 0.5011252813203301, "grad_norm": 0.13476620614528656, "learning_rate": 9.32630246758202e-05, "loss": 0.0298, "step": 668 }, { "epoch": 0.5018754688672168, "grad_norm": 0.11463253945112228, "learning_rate": 9.323016242932504e-05, "loss": 0.0448, "step": 669 }, { "epoch": 0.5026256564141035, "grad_norm": 0.09115209430456161, "learning_rate": 9.319722604735825e-05, "loss": 0.0261, "step": 670 }, { "epoch": 0.5033758439609902, "grad_norm": 0.1378633677959442, "learning_rate": 9.31642155864025e-05, "loss": 0.029, "step": 671 }, { "epoch": 0.5041260315078769, "grad_norm": 0.11013531684875488, "learning_rate": 9.313113110306748e-05, "loss": 0.025, "step": 672 }, { "epoch": 0.5048762190547637, "grad_norm": 0.1313575953245163, "learning_rate": 9.309797265408979e-05, "loss": 0.0355, "step": 673 }, { "epoch": 0.5056264066016504, "grad_norm": 0.0926070511341095, "learning_rate": 9.306474029633294e-05, "loss": 0.0244, "step": 674 }, { "epoch": 0.5063765941485371, "grad_norm": 0.11532235890626907, "learning_rate": 9.303143408678716e-05, "loss": 0.0347, "step": 675 }, { "epoch": 0.5071267816954238, "grad_norm": 0.11281560361385345, "learning_rate": 9.299805408256928e-05, "loss": 0.0226, "step": 676 }, { "epoch": 0.5078769692423106, "grad_norm": 0.17721490561962128, "learning_rate": 9.296460034092274e-05, "loss": 0.0347, "step": 677 }, { "epoch": 0.5086271567891973, "grad_norm": 0.09870696067810059, "learning_rate": 9.293107291921741e-05, "loss": 0.0255, "step": 678 }, { "epoch": 0.5093773443360841, "grad_norm": 0.08905252069234848, "learning_rate": 9.289747187494952e-05, "loss": 0.0202, "step": 679 }, { "epoch": 0.5101275318829708, "grad_norm": 0.18132604658603668, "learning_rate": 9.286379726574155e-05, "loss": 0.0331, "step": 680 }, { "epoch": 0.5108777194298575, "grad_norm": 0.10676857084035873, "learning_rate": 9.283004914934215e-05, "loss": 0.0293, "step": 681 }, { "epoch": 0.5116279069767442, "grad_norm": 0.13029693067073822, "learning_rate": 9.2796227583626e-05, "loss": 0.037, "step": 682 }, { "epoch": 0.5123780945236309, "grad_norm": 0.17007900774478912, "learning_rate": 9.276233262659375e-05, "loss": 0.0348, "step": 683 }, { "epoch": 0.5131282820705176, "grad_norm": 0.125155508518219, "learning_rate": 9.272836433637193e-05, "loss": 0.0345, "step": 684 }, { "epoch": 0.5138784696174044, "grad_norm": 0.1830897033214569, "learning_rate": 9.269432277121281e-05, "loss": 0.0329, "step": 685 }, { "epoch": 0.5146286571642911, "grad_norm": 0.15975606441497803, "learning_rate": 9.266020798949433e-05, "loss": 0.0311, "step": 686 }, { "epoch": 0.5153788447111778, "grad_norm": 0.12618814408779144, "learning_rate": 9.262602004971996e-05, "loss": 0.032, "step": 687 }, { "epoch": 0.5161290322580645, "grad_norm": 0.0827803909778595, "learning_rate": 9.259175901051867e-05, "loss": 0.0228, "step": 688 }, { "epoch": 0.5168792198049512, "grad_norm": 0.194037526845932, "learning_rate": 9.255742493064474e-05, "loss": 0.0437, "step": 689 }, { "epoch": 0.5176294073518379, "grad_norm": 0.19778898358345032, "learning_rate": 9.252301786897776e-05, "loss": 0.0482, "step": 690 }, { "epoch": 0.5183795948987246, "grad_norm": 0.190335214138031, "learning_rate": 9.248853788452247e-05, "loss": 0.048, "step": 691 }, { "epoch": 0.5191297824456114, "grad_norm": 0.2096565216779709, "learning_rate": 9.24539850364086e-05, "loss": 0.0326, "step": 692 }, { "epoch": 0.5198799699924981, "grad_norm": 0.19046910107135773, "learning_rate": 9.241935938389093e-05, "loss": 0.0405, "step": 693 }, { "epoch": 0.5206301575393848, "grad_norm": 0.12360625714063644, "learning_rate": 9.238466098634902e-05, "loss": 0.033, "step": 694 }, { "epoch": 0.5213803450862715, "grad_norm": 0.16132117807865143, "learning_rate": 9.234988990328719e-05, "loss": 0.0487, "step": 695 }, { "epoch": 0.5221305326331583, "grad_norm": 0.10174372047185898, "learning_rate": 9.231504619433445e-05, "loss": 0.0304, "step": 696 }, { "epoch": 0.522880720180045, "grad_norm": 0.13074737787246704, "learning_rate": 9.228012991924433e-05, "loss": 0.0397, "step": 697 }, { "epoch": 0.5236309077269318, "grad_norm": 0.10839260369539261, "learning_rate": 9.224514113789477e-05, "loss": 0.0317, "step": 698 }, { "epoch": 0.5243810952738185, "grad_norm": 0.14983467757701874, "learning_rate": 9.221007991028814e-05, "loss": 0.0411, "step": 699 }, { "epoch": 0.5251312828207052, "grad_norm": 0.08731064200401306, "learning_rate": 9.217494629655094e-05, "loss": 0.0227, "step": 700 }, { "epoch": 0.5258814703675919, "grad_norm": 0.1175181195139885, "learning_rate": 9.213974035693389e-05, "loss": 0.0285, "step": 701 }, { "epoch": 0.5266316579144786, "grad_norm": 0.12022960186004639, "learning_rate": 9.21044621518117e-05, "loss": 0.0292, "step": 702 }, { "epoch": 0.5273818454613654, "grad_norm": 0.12367639690637589, "learning_rate": 9.206911174168301e-05, "loss": 0.039, "step": 703 }, { "epoch": 0.5281320330082521, "grad_norm": 0.21845479309558868, "learning_rate": 9.20336891871703e-05, "loss": 0.0486, "step": 704 }, { "epoch": 0.5288822205551388, "grad_norm": 0.11741850525140762, "learning_rate": 9.199819454901977e-05, "loss": 0.0344, "step": 705 }, { "epoch": 0.5296324081020255, "grad_norm": 0.24551405012607574, "learning_rate": 9.196262788810121e-05, "loss": 0.0664, "step": 706 }, { "epoch": 0.5303825956489122, "grad_norm": 0.15015265345573425, "learning_rate": 9.192698926540795e-05, "loss": 0.0361, "step": 707 }, { "epoch": 0.5311327831957989, "grad_norm": 0.16593538224697113, "learning_rate": 9.189127874205674e-05, "loss": 0.0423, "step": 708 }, { "epoch": 0.5318829707426856, "grad_norm": 0.11336637288331985, "learning_rate": 9.185549637928758e-05, "loss": 0.0302, "step": 709 }, { "epoch": 0.5326331582895724, "grad_norm": 0.17877517640590668, "learning_rate": 9.181964223846371e-05, "loss": 0.0448, "step": 710 }, { "epoch": 0.5333833458364591, "grad_norm": 0.12126030772924423, "learning_rate": 9.178371638107146e-05, "loss": 0.0384, "step": 711 }, { "epoch": 0.5341335333833458, "grad_norm": 0.12825068831443787, "learning_rate": 9.174771886872011e-05, "loss": 0.0283, "step": 712 }, { "epoch": 0.5348837209302325, "grad_norm": 0.19938157498836517, "learning_rate": 9.17116497631419e-05, "loss": 0.0415, "step": 713 }, { "epoch": 0.5356339084771192, "grad_norm": 0.12208958715200424, "learning_rate": 9.167550912619173e-05, "loss": 0.034, "step": 714 }, { "epoch": 0.536384096024006, "grad_norm": 0.11766844242811203, "learning_rate": 9.16392970198473e-05, "loss": 0.0261, "step": 715 }, { "epoch": 0.5371342835708928, "grad_norm": 0.09077750146389008, "learning_rate": 9.160301350620875e-05, "loss": 0.0263, "step": 716 }, { "epoch": 0.5378844711177795, "grad_norm": 0.08629114925861359, "learning_rate": 9.156665864749876e-05, "loss": 0.0233, "step": 717 }, { "epoch": 0.5386346586646662, "grad_norm": 0.09420029819011688, "learning_rate": 9.153023250606234e-05, "loss": 0.0274, "step": 718 }, { "epoch": 0.5393848462115529, "grad_norm": 0.14515256881713867, "learning_rate": 9.14937351443667e-05, "loss": 0.0402, "step": 719 }, { "epoch": 0.5401350337584396, "grad_norm": 0.14991170167922974, "learning_rate": 9.145716662500126e-05, "loss": 0.0379, "step": 720 }, { "epoch": 0.5408852213053263, "grad_norm": 0.13430874049663544, "learning_rate": 9.142052701067741e-05, "loss": 0.0499, "step": 721 }, { "epoch": 0.5416354088522131, "grad_norm": 0.16690127551555634, "learning_rate": 9.13838163642285e-05, "loss": 0.0396, "step": 722 }, { "epoch": 0.5423855963990998, "grad_norm": 0.1311984360218048, "learning_rate": 9.134703474860963e-05, "loss": 0.0251, "step": 723 }, { "epoch": 0.5431357839459865, "grad_norm": 0.11423755437135696, "learning_rate": 9.13101822268977e-05, "loss": 0.0339, "step": 724 }, { "epoch": 0.5438859714928732, "grad_norm": 0.24795664846897125, "learning_rate": 9.127325886229115e-05, "loss": 0.0458, "step": 725 }, { "epoch": 0.5446361590397599, "grad_norm": 0.09746094048023224, "learning_rate": 9.123626471810988e-05, "loss": 0.0247, "step": 726 }, { "epoch": 0.5453863465866466, "grad_norm": 0.19780874252319336, "learning_rate": 9.119919985779521e-05, "loss": 0.0467, "step": 727 }, { "epoch": 0.5461365341335334, "grad_norm": 0.1802770495414734, "learning_rate": 9.116206434490976e-05, "loss": 0.0508, "step": 728 }, { "epoch": 0.5468867216804201, "grad_norm": 0.16529236733913422, "learning_rate": 9.112485824313726e-05, "loss": 0.0376, "step": 729 }, { "epoch": 0.5476369092273068, "grad_norm": 0.11283475160598755, "learning_rate": 9.10875816162825e-05, "loss": 0.0307, "step": 730 }, { "epoch": 0.5483870967741935, "grad_norm": 0.09951961785554886, "learning_rate": 9.105023452827121e-05, "loss": 0.0285, "step": 731 }, { "epoch": 0.5491372843210802, "grad_norm": 0.09138637781143188, "learning_rate": 9.101281704315002e-05, "loss": 0.0274, "step": 732 }, { "epoch": 0.5498874718679669, "grad_norm": 0.12191305309534073, "learning_rate": 9.097532922508619e-05, "loss": 0.0398, "step": 733 }, { "epoch": 0.5506376594148538, "grad_norm": 0.12989825010299683, "learning_rate": 9.093777113836765e-05, "loss": 0.0294, "step": 734 }, { "epoch": 0.5513878469617405, "grad_norm": 0.13126561045646667, "learning_rate": 9.090014284740283e-05, "loss": 0.0353, "step": 735 }, { "epoch": 0.5521380345086272, "grad_norm": 0.11954871565103531, "learning_rate": 9.086244441672052e-05, "loss": 0.0298, "step": 736 }, { "epoch": 0.5528882220555139, "grad_norm": 0.20153184235095978, "learning_rate": 9.082467591096982e-05, "loss": 0.0482, "step": 737 }, { "epoch": 0.5536384096024006, "grad_norm": 0.1392621099948883, "learning_rate": 9.078683739492002e-05, "loss": 0.0382, "step": 738 }, { "epoch": 0.5543885971492873, "grad_norm": 0.16660861670970917, "learning_rate": 9.074892893346043e-05, "loss": 0.0419, "step": 739 }, { "epoch": 0.5551387846961741, "grad_norm": 0.0964227095246315, "learning_rate": 9.071095059160035e-05, "loss": 0.0267, "step": 740 }, { "epoch": 0.5558889722430608, "grad_norm": 0.0833270475268364, "learning_rate": 9.067290243446887e-05, "loss": 0.0233, "step": 741 }, { "epoch": 0.5566391597899475, "grad_norm": 0.09658671915531158, "learning_rate": 9.063478452731484e-05, "loss": 0.0246, "step": 742 }, { "epoch": 0.5573893473368342, "grad_norm": 0.12930825352668762, "learning_rate": 9.059659693550673e-05, "loss": 0.0336, "step": 743 }, { "epoch": 0.5581395348837209, "grad_norm": 0.09614095836877823, "learning_rate": 9.055833972453249e-05, "loss": 0.0335, "step": 744 }, { "epoch": 0.5588897224306076, "grad_norm": 0.22925931215286255, "learning_rate": 9.052001295999947e-05, "loss": 0.0664, "step": 745 }, { "epoch": 0.5596399099774944, "grad_norm": 0.13005341589450836, "learning_rate": 9.048161670763429e-05, "loss": 0.0297, "step": 746 }, { "epoch": 0.5603900975243811, "grad_norm": 0.10627295076847076, "learning_rate": 9.044315103328276e-05, "loss": 0.0259, "step": 747 }, { "epoch": 0.5611402850712678, "grad_norm": 0.1018269807100296, "learning_rate": 9.04046160029097e-05, "loss": 0.0275, "step": 748 }, { "epoch": 0.5618904726181545, "grad_norm": 0.13973957300186157, "learning_rate": 9.036601168259893e-05, "loss": 0.0283, "step": 749 }, { "epoch": 0.5626406601650412, "grad_norm": 0.17691271007061005, "learning_rate": 9.032733813855301e-05, "loss": 0.0356, "step": 750 }, { "epoch": 0.5633908477119279, "grad_norm": 0.10310542583465576, "learning_rate": 9.02885954370933e-05, "loss": 0.0272, "step": 751 }, { "epoch": 0.5641410352588146, "grad_norm": 0.1504839062690735, "learning_rate": 9.02497836446597e-05, "loss": 0.0411, "step": 752 }, { "epoch": 0.5648912228057015, "grad_norm": 0.13666382431983948, "learning_rate": 9.021090282781059e-05, "loss": 0.0311, "step": 753 }, { "epoch": 0.5656414103525882, "grad_norm": 0.13605177402496338, "learning_rate": 9.01719530532228e-05, "loss": 0.0389, "step": 754 }, { "epoch": 0.5663915978994749, "grad_norm": 0.07885899394750595, "learning_rate": 9.01329343876913e-05, "loss": 0.021, "step": 755 }, { "epoch": 0.5671417854463616, "grad_norm": 0.13754671812057495, "learning_rate": 9.009384689812928e-05, "loss": 0.0401, "step": 756 }, { "epoch": 0.5678919729932483, "grad_norm": 0.09757539629936218, "learning_rate": 9.005469065156795e-05, "loss": 0.0228, "step": 757 }, { "epoch": 0.568642160540135, "grad_norm": 0.17368543148040771, "learning_rate": 9.00154657151564e-05, "loss": 0.04, "step": 758 }, { "epoch": 0.5693923480870218, "grad_norm": 0.11420293897390366, "learning_rate": 8.997617215616154e-05, "loss": 0.0335, "step": 759 }, { "epoch": 0.5701425356339085, "grad_norm": 0.11507721245288849, "learning_rate": 8.993681004196797e-05, "loss": 0.0282, "step": 760 }, { "epoch": 0.5708927231807952, "grad_norm": 0.18800199031829834, "learning_rate": 8.989737944007781e-05, "loss": 0.0238, "step": 761 }, { "epoch": 0.5716429107276819, "grad_norm": 0.174385666847229, "learning_rate": 8.985788041811068e-05, "loss": 0.0454, "step": 762 }, { "epoch": 0.5723930982745686, "grad_norm": 0.14115120470523834, "learning_rate": 8.981831304380348e-05, "loss": 0.0399, "step": 763 }, { "epoch": 0.5731432858214554, "grad_norm": 0.11411836743354797, "learning_rate": 8.97786773850104e-05, "loss": 0.0284, "step": 764 }, { "epoch": 0.5738934733683421, "grad_norm": 0.08721485733985901, "learning_rate": 8.973897350970269e-05, "loss": 0.0292, "step": 765 }, { "epoch": 0.5746436609152288, "grad_norm": 0.17708098888397217, "learning_rate": 8.969920148596857e-05, "loss": 0.0521, "step": 766 }, { "epoch": 0.5753938484621155, "grad_norm": 0.16419649124145508, "learning_rate": 8.965936138201314e-05, "loss": 0.0354, "step": 767 }, { "epoch": 0.5761440360090022, "grad_norm": 0.16116437315940857, "learning_rate": 8.961945326615829e-05, "loss": 0.0466, "step": 768 }, { "epoch": 0.5768942235558889, "grad_norm": 0.09195220470428467, "learning_rate": 8.957947720684246e-05, "loss": 0.0319, "step": 769 }, { "epoch": 0.5776444111027756, "grad_norm": 0.13473393023014069, "learning_rate": 8.953943327262066e-05, "loss": 0.0383, "step": 770 }, { "epoch": 0.5783945986496624, "grad_norm": 0.14092779159545898, "learning_rate": 8.949932153216434e-05, "loss": 0.0282, "step": 771 }, { "epoch": 0.5791447861965492, "grad_norm": 0.07207917422056198, "learning_rate": 8.945914205426116e-05, "loss": 0.0196, "step": 772 }, { "epoch": 0.5798949737434359, "grad_norm": 0.14177407324314117, "learning_rate": 8.941889490781494e-05, "loss": 0.0343, "step": 773 }, { "epoch": 0.5806451612903226, "grad_norm": 0.12721553444862366, "learning_rate": 8.937858016184563e-05, "loss": 0.0355, "step": 774 }, { "epoch": 0.5813953488372093, "grad_norm": 0.11577153205871582, "learning_rate": 8.933819788548899e-05, "loss": 0.0388, "step": 775 }, { "epoch": 0.582145536384096, "grad_norm": 0.06961927562952042, "learning_rate": 8.92977481479967e-05, "loss": 0.0222, "step": 776 }, { "epoch": 0.5828957239309828, "grad_norm": 0.12361857295036316, "learning_rate": 8.925723101873603e-05, "loss": 0.0346, "step": 777 }, { "epoch": 0.5836459114778695, "grad_norm": 0.08191924542188644, "learning_rate": 8.92166465671899e-05, "loss": 0.027, "step": 778 }, { "epoch": 0.5843960990247562, "grad_norm": 0.17663846909999847, "learning_rate": 8.917599486295664e-05, "loss": 0.0468, "step": 779 }, { "epoch": 0.5851462865716429, "grad_norm": 0.09279919415712357, "learning_rate": 8.913527597574991e-05, "loss": 0.0266, "step": 780 }, { "epoch": 0.5858964741185296, "grad_norm": 0.0862743929028511, "learning_rate": 8.90944899753986e-05, "loss": 0.0243, "step": 781 }, { "epoch": 0.5866466616654163, "grad_norm": 0.1053123027086258, "learning_rate": 8.905363693184668e-05, "loss": 0.0215, "step": 782 }, { "epoch": 0.5873968492123031, "grad_norm": 0.15901346504688263, "learning_rate": 8.901271691515309e-05, "loss": 0.0405, "step": 783 }, { "epoch": 0.5881470367591898, "grad_norm": 0.09581103920936584, "learning_rate": 8.897172999549165e-05, "loss": 0.0254, "step": 784 }, { "epoch": 0.5888972243060765, "grad_norm": 0.09868829697370529, "learning_rate": 8.893067624315088e-05, "loss": 0.0188, "step": 785 }, { "epoch": 0.5896474118529632, "grad_norm": 0.16848154366016388, "learning_rate": 8.888955572853392e-05, "loss": 0.0334, "step": 786 }, { "epoch": 0.5903975993998499, "grad_norm": 0.08405663818120956, "learning_rate": 8.884836852215841e-05, "loss": 0.0233, "step": 787 }, { "epoch": 0.5911477869467366, "grad_norm": 0.20354647934436798, "learning_rate": 8.880711469465635e-05, "loss": 0.0414, "step": 788 }, { "epoch": 0.5918979744936234, "grad_norm": 0.10922642797231674, "learning_rate": 8.876579431677398e-05, "loss": 0.0304, "step": 789 }, { "epoch": 0.5926481620405101, "grad_norm": 0.17113949358463287, "learning_rate": 8.87244074593717e-05, "loss": 0.035, "step": 790 }, { "epoch": 0.5933983495873969, "grad_norm": 0.16370461881160736, "learning_rate": 8.868295419342389e-05, "loss": 0.0338, "step": 791 }, { "epoch": 0.5941485371342836, "grad_norm": 0.15194040536880493, "learning_rate": 8.86414345900188e-05, "loss": 0.0466, "step": 792 }, { "epoch": 0.5948987246811703, "grad_norm": 0.13242392241954803, "learning_rate": 8.859984872035849e-05, "loss": 0.0378, "step": 793 }, { "epoch": 0.595648912228057, "grad_norm": 0.10735834389925003, "learning_rate": 8.85581966557586e-05, "loss": 0.0319, "step": 794 }, { "epoch": 0.5963990997749438, "grad_norm": 0.11282511800527573, "learning_rate": 8.851647846764835e-05, "loss": 0.0244, "step": 795 }, { "epoch": 0.5971492873218305, "grad_norm": 0.14782768487930298, "learning_rate": 8.847469422757031e-05, "loss": 0.0274, "step": 796 }, { "epoch": 0.5978994748687172, "grad_norm": 0.09905406087636948, "learning_rate": 8.843284400718033e-05, "loss": 0.0215, "step": 797 }, { "epoch": 0.5986496624156039, "grad_norm": 0.1274043172597885, "learning_rate": 8.839092787824743e-05, "loss": 0.0331, "step": 798 }, { "epoch": 0.5993998499624906, "grad_norm": 0.06478995829820633, "learning_rate": 8.834894591265364e-05, "loss": 0.0224, "step": 799 }, { "epoch": 0.6001500375093773, "grad_norm": 0.09536460041999817, "learning_rate": 8.830689818239388e-05, "loss": 0.0267, "step": 800 }, { "epoch": 0.6001500375093773, "eval_loss": 0.034302037209272385, "eval_runtime": 5.1278, "eval_samples_per_second": 10.531, "eval_steps_per_second": 2.73, "step": 800 }, { "epoch": 0.6009002250562641, "grad_norm": 0.09608499705791473, "learning_rate": 8.826478475957589e-05, "loss": 0.0193, "step": 801 }, { "epoch": 0.6016504126031508, "grad_norm": 0.11023902148008347, "learning_rate": 8.822260571642005e-05, "loss": 0.027, "step": 802 }, { "epoch": 0.6024006001500375, "grad_norm": 0.07215134054422379, "learning_rate": 8.818036112525924e-05, "loss": 0.0268, "step": 803 }, { "epoch": 0.6031507876969242, "grad_norm": 0.14665848016738892, "learning_rate": 8.813805105853879e-05, "loss": 0.046, "step": 804 }, { "epoch": 0.6039009752438109, "grad_norm": 0.08387989550828934, "learning_rate": 8.809567558881628e-05, "loss": 0.0212, "step": 805 }, { "epoch": 0.6046511627906976, "grad_norm": 0.15180405974388123, "learning_rate": 8.805323478876149e-05, "loss": 0.0384, "step": 806 }, { "epoch": 0.6054013503375844, "grad_norm": 0.12858478724956512, "learning_rate": 8.80107287311562e-05, "loss": 0.0353, "step": 807 }, { "epoch": 0.6061515378844711, "grad_norm": 0.11187388002872467, "learning_rate": 8.796815748889413e-05, "loss": 0.0276, "step": 808 }, { "epoch": 0.6069017254313578, "grad_norm": 0.07408938556909561, "learning_rate": 8.792552113498073e-05, "loss": 0.0224, "step": 809 }, { "epoch": 0.6076519129782446, "grad_norm": 0.1739557683467865, "learning_rate": 8.788281974253318e-05, "loss": 0.0476, "step": 810 }, { "epoch": 0.6084021005251313, "grad_norm": 0.1207609549164772, "learning_rate": 8.784005338478017e-05, "loss": 0.0421, "step": 811 }, { "epoch": 0.609152288072018, "grad_norm": 0.1739661693572998, "learning_rate": 8.779722213506178e-05, "loss": 0.0517, "step": 812 }, { "epoch": 0.6099024756189048, "grad_norm": 0.1354176253080368, "learning_rate": 8.775432606682937e-05, "loss": 0.049, "step": 813 }, { "epoch": 0.6106526631657915, "grad_norm": 0.12110552936792374, "learning_rate": 8.77113652536455e-05, "loss": 0.0367, "step": 814 }, { "epoch": 0.6114028507126782, "grad_norm": 0.26730018854141235, "learning_rate": 8.766833976918371e-05, "loss": 0.0467, "step": 815 }, { "epoch": 0.6121530382595649, "grad_norm": 0.19855856895446777, "learning_rate": 8.76252496872285e-05, "loss": 0.04, "step": 816 }, { "epoch": 0.6129032258064516, "grad_norm": 0.07605290412902832, "learning_rate": 8.758209508167508e-05, "loss": 0.0223, "step": 817 }, { "epoch": 0.6136534133533383, "grad_norm": 0.11542303115129471, "learning_rate": 8.753887602652937e-05, "loss": 0.0439, "step": 818 }, { "epoch": 0.614403600900225, "grad_norm": 0.1014133095741272, "learning_rate": 8.74955925959078e-05, "loss": 0.0233, "step": 819 }, { "epoch": 0.6151537884471118, "grad_norm": 0.14494650065898895, "learning_rate": 8.745224486403718e-05, "loss": 0.043, "step": 820 }, { "epoch": 0.6159039759939985, "grad_norm": 0.1557653397321701, "learning_rate": 8.74088329052546e-05, "loss": 0.0375, "step": 821 }, { "epoch": 0.6166541635408852, "grad_norm": 0.13315404951572418, "learning_rate": 8.73653567940073e-05, "loss": 0.0413, "step": 822 }, { "epoch": 0.6174043510877719, "grad_norm": 0.10892429202795029, "learning_rate": 8.732181660485252e-05, "loss": 0.0272, "step": 823 }, { "epoch": 0.6181545386346586, "grad_norm": 0.10928978025913239, "learning_rate": 8.727821241245742e-05, "loss": 0.0236, "step": 824 }, { "epoch": 0.6189047261815454, "grad_norm": 0.08895453810691833, "learning_rate": 8.723454429159888e-05, "loss": 0.0205, "step": 825 }, { "epoch": 0.6196549137284321, "grad_norm": 0.122958704829216, "learning_rate": 8.719081231716341e-05, "loss": 0.0264, "step": 826 }, { "epoch": 0.6204051012753188, "grad_norm": 0.09298314154148102, "learning_rate": 8.714701656414708e-05, "loss": 0.021, "step": 827 }, { "epoch": 0.6211552888222055, "grad_norm": 0.17051957547664642, "learning_rate": 8.710315710765526e-05, "loss": 0.0351, "step": 828 }, { "epoch": 0.6219054763690923, "grad_norm": 0.15878413617610931, "learning_rate": 8.705923402290261e-05, "loss": 0.0339, "step": 829 }, { "epoch": 0.622655663915979, "grad_norm": 0.08549269288778305, "learning_rate": 8.701524738521291e-05, "loss": 0.0239, "step": 830 }, { "epoch": 0.6234058514628658, "grad_norm": 0.17661143839359283, "learning_rate": 8.697119727001887e-05, "loss": 0.0477, "step": 831 }, { "epoch": 0.6241560390097525, "grad_norm": 0.13153639435768127, "learning_rate": 8.692708375286217e-05, "loss": 0.0311, "step": 832 }, { "epoch": 0.6249062265566392, "grad_norm": 0.18589378893375397, "learning_rate": 8.688290690939307e-05, "loss": 0.0434, "step": 833 }, { "epoch": 0.6256564141035259, "grad_norm": 0.1077880933880806, "learning_rate": 8.683866681537054e-05, "loss": 0.0281, "step": 834 }, { "epoch": 0.6264066016504126, "grad_norm": 0.062748022377491, "learning_rate": 8.679436354666202e-05, "loss": 0.016, "step": 835 }, { "epoch": 0.6271567891972993, "grad_norm": 0.16130079329013824, "learning_rate": 8.67499971792432e-05, "loss": 0.0409, "step": 836 }, { "epoch": 0.627906976744186, "grad_norm": 0.10744091123342514, "learning_rate": 8.670556778919805e-05, "loss": 0.0301, "step": 837 }, { "epoch": 0.6286571642910728, "grad_norm": 0.36741015315055847, "learning_rate": 8.666107545271859e-05, "loss": 0.0771, "step": 838 }, { "epoch": 0.6294073518379595, "grad_norm": 0.1694938838481903, "learning_rate": 8.661652024610482e-05, "loss": 0.0511, "step": 839 }, { "epoch": 0.6301575393848462, "grad_norm": 0.10456728935241699, "learning_rate": 8.657190224576453e-05, "loss": 0.0228, "step": 840 }, { "epoch": 0.6309077269317329, "grad_norm": 0.086213119328022, "learning_rate": 8.652722152821318e-05, "loss": 0.0285, "step": 841 }, { "epoch": 0.6316579144786196, "grad_norm": 0.08290264755487442, "learning_rate": 8.64824781700738e-05, "loss": 0.0269, "step": 842 }, { "epoch": 0.6324081020255063, "grad_norm": 0.12326061725616455, "learning_rate": 8.643767224807685e-05, "loss": 0.0391, "step": 843 }, { "epoch": 0.6331582895723931, "grad_norm": 0.09560957551002502, "learning_rate": 8.639280383906008e-05, "loss": 0.0352, "step": 844 }, { "epoch": 0.6339084771192798, "grad_norm": 0.23914098739624023, "learning_rate": 8.634787301996839e-05, "loss": 0.0523, "step": 845 }, { "epoch": 0.6346586646661665, "grad_norm": 0.10551799088716507, "learning_rate": 8.630287986785368e-05, "loss": 0.0322, "step": 846 }, { "epoch": 0.6354088522130532, "grad_norm": 0.12217076122760773, "learning_rate": 8.625782445987483e-05, "loss": 0.0344, "step": 847 }, { "epoch": 0.63615903975994, "grad_norm": 0.1267174631357193, "learning_rate": 8.621270687329738e-05, "loss": 0.0341, "step": 848 }, { "epoch": 0.6369092273068268, "grad_norm": 0.12219149619340897, "learning_rate": 8.616752718549359e-05, "loss": 0.0411, "step": 849 }, { "epoch": 0.6376594148537135, "grad_norm": 0.13776132464408875, "learning_rate": 8.612228547394215e-05, "loss": 0.0464, "step": 850 }, { "epoch": 0.6384096024006002, "grad_norm": 0.19186848402023315, "learning_rate": 8.607698181622814e-05, "loss": 0.0501, "step": 851 }, { "epoch": 0.6391597899474869, "grad_norm": 0.14366553723812103, "learning_rate": 8.603161629004287e-05, "loss": 0.0402, "step": 852 }, { "epoch": 0.6399099774943736, "grad_norm": 0.14673426747322083, "learning_rate": 8.598618897318375e-05, "loss": 0.0432, "step": 853 }, { "epoch": 0.6406601650412603, "grad_norm": 0.1311863660812378, "learning_rate": 8.594069994355419e-05, "loss": 0.0285, "step": 854 }, { "epoch": 0.641410352588147, "grad_norm": 0.1472454071044922, "learning_rate": 8.589514927916336e-05, "loss": 0.0316, "step": 855 }, { "epoch": 0.6421605401350338, "grad_norm": 0.09931936115026474, "learning_rate": 8.584953705812615e-05, "loss": 0.0284, "step": 856 }, { "epoch": 0.6429107276819205, "grad_norm": 0.11538329720497131, "learning_rate": 8.580386335866308e-05, "loss": 0.0325, "step": 857 }, { "epoch": 0.6436609152288072, "grad_norm": 0.09810412675142288, "learning_rate": 8.575812825909998e-05, "loss": 0.0266, "step": 858 }, { "epoch": 0.6444111027756939, "grad_norm": 0.11734863370656967, "learning_rate": 8.57123318378681e-05, "loss": 0.0347, "step": 859 }, { "epoch": 0.6451612903225806, "grad_norm": 0.1653948426246643, "learning_rate": 8.566647417350378e-05, "loss": 0.0318, "step": 860 }, { "epoch": 0.6459114778694673, "grad_norm": 0.14309543371200562, "learning_rate": 8.562055534464838e-05, "loss": 0.0332, "step": 861 }, { "epoch": 0.6466616654163541, "grad_norm": 0.1541086882352829, "learning_rate": 8.557457543004819e-05, "loss": 0.0399, "step": 862 }, { "epoch": 0.6474118529632408, "grad_norm": 0.11427025496959686, "learning_rate": 8.552853450855422e-05, "loss": 0.0332, "step": 863 }, { "epoch": 0.6481620405101275, "grad_norm": 0.10530763864517212, "learning_rate": 8.548243265912213e-05, "loss": 0.0365, "step": 864 }, { "epoch": 0.6489122280570142, "grad_norm": 0.10539809614419937, "learning_rate": 8.543626996081202e-05, "loss": 0.0223, "step": 865 }, { "epoch": 0.6496624156039009, "grad_norm": 0.1444813460111618, "learning_rate": 8.539004649278841e-05, "loss": 0.0325, "step": 866 }, { "epoch": 0.6504126031507877, "grad_norm": 0.10531052201986313, "learning_rate": 8.534376233432e-05, "loss": 0.0231, "step": 867 }, { "epoch": 0.6511627906976745, "grad_norm": 0.0951962098479271, "learning_rate": 8.529741756477953e-05, "loss": 0.0311, "step": 868 }, { "epoch": 0.6519129782445612, "grad_norm": 0.09423990547657013, "learning_rate": 8.525101226364374e-05, "loss": 0.0328, "step": 869 }, { "epoch": 0.6526631657914479, "grad_norm": 0.10251491516828537, "learning_rate": 8.520454651049313e-05, "loss": 0.0225, "step": 870 }, { "epoch": 0.6534133533383346, "grad_norm": 0.09833940863609314, "learning_rate": 8.51580203850119e-05, "loss": 0.023, "step": 871 }, { "epoch": 0.6541635408852213, "grad_norm": 0.10047540813684464, "learning_rate": 8.511143396698781e-05, "loss": 0.0324, "step": 872 }, { "epoch": 0.654913728432108, "grad_norm": 0.3961799740791321, "learning_rate": 8.506478733631193e-05, "loss": 0.06, "step": 873 }, { "epoch": 0.6556639159789948, "grad_norm": 0.10680108517408371, "learning_rate": 8.501808057297866e-05, "loss": 0.0269, "step": 874 }, { "epoch": 0.6564141035258815, "grad_norm": 0.10586558282375336, "learning_rate": 8.49713137570855e-05, "loss": 0.025, "step": 875 }, { "epoch": 0.6571642910727682, "grad_norm": 0.10135544836521149, "learning_rate": 8.492448696883292e-05, "loss": 0.0236, "step": 876 }, { "epoch": 0.6579144786196549, "grad_norm": 0.10831738263368607, "learning_rate": 8.487760028852427e-05, "loss": 0.0235, "step": 877 }, { "epoch": 0.6586646661665416, "grad_norm": 0.11996128410100937, "learning_rate": 8.483065379656558e-05, "loss": 0.0302, "step": 878 }, { "epoch": 0.6594148537134283, "grad_norm": 0.11158309131860733, "learning_rate": 8.478364757346546e-05, "loss": 0.026, "step": 879 }, { "epoch": 0.660165041260315, "grad_norm": 0.0804918184876442, "learning_rate": 8.473658169983496e-05, "loss": 0.0189, "step": 880 }, { "epoch": 0.6609152288072018, "grad_norm": 0.13769303262233734, "learning_rate": 8.468945625638742e-05, "loss": 0.0333, "step": 881 }, { "epoch": 0.6616654163540885, "grad_norm": 0.19014306366443634, "learning_rate": 8.464227132393831e-05, "loss": 0.0488, "step": 882 }, { "epoch": 0.6624156039009752, "grad_norm": 0.11594495177268982, "learning_rate": 8.459502698340519e-05, "loss": 0.0305, "step": 883 }, { "epoch": 0.6631657914478619, "grad_norm": 0.11026813834905624, "learning_rate": 8.45477233158074e-05, "loss": 0.0232, "step": 884 }, { "epoch": 0.6639159789947486, "grad_norm": 0.12670838832855225, "learning_rate": 8.450036040226612e-05, "loss": 0.0301, "step": 885 }, { "epoch": 0.6646661665416355, "grad_norm": 0.09368249028921127, "learning_rate": 8.445293832400402e-05, "loss": 0.0208, "step": 886 }, { "epoch": 0.6654163540885222, "grad_norm": 0.10260524600744247, "learning_rate": 8.440545716234538e-05, "loss": 0.0235, "step": 887 }, { "epoch": 0.6661665416354089, "grad_norm": 0.0919303148984909, "learning_rate": 8.435791699871564e-05, "loss": 0.0212, "step": 888 }, { "epoch": 0.6669167291822956, "grad_norm": 0.19766244292259216, "learning_rate": 8.431031791464155e-05, "loss": 0.0611, "step": 889 }, { "epoch": 0.6676669167291823, "grad_norm": 0.08774958550930023, "learning_rate": 8.426265999175081e-05, "loss": 0.0241, "step": 890 }, { "epoch": 0.668417104276069, "grad_norm": 0.16776250302791595, "learning_rate": 8.421494331177208e-05, "loss": 0.04, "step": 891 }, { "epoch": 0.6691672918229558, "grad_norm": 0.10904256254434586, "learning_rate": 8.41671679565348e-05, "loss": 0.0304, "step": 892 }, { "epoch": 0.6699174793698425, "grad_norm": 0.2265564650297165, "learning_rate": 8.411933400796896e-05, "loss": 0.047, "step": 893 }, { "epoch": 0.6706676669167292, "grad_norm": 0.10385122150182724, "learning_rate": 8.407144154810509e-05, "loss": 0.0331, "step": 894 }, { "epoch": 0.6714178544636159, "grad_norm": 0.11064758896827698, "learning_rate": 8.402349065907403e-05, "loss": 0.0298, "step": 895 }, { "epoch": 0.6721680420105026, "grad_norm": 0.17340891063213348, "learning_rate": 8.397548142310685e-05, "loss": 0.0362, "step": 896 }, { "epoch": 0.6729182295573893, "grad_norm": 0.16428762674331665, "learning_rate": 8.392741392253465e-05, "loss": 0.0454, "step": 897 }, { "epoch": 0.673668417104276, "grad_norm": 0.12004006654024124, "learning_rate": 8.387928823978846e-05, "loss": 0.0302, "step": 898 }, { "epoch": 0.6744186046511628, "grad_norm": 0.07734087109565735, "learning_rate": 8.383110445739907e-05, "loss": 0.0182, "step": 899 }, { "epoch": 0.6751687921980495, "grad_norm": 0.06279458105564117, "learning_rate": 8.378286265799698e-05, "loss": 0.0239, "step": 900 }, { "epoch": 0.6759189797449362, "grad_norm": 0.1248641386628151, "learning_rate": 8.373456292431206e-05, "loss": 0.0394, "step": 901 }, { "epoch": 0.6766691672918229, "grad_norm": 0.10201413929462433, "learning_rate": 8.368620533917363e-05, "loss": 0.0307, "step": 902 }, { "epoch": 0.6774193548387096, "grad_norm": 0.09084980189800262, "learning_rate": 8.363778998551018e-05, "loss": 0.0331, "step": 903 }, { "epoch": 0.6781695423855963, "grad_norm": 0.09921452403068542, "learning_rate": 8.358931694634928e-05, "loss": 0.0287, "step": 904 }, { "epoch": 0.6789197299324832, "grad_norm": 0.1051078587770462, "learning_rate": 8.35407863048174e-05, "loss": 0.0284, "step": 905 }, { "epoch": 0.6796699174793699, "grad_norm": 0.09181918948888779, "learning_rate": 8.349219814413984e-05, "loss": 0.0207, "step": 906 }, { "epoch": 0.6804201050262566, "grad_norm": 0.19342802464962006, "learning_rate": 8.344355254764049e-05, "loss": 0.0405, "step": 907 }, { "epoch": 0.6811702925731433, "grad_norm": 0.0752057135105133, "learning_rate": 8.339484959874178e-05, "loss": 0.0169, "step": 908 }, { "epoch": 0.68192048012003, "grad_norm": 0.1359151005744934, "learning_rate": 8.334608938096443e-05, "loss": 0.0338, "step": 909 }, { "epoch": 0.6826706676669168, "grad_norm": 0.15660357475280762, "learning_rate": 8.329727197792744e-05, "loss": 0.029, "step": 910 }, { "epoch": 0.6834208552138035, "grad_norm": 0.06472530961036682, "learning_rate": 8.324839747334787e-05, "loss": 0.0203, "step": 911 }, { "epoch": 0.6841710427606902, "grad_norm": 0.25108665227890015, "learning_rate": 8.319946595104065e-05, "loss": 0.0393, "step": 912 }, { "epoch": 0.6849212303075769, "grad_norm": 0.08752661198377609, "learning_rate": 8.315047749491851e-05, "loss": 0.0255, "step": 913 }, { "epoch": 0.6856714178544636, "grad_norm": 0.0976942628622055, "learning_rate": 8.310143218899187e-05, "loss": 0.0242, "step": 914 }, { "epoch": 0.6864216054013503, "grad_norm": 0.15198934078216553, "learning_rate": 8.305233011736857e-05, "loss": 0.0419, "step": 915 }, { "epoch": 0.687171792948237, "grad_norm": 0.11538293212652206, "learning_rate": 8.300317136425385e-05, "loss": 0.0273, "step": 916 }, { "epoch": 0.6879219804951238, "grad_norm": 0.16572171449661255, "learning_rate": 8.295395601395011e-05, "loss": 0.031, "step": 917 }, { "epoch": 0.6886721680420105, "grad_norm": 0.12102951854467392, "learning_rate": 8.290468415085683e-05, "loss": 0.0286, "step": 918 }, { "epoch": 0.6894223555888972, "grad_norm": 0.24656544625759125, "learning_rate": 8.285535585947042e-05, "loss": 0.0555, "step": 919 }, { "epoch": 0.6901725431357839, "grad_norm": 0.1117316484451294, "learning_rate": 8.280597122438404e-05, "loss": 0.0255, "step": 920 }, { "epoch": 0.6909227306826706, "grad_norm": 0.15054990351200104, "learning_rate": 8.275653033028745e-05, "loss": 0.0357, "step": 921 }, { "epoch": 0.6916729182295573, "grad_norm": 0.1011425033211708, "learning_rate": 8.270703326196696e-05, "loss": 0.0295, "step": 922 }, { "epoch": 0.6924231057764441, "grad_norm": 0.09935387969017029, "learning_rate": 8.265748010430513e-05, "loss": 0.029, "step": 923 }, { "epoch": 0.6931732933233309, "grad_norm": 0.1303011029958725, "learning_rate": 8.260787094228076e-05, "loss": 0.0336, "step": 924 }, { "epoch": 0.6939234808702176, "grad_norm": 0.09792095422744751, "learning_rate": 8.255820586096867e-05, "loss": 0.0303, "step": 925 }, { "epoch": 0.6946736684171043, "grad_norm": 0.11036955565214157, "learning_rate": 8.25084849455396e-05, "loss": 0.0345, "step": 926 }, { "epoch": 0.695423855963991, "grad_norm": 0.1368400603532791, "learning_rate": 8.245870828126e-05, "loss": 0.0487, "step": 927 }, { "epoch": 0.6961740435108777, "grad_norm": 0.13145577907562256, "learning_rate": 8.240887595349197e-05, "loss": 0.0379, "step": 928 }, { "epoch": 0.6969242310577645, "grad_norm": 0.10650297999382019, "learning_rate": 8.235898804769303e-05, "loss": 0.034, "step": 929 }, { "epoch": 0.6976744186046512, "grad_norm": 0.11778818815946579, "learning_rate": 8.230904464941604e-05, "loss": 0.0358, "step": 930 }, { "epoch": 0.6984246061515379, "grad_norm": 0.12128084897994995, "learning_rate": 8.225904584430901e-05, "loss": 0.0426, "step": 931 }, { "epoch": 0.6991747936984246, "grad_norm": 0.1473531872034073, "learning_rate": 8.220899171811495e-05, "loss": 0.0399, "step": 932 }, { "epoch": 0.6999249812453113, "grad_norm": 0.10071301460266113, "learning_rate": 8.215888235667176e-05, "loss": 0.0311, "step": 933 }, { "epoch": 0.700675168792198, "grad_norm": 0.1421838104724884, "learning_rate": 8.210871784591207e-05, "loss": 0.0399, "step": 934 }, { "epoch": 0.7014253563390848, "grad_norm": 0.11759334057569504, "learning_rate": 8.205849827186308e-05, "loss": 0.0273, "step": 935 }, { "epoch": 0.7021755438859715, "grad_norm": 0.09371329843997955, "learning_rate": 8.200822372064641e-05, "loss": 0.0282, "step": 936 }, { "epoch": 0.7029257314328582, "grad_norm": 0.08694262057542801, "learning_rate": 8.195789427847796e-05, "loss": 0.0212, "step": 937 }, { "epoch": 0.7036759189797449, "grad_norm": 0.08702065050601959, "learning_rate": 8.190751003166778e-05, "loss": 0.0205, "step": 938 }, { "epoch": 0.7044261065266316, "grad_norm": 0.11808331310749054, "learning_rate": 8.185707106661986e-05, "loss": 0.0356, "step": 939 }, { "epoch": 0.7051762940735183, "grad_norm": 0.12836730480194092, "learning_rate": 8.18065774698321e-05, "loss": 0.0312, "step": 940 }, { "epoch": 0.705926481620405, "grad_norm": 0.14114151895046234, "learning_rate": 8.175602932789601e-05, "loss": 0.0335, "step": 941 }, { "epoch": 0.7066766691672918, "grad_norm": 0.17943109571933746, "learning_rate": 8.17054267274967e-05, "loss": 0.0359, "step": 942 }, { "epoch": 0.7074268567141786, "grad_norm": 0.12193148583173752, "learning_rate": 8.165476975541264e-05, "loss": 0.0343, "step": 943 }, { "epoch": 0.7081770442610653, "grad_norm": 0.1203569769859314, "learning_rate": 8.160405849851556e-05, "loss": 0.0263, "step": 944 }, { "epoch": 0.708927231807952, "grad_norm": 0.08798082917928696, "learning_rate": 8.155329304377025e-05, "loss": 0.0249, "step": 945 }, { "epoch": 0.7096774193548387, "grad_norm": 0.12763750553131104, "learning_rate": 8.150247347823448e-05, "loss": 0.0363, "step": 946 }, { "epoch": 0.7104276069017255, "grad_norm": 0.12717604637145996, "learning_rate": 8.145159988905879e-05, "loss": 0.0406, "step": 947 }, { "epoch": 0.7111777944486122, "grad_norm": 0.10531704872846603, "learning_rate": 8.140067236348638e-05, "loss": 0.0323, "step": 948 }, { "epoch": 0.7119279819954989, "grad_norm": 0.15488344430923462, "learning_rate": 8.134969098885294e-05, "loss": 0.0403, "step": 949 }, { "epoch": 0.7126781695423856, "grad_norm": 0.12809032201766968, "learning_rate": 8.129865585258653e-05, "loss": 0.0329, "step": 950 }, { "epoch": 0.7134283570892723, "grad_norm": 0.09889086335897446, "learning_rate": 8.124756704220735e-05, "loss": 0.0277, "step": 951 }, { "epoch": 0.714178544636159, "grad_norm": 0.1394570767879486, "learning_rate": 8.11964246453277e-05, "loss": 0.0397, "step": 952 }, { "epoch": 0.7149287321830458, "grad_norm": 0.1902426928281784, "learning_rate": 8.114522874965174e-05, "loss": 0.0428, "step": 953 }, { "epoch": 0.7156789197299325, "grad_norm": 0.15730929374694824, "learning_rate": 8.10939794429754e-05, "loss": 0.0356, "step": 954 }, { "epoch": 0.7164291072768192, "grad_norm": 0.16836601495742798, "learning_rate": 8.10426768131862e-05, "loss": 0.0433, "step": 955 }, { "epoch": 0.7171792948237059, "grad_norm": 0.09496788680553436, "learning_rate": 8.099132094826308e-05, "loss": 0.0266, "step": 956 }, { "epoch": 0.7179294823705926, "grad_norm": 0.11788193881511688, "learning_rate": 8.093991193627631e-05, "loss": 0.0386, "step": 957 }, { "epoch": 0.7186796699174793, "grad_norm": 0.17791517078876495, "learning_rate": 8.088844986538727e-05, "loss": 0.0376, "step": 958 }, { "epoch": 0.719429857464366, "grad_norm": 0.13959939777851105, "learning_rate": 8.083693482384836e-05, "loss": 0.0319, "step": 959 }, { "epoch": 0.7201800450112528, "grad_norm": 0.21248754858970642, "learning_rate": 8.078536690000278e-05, "loss": 0.0425, "step": 960 }, { "epoch": 0.7209302325581395, "grad_norm": 0.131417915225029, "learning_rate": 8.073374618228445e-05, "loss": 0.0343, "step": 961 }, { "epoch": 0.7216804201050263, "grad_norm": 0.10697544366121292, "learning_rate": 8.068207275921782e-05, "loss": 0.0337, "step": 962 }, { "epoch": 0.722430607651913, "grad_norm": 0.09917831420898438, "learning_rate": 8.063034671941774e-05, "loss": 0.0225, "step": 963 }, { "epoch": 0.7231807951987997, "grad_norm": 0.14584000408649445, "learning_rate": 8.057856815158924e-05, "loss": 0.0366, "step": 964 }, { "epoch": 0.7239309827456865, "grad_norm": 0.08406969904899597, "learning_rate": 8.05267371445275e-05, "loss": 0.025, "step": 965 }, { "epoch": 0.7246811702925732, "grad_norm": 0.13167530298233032, "learning_rate": 8.047485378711756e-05, "loss": 0.0236, "step": 966 }, { "epoch": 0.7254313578394599, "grad_norm": 0.09242197871208191, "learning_rate": 8.042291816833429e-05, "loss": 0.0261, "step": 967 }, { "epoch": 0.7261815453863466, "grad_norm": 0.16659840941429138, "learning_rate": 8.037093037724216e-05, "loss": 0.0366, "step": 968 }, { "epoch": 0.7269317329332333, "grad_norm": 0.12860220670700073, "learning_rate": 8.031889050299511e-05, "loss": 0.0319, "step": 969 }, { "epoch": 0.72768192048012, "grad_norm": 0.1668856143951416, "learning_rate": 8.02667986348364e-05, "loss": 0.0578, "step": 970 }, { "epoch": 0.7284321080270068, "grad_norm": 0.11582063883543015, "learning_rate": 8.021465486209846e-05, "loss": 0.0374, "step": 971 }, { "epoch": 0.7291822955738935, "grad_norm": 0.10971240699291229, "learning_rate": 8.016245927420272e-05, "loss": 0.0355, "step": 972 }, { "epoch": 0.7299324831207802, "grad_norm": 0.11834264546632767, "learning_rate": 8.011021196065946e-05, "loss": 0.041, "step": 973 }, { "epoch": 0.7306826706676669, "grad_norm": 0.14551803469657898, "learning_rate": 8.005791301106769e-05, "loss": 0.037, "step": 974 }, { "epoch": 0.7314328582145536, "grad_norm": 0.10361959785223007, "learning_rate": 8.000556251511498e-05, "loss": 0.033, "step": 975 }, { "epoch": 0.7321830457614403, "grad_norm": 0.08322405815124512, "learning_rate": 7.995316056257723e-05, "loss": 0.0223, "step": 976 }, { "epoch": 0.732933233308327, "grad_norm": 0.10396233201026917, "learning_rate": 7.990070724331866e-05, "loss": 0.0228, "step": 977 }, { "epoch": 0.7336834208552138, "grad_norm": 0.1658412218093872, "learning_rate": 7.984820264729156e-05, "loss": 0.0441, "step": 978 }, { "epoch": 0.7344336084021005, "grad_norm": 0.09091618657112122, "learning_rate": 7.979564686453612e-05, "loss": 0.0258, "step": 979 }, { "epoch": 0.7351837959489872, "grad_norm": 0.10041514039039612, "learning_rate": 7.974303998518031e-05, "loss": 0.0229, "step": 980 }, { "epoch": 0.735933983495874, "grad_norm": 0.15248219668865204, "learning_rate": 7.96903820994398e-05, "loss": 0.0278, "step": 981 }, { "epoch": 0.7366841710427607, "grad_norm": 0.1488843411207199, "learning_rate": 7.963767329761762e-05, "loss": 0.0473, "step": 982 }, { "epoch": 0.7374343585896475, "grad_norm": 0.1008438840508461, "learning_rate": 7.958491367010423e-05, "loss": 0.027, "step": 983 }, { "epoch": 0.7381845461365342, "grad_norm": 0.10297491401433945, "learning_rate": 7.953210330737718e-05, "loss": 0.0319, "step": 984 }, { "epoch": 0.7389347336834209, "grad_norm": 0.11224748194217682, "learning_rate": 7.947924230000102e-05, "loss": 0.0254, "step": 985 }, { "epoch": 0.7396849212303076, "grad_norm": 0.1465216726064682, "learning_rate": 7.942633073862718e-05, "loss": 0.0352, "step": 986 }, { "epoch": 0.7404351087771943, "grad_norm": 0.18327566981315613, "learning_rate": 7.937336871399379e-05, "loss": 0.0414, "step": 987 }, { "epoch": 0.741185296324081, "grad_norm": 0.17351044714450836, "learning_rate": 7.932035631692549e-05, "loss": 0.0432, "step": 988 }, { "epoch": 0.7419354838709677, "grad_norm": 0.2046956866979599, "learning_rate": 7.926729363833335e-05, "loss": 0.0374, "step": 989 }, { "epoch": 0.7426856714178545, "grad_norm": 0.18247826397418976, "learning_rate": 7.921418076921461e-05, "loss": 0.0433, "step": 990 }, { "epoch": 0.7434358589647412, "grad_norm": 0.11379354447126389, "learning_rate": 7.916101780065263e-05, "loss": 0.0276, "step": 991 }, { "epoch": 0.7441860465116279, "grad_norm": 0.10715629160404205, "learning_rate": 7.910780482381665e-05, "loss": 0.0303, "step": 992 }, { "epoch": 0.7449362340585146, "grad_norm": 0.10846918076276779, "learning_rate": 7.905454192996169e-05, "loss": 0.029, "step": 993 }, { "epoch": 0.7456864216054013, "grad_norm": 0.1614159196615219, "learning_rate": 7.900122921042837e-05, "loss": 0.0332, "step": 994 }, { "epoch": 0.746436609152288, "grad_norm": 0.12448745965957642, "learning_rate": 7.894786675664277e-05, "loss": 0.0313, "step": 995 }, { "epoch": 0.7471867966991748, "grad_norm": 0.12879522144794464, "learning_rate": 7.88944546601162e-05, "loss": 0.0327, "step": 996 }, { "epoch": 0.7479369842460615, "grad_norm": 0.08149383217096329, "learning_rate": 7.884099301244519e-05, "loss": 0.0193, "step": 997 }, { "epoch": 0.7486871717929482, "grad_norm": 0.09928504377603531, "learning_rate": 7.878748190531118e-05, "loss": 0.0201, "step": 998 }, { "epoch": 0.7494373593398349, "grad_norm": 0.09547089040279388, "learning_rate": 7.873392143048047e-05, "loss": 0.0263, "step": 999 }, { "epoch": 0.7501875468867217, "grad_norm": 0.15654493868350983, "learning_rate": 7.868031167980397e-05, "loss": 0.0499, "step": 1000 }, { "epoch": 0.7501875468867217, "eval_loss": 0.033354200422763824, "eval_runtime": 5.1415, "eval_samples_per_second": 10.503, "eval_steps_per_second": 2.723, "step": 1000 }, { "epoch": 0.7509377344336085, "grad_norm": 0.15055902302265167, "learning_rate": 7.862665274521712e-05, "loss": 0.0369, "step": 1001 }, { "epoch": 0.7516879219804952, "grad_norm": 0.15537412464618683, "learning_rate": 7.857294471873975e-05, "loss": 0.0342, "step": 1002 }, { "epoch": 0.7524381095273819, "grad_norm": 0.20609122514724731, "learning_rate": 7.851918769247582e-05, "loss": 0.041, "step": 1003 }, { "epoch": 0.7531882970742686, "grad_norm": 0.09634378552436829, "learning_rate": 7.846538175861332e-05, "loss": 0.023, "step": 1004 }, { "epoch": 0.7539384846211553, "grad_norm": 0.147333025932312, "learning_rate": 7.841152700942413e-05, "loss": 0.0266, "step": 1005 }, { "epoch": 0.754688672168042, "grad_norm": 0.13298511505126953, "learning_rate": 7.835762353726386e-05, "loss": 0.0347, "step": 1006 }, { "epoch": 0.7554388597149287, "grad_norm": 0.15836963057518005, "learning_rate": 7.830367143457165e-05, "loss": 0.0396, "step": 1007 }, { "epoch": 0.7561890472618155, "grad_norm": 0.3826274275779724, "learning_rate": 7.824967079387002e-05, "loss": 0.0618, "step": 1008 }, { "epoch": 0.7569392348087022, "grad_norm": 0.18430115282535553, "learning_rate": 7.81956217077648e-05, "loss": 0.0463, "step": 1009 }, { "epoch": 0.7576894223555889, "grad_norm": 0.12182758003473282, "learning_rate": 7.814152426894478e-05, "loss": 0.0296, "step": 1010 }, { "epoch": 0.7584396099024756, "grad_norm": 0.1450658142566681, "learning_rate": 7.808737857018182e-05, "loss": 0.0414, "step": 1011 }, { "epoch": 0.7591897974493623, "grad_norm": 0.15949197113513947, "learning_rate": 7.803318470433042e-05, "loss": 0.0319, "step": 1012 }, { "epoch": 0.759939984996249, "grad_norm": 0.1380879431962967, "learning_rate": 7.797894276432772e-05, "loss": 0.0359, "step": 1013 }, { "epoch": 0.7606901725431358, "grad_norm": 0.10468295961618423, "learning_rate": 7.792465284319332e-05, "loss": 0.0292, "step": 1014 }, { "epoch": 0.7614403600900225, "grad_norm": 0.11396298557519913, "learning_rate": 7.787031503402907e-05, "loss": 0.0277, "step": 1015 }, { "epoch": 0.7621905476369092, "grad_norm": 0.154616579413414, "learning_rate": 7.781592943001899e-05, "loss": 0.0413, "step": 1016 }, { "epoch": 0.7629407351837959, "grad_norm": 0.10276743024587631, "learning_rate": 7.776149612442899e-05, "loss": 0.0276, "step": 1017 }, { "epoch": 0.7636909227306826, "grad_norm": 0.15479224920272827, "learning_rate": 7.770701521060688e-05, "loss": 0.0345, "step": 1018 }, { "epoch": 0.7644411102775694, "grad_norm": 0.11939147859811783, "learning_rate": 7.765248678198203e-05, "loss": 0.0259, "step": 1019 }, { "epoch": 0.7651912978244562, "grad_norm": 0.09478700160980225, "learning_rate": 7.759791093206534e-05, "loss": 0.0259, "step": 1020 }, { "epoch": 0.7659414853713429, "grad_norm": 0.1022457480430603, "learning_rate": 7.754328775444903e-05, "loss": 0.0284, "step": 1021 }, { "epoch": 0.7666916729182296, "grad_norm": 0.1470392495393753, "learning_rate": 7.748861734280643e-05, "loss": 0.0369, "step": 1022 }, { "epoch": 0.7674418604651163, "grad_norm": 0.10798605531454086, "learning_rate": 7.743389979089196e-05, "loss": 0.0243, "step": 1023 }, { "epoch": 0.768192048012003, "grad_norm": 0.09053485095500946, "learning_rate": 7.737913519254079e-05, "loss": 0.0252, "step": 1024 }, { "epoch": 0.7689422355588897, "grad_norm": 0.20014923810958862, "learning_rate": 7.732432364166884e-05, "loss": 0.0538, "step": 1025 }, { "epoch": 0.7696924231057765, "grad_norm": 0.2505134642124176, "learning_rate": 7.726946523227251e-05, "loss": 0.0463, "step": 1026 }, { "epoch": 0.7704426106526632, "grad_norm": 0.09509759396314621, "learning_rate": 7.721456005842861e-05, "loss": 0.0261, "step": 1027 }, { "epoch": 0.7711927981995499, "grad_norm": 0.09843521565198898, "learning_rate": 7.715960821429404e-05, "loss": 0.0269, "step": 1028 }, { "epoch": 0.7719429857464366, "grad_norm": 0.13212156295776367, "learning_rate": 7.710460979410585e-05, "loss": 0.0334, "step": 1029 }, { "epoch": 0.7726931732933233, "grad_norm": 0.18599221110343933, "learning_rate": 7.704956489218091e-05, "loss": 0.0477, "step": 1030 }, { "epoch": 0.77344336084021, "grad_norm": 0.15956397354602814, "learning_rate": 7.699447360291576e-05, "loss": 0.0424, "step": 1031 }, { "epoch": 0.7741935483870968, "grad_norm": 0.10117616504430771, "learning_rate": 7.69393360207866e-05, "loss": 0.0224, "step": 1032 }, { "epoch": 0.7749437359339835, "grad_norm": 0.09441590309143066, "learning_rate": 7.688415224034893e-05, "loss": 0.0273, "step": 1033 }, { "epoch": 0.7756939234808702, "grad_norm": 0.08132372796535492, "learning_rate": 7.682892235623749e-05, "loss": 0.0216, "step": 1034 }, { "epoch": 0.7764441110277569, "grad_norm": 0.08580277860164642, "learning_rate": 7.67736464631661e-05, "loss": 0.0218, "step": 1035 }, { "epoch": 0.7771942985746436, "grad_norm": 0.14287903904914856, "learning_rate": 7.671832465592746e-05, "loss": 0.0366, "step": 1036 }, { "epoch": 0.7779444861215303, "grad_norm": 0.11992555856704712, "learning_rate": 7.666295702939305e-05, "loss": 0.0336, "step": 1037 }, { "epoch": 0.7786946736684172, "grad_norm": 0.12070818245410919, "learning_rate": 7.660754367851286e-05, "loss": 0.0308, "step": 1038 }, { "epoch": 0.7794448612153039, "grad_norm": 0.17960987985134125, "learning_rate": 7.655208469831536e-05, "loss": 0.0467, "step": 1039 }, { "epoch": 0.7801950487621906, "grad_norm": 0.1322227567434311, "learning_rate": 7.649658018390725e-05, "loss": 0.0258, "step": 1040 }, { "epoch": 0.7809452363090773, "grad_norm": 0.07779379934072495, "learning_rate": 7.644103023047327e-05, "loss": 0.0215, "step": 1041 }, { "epoch": 0.781695423855964, "grad_norm": 0.11337679624557495, "learning_rate": 7.638543493327613e-05, "loss": 0.0267, "step": 1042 }, { "epoch": 0.7824456114028507, "grad_norm": 0.1161460429430008, "learning_rate": 7.63297943876563e-05, "loss": 0.0345, "step": 1043 }, { "epoch": 0.7831957989497375, "grad_norm": 0.12373264878988266, "learning_rate": 7.627410868903184e-05, "loss": 0.0338, "step": 1044 }, { "epoch": 0.7839459864966242, "grad_norm": 0.14728686213493347, "learning_rate": 7.621837793289824e-05, "loss": 0.0487, "step": 1045 }, { "epoch": 0.7846961740435109, "grad_norm": 0.16242580115795135, "learning_rate": 7.616260221482825e-05, "loss": 0.0378, "step": 1046 }, { "epoch": 0.7854463615903976, "grad_norm": 0.17540179193019867, "learning_rate": 7.610678163047174e-05, "loss": 0.0487, "step": 1047 }, { "epoch": 0.7861965491372843, "grad_norm": 0.0956827774643898, "learning_rate": 7.60509162755555e-05, "loss": 0.0257, "step": 1048 }, { "epoch": 0.786946736684171, "grad_norm": 0.14103733003139496, "learning_rate": 7.599500624588314e-05, "loss": 0.0357, "step": 1049 }, { "epoch": 0.7876969242310577, "grad_norm": 0.1205405592918396, "learning_rate": 7.593905163733484e-05, "loss": 0.0335, "step": 1050 }, { "epoch": 0.7884471117779445, "grad_norm": 0.1581084281206131, "learning_rate": 7.588305254586724e-05, "loss": 0.0392, "step": 1051 }, { "epoch": 0.7891972993248312, "grad_norm": 0.08614162355661392, "learning_rate": 7.582700906751325e-05, "loss": 0.0246, "step": 1052 }, { "epoch": 0.7899474868717179, "grad_norm": 0.15688768029212952, "learning_rate": 7.577092129838197e-05, "loss": 0.0421, "step": 1053 }, { "epoch": 0.7906976744186046, "grad_norm": 0.08980410546064377, "learning_rate": 7.571478933465836e-05, "loss": 0.0237, "step": 1054 }, { "epoch": 0.7914478619654913, "grad_norm": 0.11821522563695908, "learning_rate": 7.565861327260322e-05, "loss": 0.0315, "step": 1055 }, { "epoch": 0.792198049512378, "grad_norm": 0.2338874340057373, "learning_rate": 7.560239320855296e-05, "loss": 0.066, "step": 1056 }, { "epoch": 0.7929482370592649, "grad_norm": 0.2845582365989685, "learning_rate": 7.554612923891946e-05, "loss": 0.0546, "step": 1057 }, { "epoch": 0.7936984246061516, "grad_norm": 0.15993359684944153, "learning_rate": 7.548982146018988e-05, "loss": 0.0538, "step": 1058 }, { "epoch": 0.7944486121530383, "grad_norm": 0.12397005409002304, "learning_rate": 7.543346996892654e-05, "loss": 0.0415, "step": 1059 }, { "epoch": 0.795198799699925, "grad_norm": 0.11393854767084122, "learning_rate": 7.537707486176667e-05, "loss": 0.0327, "step": 1060 }, { "epoch": 0.7959489872468117, "grad_norm": 0.0845160260796547, "learning_rate": 7.532063623542231e-05, "loss": 0.031, "step": 1061 }, { "epoch": 0.7966991747936985, "grad_norm": 0.11460570245981216, "learning_rate": 7.52641541866802e-05, "loss": 0.0232, "step": 1062 }, { "epoch": 0.7974493623405852, "grad_norm": 0.13715198636054993, "learning_rate": 7.520762881240147e-05, "loss": 0.0486, "step": 1063 }, { "epoch": 0.7981995498874719, "grad_norm": 0.14128798246383667, "learning_rate": 7.515106020952156e-05, "loss": 0.044, "step": 1064 }, { "epoch": 0.7989497374343586, "grad_norm": 0.09283804148435593, "learning_rate": 7.509444847505005e-05, "loss": 0.0308, "step": 1065 }, { "epoch": 0.7996999249812453, "grad_norm": 0.12887823581695557, "learning_rate": 7.503779370607049e-05, "loss": 0.0282, "step": 1066 }, { "epoch": 0.800450112528132, "grad_norm": 0.10709967464208603, "learning_rate": 7.498109599974024e-05, "loss": 0.0328, "step": 1067 }, { "epoch": 0.8012003000750187, "grad_norm": 0.09748120605945587, "learning_rate": 7.49243554532903e-05, "loss": 0.0306, "step": 1068 }, { "epoch": 0.8019504876219055, "grad_norm": 0.13216429948806763, "learning_rate": 7.486757216402509e-05, "loss": 0.0276, "step": 1069 }, { "epoch": 0.8027006751687922, "grad_norm": 0.10359996557235718, "learning_rate": 7.481074622932236e-05, "loss": 0.0302, "step": 1070 }, { "epoch": 0.8034508627156789, "grad_norm": 0.10987622290849686, "learning_rate": 7.475387774663302e-05, "loss": 0.0258, "step": 1071 }, { "epoch": 0.8042010502625656, "grad_norm": 0.12450256943702698, "learning_rate": 7.469696681348088e-05, "loss": 0.0281, "step": 1072 }, { "epoch": 0.8049512378094523, "grad_norm": 0.2062714844942093, "learning_rate": 7.464001352746263e-05, "loss": 0.052, "step": 1073 }, { "epoch": 0.805701425356339, "grad_norm": 0.12377651035785675, "learning_rate": 7.45830179862475e-05, "loss": 0.0331, "step": 1074 }, { "epoch": 0.8064516129032258, "grad_norm": 0.09847312420606613, "learning_rate": 7.452598028757729e-05, "loss": 0.0373, "step": 1075 }, { "epoch": 0.8072018004501126, "grad_norm": 0.1826242357492447, "learning_rate": 7.446890052926598e-05, "loss": 0.0441, "step": 1076 }, { "epoch": 0.8079519879969993, "grad_norm": 0.0950358659029007, "learning_rate": 7.441177880919976e-05, "loss": 0.0283, "step": 1077 }, { "epoch": 0.808702175543886, "grad_norm": 0.13997478783130646, "learning_rate": 7.435461522533674e-05, "loss": 0.034, "step": 1078 }, { "epoch": 0.8094523630907727, "grad_norm": 0.12771092355251312, "learning_rate": 7.429740987570686e-05, "loss": 0.0349, "step": 1079 }, { "epoch": 0.8102025506376594, "grad_norm": 0.10692954063415527, "learning_rate": 7.424016285841165e-05, "loss": 0.0281, "step": 1080 }, { "epoch": 0.8109527381845462, "grad_norm": 0.11578011512756348, "learning_rate": 7.41828742716241e-05, "loss": 0.0303, "step": 1081 }, { "epoch": 0.8117029257314329, "grad_norm": 0.17760562896728516, "learning_rate": 7.41255442135885e-05, "loss": 0.0412, "step": 1082 }, { "epoch": 0.8124531132783196, "grad_norm": 0.236124649643898, "learning_rate": 7.406817278262027e-05, "loss": 0.0483, "step": 1083 }, { "epoch": 0.8132033008252063, "grad_norm": 0.08512605726718903, "learning_rate": 7.401076007710575e-05, "loss": 0.0261, "step": 1084 }, { "epoch": 0.813953488372093, "grad_norm": 0.10752329975366592, "learning_rate": 7.395330619550207e-05, "loss": 0.0291, "step": 1085 }, { "epoch": 0.8147036759189797, "grad_norm": 0.17289288341999054, "learning_rate": 7.3895811236337e-05, "loss": 0.0485, "step": 1086 }, { "epoch": 0.8154538634658665, "grad_norm": 0.143681600689888, "learning_rate": 7.38382752982087e-05, "loss": 0.0384, "step": 1087 }, { "epoch": 0.8162040510127532, "grad_norm": 0.15613281726837158, "learning_rate": 7.378069847978568e-05, "loss": 0.0471, "step": 1088 }, { "epoch": 0.8169542385596399, "grad_norm": 0.16713473200798035, "learning_rate": 7.372308087980647e-05, "loss": 0.0347, "step": 1089 }, { "epoch": 0.8177044261065266, "grad_norm": 0.1506505310535431, "learning_rate": 7.366542259707962e-05, "loss": 0.035, "step": 1090 }, { "epoch": 0.8184546136534133, "grad_norm": 0.14795953035354614, "learning_rate": 7.360772373048338e-05, "loss": 0.0426, "step": 1091 }, { "epoch": 0.8192048012003, "grad_norm": 0.095690056681633, "learning_rate": 7.354998437896565e-05, "loss": 0.0278, "step": 1092 }, { "epoch": 0.8199549887471868, "grad_norm": 0.13107137382030487, "learning_rate": 7.349220464154371e-05, "loss": 0.0433, "step": 1093 }, { "epoch": 0.8207051762940735, "grad_norm": 0.08680702745914459, "learning_rate": 7.343438461730411e-05, "loss": 0.0194, "step": 1094 }, { "epoch": 0.8214553638409603, "grad_norm": 0.12979038059711456, "learning_rate": 7.337652440540252e-05, "loss": 0.042, "step": 1095 }, { "epoch": 0.822205551387847, "grad_norm": 0.14768902957439423, "learning_rate": 7.331862410506353e-05, "loss": 0.038, "step": 1096 }, { "epoch": 0.8229557389347337, "grad_norm": 0.14361713826656342, "learning_rate": 7.32606838155804e-05, "loss": 0.029, "step": 1097 }, { "epoch": 0.8237059264816204, "grad_norm": 0.09816596657037735, "learning_rate": 7.320270363631505e-05, "loss": 0.0312, "step": 1098 }, { "epoch": 0.8244561140285072, "grad_norm": 0.13732968270778656, "learning_rate": 7.314468366669777e-05, "loss": 0.0355, "step": 1099 }, { "epoch": 0.8252063015753939, "grad_norm": 0.11866525560617447, "learning_rate": 7.30866240062271e-05, "loss": 0.0293, "step": 1100 }, { "epoch": 0.8259564891222806, "grad_norm": 0.1313275694847107, "learning_rate": 7.302852475446963e-05, "loss": 0.0384, "step": 1101 }, { "epoch": 0.8267066766691673, "grad_norm": 0.12741775810718536, "learning_rate": 7.297038601105988e-05, "loss": 0.042, "step": 1102 }, { "epoch": 0.827456864216054, "grad_norm": 0.12329462915658951, "learning_rate": 7.291220787570005e-05, "loss": 0.0292, "step": 1103 }, { "epoch": 0.8282070517629407, "grad_norm": 0.1349187195301056, "learning_rate": 7.28539904481599e-05, "loss": 0.0427, "step": 1104 }, { "epoch": 0.8289572393098275, "grad_norm": 0.10525055974721909, "learning_rate": 7.279573382827662e-05, "loss": 0.0241, "step": 1105 }, { "epoch": 0.8297074268567142, "grad_norm": 0.08475113660097122, "learning_rate": 7.273743811595454e-05, "loss": 0.0231, "step": 1106 }, { "epoch": 0.8304576144036009, "grad_norm": 0.13441896438598633, "learning_rate": 7.267910341116512e-05, "loss": 0.0374, "step": 1107 }, { "epoch": 0.8312078019504876, "grad_norm": 0.12029675394296646, "learning_rate": 7.262072981394656e-05, "loss": 0.0267, "step": 1108 }, { "epoch": 0.8319579894973743, "grad_norm": 0.16057652235031128, "learning_rate": 7.256231742440389e-05, "loss": 0.0457, "step": 1109 }, { "epoch": 0.832708177044261, "grad_norm": 0.17000728845596313, "learning_rate": 7.25038663427086e-05, "loss": 0.0313, "step": 1110 }, { "epoch": 0.8334583645911477, "grad_norm": 0.18660447001457214, "learning_rate": 7.24453766690985e-05, "loss": 0.0315, "step": 1111 }, { "epoch": 0.8342085521380345, "grad_norm": 0.1456262469291687, "learning_rate": 7.238684850387765e-05, "loss": 0.0291, "step": 1112 }, { "epoch": 0.8349587396849212, "grad_norm": 0.142116978764534, "learning_rate": 7.232828194741611e-05, "loss": 0.0404, "step": 1113 }, { "epoch": 0.8357089272318079, "grad_norm": 0.17252390086650848, "learning_rate": 7.226967710014971e-05, "loss": 0.0371, "step": 1114 }, { "epoch": 0.8364591147786947, "grad_norm": 0.12147200107574463, "learning_rate": 7.221103406258003e-05, "loss": 0.024, "step": 1115 }, { "epoch": 0.8372093023255814, "grad_norm": 0.09407100081443787, "learning_rate": 7.215235293527409e-05, "loss": 0.0267, "step": 1116 }, { "epoch": 0.8379594898724682, "grad_norm": 0.1135459691286087, "learning_rate": 7.209363381886423e-05, "loss": 0.0363, "step": 1117 }, { "epoch": 0.8387096774193549, "grad_norm": 0.08867119997739792, "learning_rate": 7.203487681404798e-05, "loss": 0.0192, "step": 1118 }, { "epoch": 0.8394598649662416, "grad_norm": 0.13186821341514587, "learning_rate": 7.19760820215878e-05, "loss": 0.0371, "step": 1119 }, { "epoch": 0.8402100525131283, "grad_norm": 0.14221522212028503, "learning_rate": 7.191724954231098e-05, "loss": 0.0341, "step": 1120 }, { "epoch": 0.840960240060015, "grad_norm": 0.10038655251264572, "learning_rate": 7.185837947710943e-05, "loss": 0.0271, "step": 1121 }, { "epoch": 0.8417104276069017, "grad_norm": 0.09764457494020462, "learning_rate": 7.17994719269395e-05, "loss": 0.0276, "step": 1122 }, { "epoch": 0.8424606151537885, "grad_norm": 0.11716485768556595, "learning_rate": 7.174052699282183e-05, "loss": 0.0355, "step": 1123 }, { "epoch": 0.8432108027006752, "grad_norm": 0.517447829246521, "learning_rate": 7.168154477584123e-05, "loss": 0.0535, "step": 1124 }, { "epoch": 0.8439609902475619, "grad_norm": 0.1132436990737915, "learning_rate": 7.162252537714633e-05, "loss": 0.0324, "step": 1125 }, { "epoch": 0.8447111777944486, "grad_norm": 0.0971536934375763, "learning_rate": 7.156346889794962e-05, "loss": 0.0354, "step": 1126 }, { "epoch": 0.8454613653413353, "grad_norm": 0.18410785496234894, "learning_rate": 7.150437543952715e-05, "loss": 0.0564, "step": 1127 }, { "epoch": 0.846211552888222, "grad_norm": 0.07700716704130173, "learning_rate": 7.144524510321837e-05, "loss": 0.0267, "step": 1128 }, { "epoch": 0.8469617404351087, "grad_norm": 0.14188776910305023, "learning_rate": 7.138607799042598e-05, "loss": 0.0381, "step": 1129 }, { "epoch": 0.8477119279819955, "grad_norm": 0.13262468576431274, "learning_rate": 7.132687420261576e-05, "loss": 0.0405, "step": 1130 }, { "epoch": 0.8484621155288822, "grad_norm": 0.15925203263759613, "learning_rate": 7.126763384131638e-05, "loss": 0.035, "step": 1131 }, { "epoch": 0.8492123030757689, "grad_norm": 0.09985039383172989, "learning_rate": 7.120835700811923e-05, "loss": 0.0285, "step": 1132 }, { "epoch": 0.8499624906226556, "grad_norm": 0.09418608248233795, "learning_rate": 7.114904380467823e-05, "loss": 0.0273, "step": 1133 }, { "epoch": 0.8507126781695424, "grad_norm": 0.07992705702781677, "learning_rate": 7.108969433270968e-05, "loss": 0.0226, "step": 1134 }, { "epoch": 0.8514628657164292, "grad_norm": 0.11864115297794342, "learning_rate": 7.10303086939921e-05, "loss": 0.0299, "step": 1135 }, { "epoch": 0.8522130532633159, "grad_norm": 0.107135109603405, "learning_rate": 7.097088699036598e-05, "loss": 0.041, "step": 1136 }, { "epoch": 0.8529632408102026, "grad_norm": 0.12338218092918396, "learning_rate": 7.091142932373371e-05, "loss": 0.0331, "step": 1137 }, { "epoch": 0.8537134283570893, "grad_norm": 0.24427056312561035, "learning_rate": 7.085193579605935e-05, "loss": 0.0508, "step": 1138 }, { "epoch": 0.854463615903976, "grad_norm": 0.08631362020969391, "learning_rate": 7.079240650936843e-05, "loss": 0.0284, "step": 1139 }, { "epoch": 0.8552138034508627, "grad_norm": 0.12822337448596954, "learning_rate": 7.07328415657478e-05, "loss": 0.0295, "step": 1140 }, { "epoch": 0.8559639909977494, "grad_norm": 0.13125036656856537, "learning_rate": 7.067324106734548e-05, "loss": 0.029, "step": 1141 }, { "epoch": 0.8567141785446362, "grad_norm": 0.1482788324356079, "learning_rate": 7.061360511637045e-05, "loss": 0.0271, "step": 1142 }, { "epoch": 0.8574643660915229, "grad_norm": 0.7413152456283569, "learning_rate": 7.055393381509253e-05, "loss": 0.0391, "step": 1143 }, { "epoch": 0.8582145536384096, "grad_norm": 0.10552649945020676, "learning_rate": 7.049422726584206e-05, "loss": 0.0316, "step": 1144 }, { "epoch": 0.8589647411852963, "grad_norm": 0.12541364133358002, "learning_rate": 7.043448557100995e-05, "loss": 0.0448, "step": 1145 }, { "epoch": 0.859714928732183, "grad_norm": 0.0947171300649643, "learning_rate": 7.037470883304731e-05, "loss": 0.0212, "step": 1146 }, { "epoch": 0.8604651162790697, "grad_norm": 0.11750858277082443, "learning_rate": 7.031489715446535e-05, "loss": 0.0274, "step": 1147 }, { "epoch": 0.8612153038259565, "grad_norm": 0.17968003451824188, "learning_rate": 7.02550506378352e-05, "loss": 0.0411, "step": 1148 }, { "epoch": 0.8619654913728432, "grad_norm": 0.1670011729001999, "learning_rate": 7.019516938578777e-05, "loss": 0.0287, "step": 1149 }, { "epoch": 0.8627156789197299, "grad_norm": 0.12383407354354858, "learning_rate": 7.013525350101348e-05, "loss": 0.028, "step": 1150 }, { "epoch": 0.8634658664666166, "grad_norm": 0.13687686622142792, "learning_rate": 7.00753030862622e-05, "loss": 0.0394, "step": 1151 }, { "epoch": 0.8642160540135033, "grad_norm": 0.1233229711651802, "learning_rate": 7.001531824434299e-05, "loss": 0.0326, "step": 1152 }, { "epoch": 0.8649662415603901, "grad_norm": 0.13290193676948547, "learning_rate": 6.995529907812391e-05, "loss": 0.039, "step": 1153 }, { "epoch": 0.8657164291072769, "grad_norm": 0.09817846119403839, "learning_rate": 6.989524569053196e-05, "loss": 0.0312, "step": 1154 }, { "epoch": 0.8664666166541636, "grad_norm": 0.10229942947626114, "learning_rate": 6.983515818455275e-05, "loss": 0.0261, "step": 1155 }, { "epoch": 0.8672168042010503, "grad_norm": 0.1152266263961792, "learning_rate": 6.977503666323048e-05, "loss": 0.0427, "step": 1156 }, { "epoch": 0.867966991747937, "grad_norm": 0.14483705163002014, "learning_rate": 6.971488122966758e-05, "loss": 0.0435, "step": 1157 }, { "epoch": 0.8687171792948237, "grad_norm": 0.1095462441444397, "learning_rate": 6.965469198702475e-05, "loss": 0.0274, "step": 1158 }, { "epoch": 0.8694673668417104, "grad_norm": 0.11495321244001389, "learning_rate": 6.95944690385206e-05, "loss": 0.0265, "step": 1159 }, { "epoch": 0.8702175543885972, "grad_norm": 0.1372273713350296, "learning_rate": 6.953421248743154e-05, "loss": 0.0423, "step": 1160 }, { "epoch": 0.8709677419354839, "grad_norm": 0.11597606539726257, "learning_rate": 6.947392243709163e-05, "loss": 0.0286, "step": 1161 }, { "epoch": 0.8717179294823706, "grad_norm": 0.13315455615520477, "learning_rate": 6.941359899089238e-05, "loss": 0.0301, "step": 1162 }, { "epoch": 0.8724681170292573, "grad_norm": 0.1332310438156128, "learning_rate": 6.935324225228254e-05, "loss": 0.0325, "step": 1163 }, { "epoch": 0.873218304576144, "grad_norm": 0.13525554537773132, "learning_rate": 6.929285232476797e-05, "loss": 0.0379, "step": 1164 }, { "epoch": 0.8739684921230307, "grad_norm": 0.09915309399366379, "learning_rate": 6.923242931191148e-05, "loss": 0.0292, "step": 1165 }, { "epoch": 0.8747186796699175, "grad_norm": 0.15619905292987823, "learning_rate": 6.917197331733257e-05, "loss": 0.0391, "step": 1166 }, { "epoch": 0.8754688672168042, "grad_norm": 0.13476809859275818, "learning_rate": 6.911148444470731e-05, "loss": 0.0227, "step": 1167 }, { "epoch": 0.8762190547636909, "grad_norm": 0.12116993218660355, "learning_rate": 6.905096279776819e-05, "loss": 0.0332, "step": 1168 }, { "epoch": 0.8769692423105776, "grad_norm": 0.07770241796970367, "learning_rate": 6.899040848030384e-05, "loss": 0.0199, "step": 1169 }, { "epoch": 0.8777194298574643, "grad_norm": 0.09852062910795212, "learning_rate": 6.892982159615895e-05, "loss": 0.0329, "step": 1170 }, { "epoch": 0.878469617404351, "grad_norm": 0.16133050620555878, "learning_rate": 6.886920224923408e-05, "loss": 0.0374, "step": 1171 }, { "epoch": 0.8792198049512379, "grad_norm": 0.11031442135572433, "learning_rate": 6.880855054348543e-05, "loss": 0.0286, "step": 1172 }, { "epoch": 0.8799699924981246, "grad_norm": 0.13426519930362701, "learning_rate": 6.874786658292472e-05, "loss": 0.0315, "step": 1173 }, { "epoch": 0.8807201800450113, "grad_norm": 0.14885659515857697, "learning_rate": 6.868715047161896e-05, "loss": 0.0389, "step": 1174 }, { "epoch": 0.881470367591898, "grad_norm": 0.17592477798461914, "learning_rate": 6.862640231369029e-05, "loss": 0.0367, "step": 1175 }, { "epoch": 0.8822205551387847, "grad_norm": 0.11507947742938995, "learning_rate": 6.856562221331581e-05, "loss": 0.0276, "step": 1176 }, { "epoch": 0.8829707426856714, "grad_norm": 0.10695502161979675, "learning_rate": 6.850481027472743e-05, "loss": 0.0297, "step": 1177 }, { "epoch": 0.8837209302325582, "grad_norm": 0.08131774514913559, "learning_rate": 6.844396660221164e-05, "loss": 0.0195, "step": 1178 }, { "epoch": 0.8844711177794449, "grad_norm": 0.11771385371685028, "learning_rate": 6.838309130010933e-05, "loss": 0.0236, "step": 1179 }, { "epoch": 0.8852213053263316, "grad_norm": 0.09990225732326508, "learning_rate": 6.832218447281566e-05, "loss": 0.0249, "step": 1180 }, { "epoch": 0.8859714928732183, "grad_norm": 0.12525203824043274, "learning_rate": 6.826124622477981e-05, "loss": 0.0318, "step": 1181 }, { "epoch": 0.886721680420105, "grad_norm": 0.19846653938293457, "learning_rate": 6.820027666050493e-05, "loss": 0.0335, "step": 1182 }, { "epoch": 0.8874718679669917, "grad_norm": 0.17230094969272614, "learning_rate": 6.813927588454778e-05, "loss": 0.0324, "step": 1183 }, { "epoch": 0.8882220555138785, "grad_norm": 0.11220695078372955, "learning_rate": 6.80782440015187e-05, "loss": 0.0291, "step": 1184 }, { "epoch": 0.8889722430607652, "grad_norm": 0.1424178034067154, "learning_rate": 6.801718111608133e-05, "loss": 0.0398, "step": 1185 }, { "epoch": 0.8897224306076519, "grad_norm": 0.1714315265417099, "learning_rate": 6.795608733295254e-05, "loss": 0.0308, "step": 1186 }, { "epoch": 0.8904726181545386, "grad_norm": 0.1779463142156601, "learning_rate": 6.789496275690215e-05, "loss": 0.0317, "step": 1187 }, { "epoch": 0.8912228057014253, "grad_norm": 0.12434078007936478, "learning_rate": 6.783380749275277e-05, "loss": 0.0288, "step": 1188 }, { "epoch": 0.891972993248312, "grad_norm": 0.09934748709201813, "learning_rate": 6.777262164537966e-05, "loss": 0.0287, "step": 1189 }, { "epoch": 0.8927231807951987, "grad_norm": 0.09271322935819626, "learning_rate": 6.771140531971054e-05, "loss": 0.0217, "step": 1190 }, { "epoch": 0.8934733683420856, "grad_norm": 0.1522030234336853, "learning_rate": 6.765015862072536e-05, "loss": 0.0347, "step": 1191 }, { "epoch": 0.8942235558889723, "grad_norm": 0.07218828052282333, "learning_rate": 6.758888165345619e-05, "loss": 0.0219, "step": 1192 }, { "epoch": 0.894973743435859, "grad_norm": 0.1766599416732788, "learning_rate": 6.7527574522987e-05, "loss": 0.0377, "step": 1193 }, { "epoch": 0.8957239309827457, "grad_norm": 0.09117019921541214, "learning_rate": 6.746623733445346e-05, "loss": 0.0208, "step": 1194 }, { "epoch": 0.8964741185296324, "grad_norm": 0.10044237971305847, "learning_rate": 6.740487019304282e-05, "loss": 0.0222, "step": 1195 }, { "epoch": 0.8972243060765192, "grad_norm": 0.11803122609853745, "learning_rate": 6.734347320399369e-05, "loss": 0.0208, "step": 1196 }, { "epoch": 0.8979744936234059, "grad_norm": 0.11625832319259644, "learning_rate": 6.728204647259586e-05, "loss": 0.0434, "step": 1197 }, { "epoch": 0.8987246811702926, "grad_norm": 0.07271845638751984, "learning_rate": 6.722059010419013e-05, "loss": 0.0222, "step": 1198 }, { "epoch": 0.8994748687171793, "grad_norm": 0.15432986617088318, "learning_rate": 6.715910420416809e-05, "loss": 0.0373, "step": 1199 }, { "epoch": 0.900225056264066, "grad_norm": 0.09750841557979584, "learning_rate": 6.709758887797205e-05, "loss": 0.0258, "step": 1200 }, { "epoch": 0.900225056264066, "eval_loss": 0.03287737816572189, "eval_runtime": 5.1474, "eval_samples_per_second": 10.491, "eval_steps_per_second": 2.72, "step": 1200 }, { "epoch": 0.9009752438109527, "grad_norm": 0.12649789452552795, "learning_rate": 6.703604423109468e-05, "loss": 0.0351, "step": 1201 }, { "epoch": 0.9017254313578394, "grad_norm": 0.11160416901111603, "learning_rate": 6.697447036907904e-05, "loss": 0.0233, "step": 1202 }, { "epoch": 0.9024756189047262, "grad_norm": 0.055738791823387146, "learning_rate": 6.691286739751824e-05, "loss": 0.0164, "step": 1203 }, { "epoch": 0.9032258064516129, "grad_norm": 0.15643253922462463, "learning_rate": 6.685123542205526e-05, "loss": 0.0255, "step": 1204 }, { "epoch": 0.9039759939984996, "grad_norm": 0.17560407519340515, "learning_rate": 6.678957454838292e-05, "loss": 0.0479, "step": 1205 }, { "epoch": 0.9047261815453863, "grad_norm": 0.12479657679796219, "learning_rate": 6.672788488224352e-05, "loss": 0.0309, "step": 1206 }, { "epoch": 0.905476369092273, "grad_norm": 0.11823344230651855, "learning_rate": 6.666616652942878e-05, "loss": 0.0282, "step": 1207 }, { "epoch": 0.9062265566391597, "grad_norm": 0.09630381315946579, "learning_rate": 6.660441959577958e-05, "loss": 0.0258, "step": 1208 }, { "epoch": 0.9069767441860465, "grad_norm": 0.14388345181941986, "learning_rate": 6.654264418718584e-05, "loss": 0.0392, "step": 1209 }, { "epoch": 0.9077269317329333, "grad_norm": 0.11177119612693787, "learning_rate": 6.64808404095863e-05, "loss": 0.0259, "step": 1210 }, { "epoch": 0.90847711927982, "grad_norm": 0.1320035457611084, "learning_rate": 6.641900836896835e-05, "loss": 0.0302, "step": 1211 }, { "epoch": 0.9092273068267067, "grad_norm": 0.10894173383712769, "learning_rate": 6.635714817136785e-05, "loss": 0.0222, "step": 1212 }, { "epoch": 0.9099774943735934, "grad_norm": 0.08480621874332428, "learning_rate": 6.629525992286898e-05, "loss": 0.02, "step": 1213 }, { "epoch": 0.9107276819204801, "grad_norm": 0.11812885105609894, "learning_rate": 6.623334372960393e-05, "loss": 0.031, "step": 1214 }, { "epoch": 0.9114778694673669, "grad_norm": 0.15104812383651733, "learning_rate": 6.617139969775292e-05, "loss": 0.0399, "step": 1215 }, { "epoch": 0.9122280570142536, "grad_norm": 0.1338520050048828, "learning_rate": 6.610942793354387e-05, "loss": 0.0343, "step": 1216 }, { "epoch": 0.9129782445611403, "grad_norm": 0.10010179877281189, "learning_rate": 6.604742854325222e-05, "loss": 0.027, "step": 1217 }, { "epoch": 0.913728432108027, "grad_norm": 0.1490030139684677, "learning_rate": 6.598540163320084e-05, "loss": 0.0243, "step": 1218 }, { "epoch": 0.9144786196549137, "grad_norm": 0.13927358388900757, "learning_rate": 6.592334730975975e-05, "loss": 0.038, "step": 1219 }, { "epoch": 0.9152288072018004, "grad_norm": 0.1642918735742569, "learning_rate": 6.586126567934605e-05, "loss": 0.0475, "step": 1220 }, { "epoch": 0.9159789947486872, "grad_norm": 0.12470587342977524, "learning_rate": 6.57991568484236e-05, "loss": 0.0282, "step": 1221 }, { "epoch": 0.9167291822955739, "grad_norm": 0.11485706269741058, "learning_rate": 6.573702092350292e-05, "loss": 0.0327, "step": 1222 }, { "epoch": 0.9174793698424606, "grad_norm": 0.15273365378379822, "learning_rate": 6.567485801114099e-05, "loss": 0.0312, "step": 1223 }, { "epoch": 0.9182295573893473, "grad_norm": 0.0842658132314682, "learning_rate": 6.561266821794111e-05, "loss": 0.0205, "step": 1224 }, { "epoch": 0.918979744936234, "grad_norm": 0.1895236372947693, "learning_rate": 6.555045165055263e-05, "loss": 0.0356, "step": 1225 }, { "epoch": 0.9197299324831207, "grad_norm": 0.13262400031089783, "learning_rate": 6.548820841567086e-05, "loss": 0.0285, "step": 1226 }, { "epoch": 0.9204801200300075, "grad_norm": 0.10285253077745438, "learning_rate": 6.54259386200368e-05, "loss": 0.0256, "step": 1227 }, { "epoch": 0.9212303075768942, "grad_norm": 0.11173094063997269, "learning_rate": 6.536364237043703e-05, "loss": 0.0263, "step": 1228 }, { "epoch": 0.921980495123781, "grad_norm": 0.1017502024769783, "learning_rate": 6.530131977370348e-05, "loss": 0.025, "step": 1229 }, { "epoch": 0.9227306826706677, "grad_norm": 0.11662982404232025, "learning_rate": 6.523897093671326e-05, "loss": 0.0265, "step": 1230 }, { "epoch": 0.9234808702175544, "grad_norm": 0.10500746220350266, "learning_rate": 6.51765959663885e-05, "loss": 0.0287, "step": 1231 }, { "epoch": 0.9242310577644411, "grad_norm": 0.17836572229862213, "learning_rate": 6.511419496969612e-05, "loss": 0.042, "step": 1232 }, { "epoch": 0.9249812453113279, "grad_norm": 0.1660510152578354, "learning_rate": 6.505176805364767e-05, "loss": 0.0501, "step": 1233 }, { "epoch": 0.9257314328582146, "grad_norm": 0.08452211320400238, "learning_rate": 6.498931532529921e-05, "loss": 0.0257, "step": 1234 }, { "epoch": 0.9264816204051013, "grad_norm": 0.1757364720106125, "learning_rate": 6.492683689175098e-05, "loss": 0.0301, "step": 1235 }, { "epoch": 0.927231807951988, "grad_norm": 0.1669311672449112, "learning_rate": 6.486433286014734e-05, "loss": 0.0374, "step": 1236 }, { "epoch": 0.9279819954988747, "grad_norm": 0.1517716944217682, "learning_rate": 6.480180333767658e-05, "loss": 0.0473, "step": 1237 }, { "epoch": 0.9287321830457614, "grad_norm": 0.11780434101819992, "learning_rate": 6.473924843157065e-05, "loss": 0.0236, "step": 1238 }, { "epoch": 0.9294823705926482, "grad_norm": 0.08752031624317169, "learning_rate": 6.467666824910505e-05, "loss": 0.026, "step": 1239 }, { "epoch": 0.9302325581395349, "grad_norm": 0.11586204916238785, "learning_rate": 6.461406289759862e-05, "loss": 0.0274, "step": 1240 }, { "epoch": 0.9309827456864216, "grad_norm": 0.10578847676515579, "learning_rate": 6.455143248441342e-05, "loss": 0.0289, "step": 1241 }, { "epoch": 0.9317329332333083, "grad_norm": 0.12548696994781494, "learning_rate": 6.44887771169544e-05, "loss": 0.0234, "step": 1242 }, { "epoch": 0.932483120780195, "grad_norm": 0.11294113844633102, "learning_rate": 6.442609690266937e-05, "loss": 0.0391, "step": 1243 }, { "epoch": 0.9332333083270817, "grad_norm": 0.10007303953170776, "learning_rate": 6.436339194904872e-05, "loss": 0.0253, "step": 1244 }, { "epoch": 0.9339834958739685, "grad_norm": 0.10004784166812897, "learning_rate": 6.430066236362524e-05, "loss": 0.0321, "step": 1245 }, { "epoch": 0.9347336834208552, "grad_norm": 0.15509480237960815, "learning_rate": 6.423790825397404e-05, "loss": 0.0314, "step": 1246 }, { "epoch": 0.9354838709677419, "grad_norm": 0.1072937548160553, "learning_rate": 6.417512972771219e-05, "loss": 0.0278, "step": 1247 }, { "epoch": 0.9362340585146287, "grad_norm": 0.2849551737308502, "learning_rate": 6.411232689249873e-05, "loss": 0.0447, "step": 1248 }, { "epoch": 0.9369842460615154, "grad_norm": 0.10778176784515381, "learning_rate": 6.40494998560343e-05, "loss": 0.0247, "step": 1249 }, { "epoch": 0.9377344336084021, "grad_norm": 0.15809431672096252, "learning_rate": 6.39866487260611e-05, "loss": 0.0276, "step": 1250 }, { "epoch": 0.9384846211552889, "grad_norm": 0.08884350955486298, "learning_rate": 6.392377361036262e-05, "loss": 0.0189, "step": 1251 }, { "epoch": 0.9392348087021756, "grad_norm": 0.07642669975757599, "learning_rate": 6.386087461676351e-05, "loss": 0.0201, "step": 1252 }, { "epoch": 0.9399849962490623, "grad_norm": 0.10072652995586395, "learning_rate": 6.379795185312933e-05, "loss": 0.0202, "step": 1253 }, { "epoch": 0.940735183795949, "grad_norm": 0.132140651345253, "learning_rate": 6.373500542736643e-05, "loss": 0.029, "step": 1254 }, { "epoch": 0.9414853713428357, "grad_norm": 0.09724639356136322, "learning_rate": 6.367203544742171e-05, "loss": 0.0252, "step": 1255 }, { "epoch": 0.9422355588897224, "grad_norm": 0.20375050604343414, "learning_rate": 6.360904202128252e-05, "loss": 0.0373, "step": 1256 }, { "epoch": 0.9429857464366092, "grad_norm": 0.14257559180259705, "learning_rate": 6.354602525697638e-05, "loss": 0.033, "step": 1257 }, { "epoch": 0.9437359339834959, "grad_norm": 0.12351656705141068, "learning_rate": 6.348298526257082e-05, "loss": 0.0328, "step": 1258 }, { "epoch": 0.9444861215303826, "grad_norm": 0.12928931415081024, "learning_rate": 6.341992214617323e-05, "loss": 0.0264, "step": 1259 }, { "epoch": 0.9452363090772693, "grad_norm": 0.13477769494056702, "learning_rate": 6.335683601593062e-05, "loss": 0.0402, "step": 1260 }, { "epoch": 0.945986496624156, "grad_norm": 0.1282130926847458, "learning_rate": 6.329372698002954e-05, "loss": 0.0261, "step": 1261 }, { "epoch": 0.9467366841710427, "grad_norm": 0.3520214259624481, "learning_rate": 6.323059514669571e-05, "loss": 0.0611, "step": 1262 }, { "epoch": 0.9474868717179294, "grad_norm": 0.1838792860507965, "learning_rate": 6.316744062419409e-05, "loss": 0.0302, "step": 1263 }, { "epoch": 0.9482370592648162, "grad_norm": 0.21650949120521545, "learning_rate": 6.310426352082838e-05, "loss": 0.0434, "step": 1264 }, { "epoch": 0.9489872468117029, "grad_norm": 0.10584396123886108, "learning_rate": 6.304106394494116e-05, "loss": 0.0273, "step": 1265 }, { "epoch": 0.9497374343585896, "grad_norm": 0.16731998324394226, "learning_rate": 6.297784200491343e-05, "loss": 0.0417, "step": 1266 }, { "epoch": 0.9504876219054764, "grad_norm": 0.10184057801961899, "learning_rate": 6.291459780916463e-05, "loss": 0.0207, "step": 1267 }, { "epoch": 0.9512378094523631, "grad_norm": 0.14981421828269958, "learning_rate": 6.285133146615228e-05, "loss": 0.0292, "step": 1268 }, { "epoch": 0.9519879969992499, "grad_norm": 0.12305141985416412, "learning_rate": 6.278804308437198e-05, "loss": 0.0255, "step": 1269 }, { "epoch": 0.9527381845461366, "grad_norm": 0.1167244091629982, "learning_rate": 6.272473277235703e-05, "loss": 0.03, "step": 1270 }, { "epoch": 0.9534883720930233, "grad_norm": 0.12276820093393326, "learning_rate": 6.266140063867843e-05, "loss": 0.0307, "step": 1271 }, { "epoch": 0.95423855963991, "grad_norm": 0.16493280231952667, "learning_rate": 6.25980467919445e-05, "loss": 0.039, "step": 1272 }, { "epoch": 0.9549887471867967, "grad_norm": 0.11039330810308456, "learning_rate": 6.253467134080088e-05, "loss": 0.0265, "step": 1273 }, { "epoch": 0.9557389347336834, "grad_norm": 0.08698292821645737, "learning_rate": 6.247127439393023e-05, "loss": 0.0236, "step": 1274 }, { "epoch": 0.9564891222805701, "grad_norm": 0.13140971958637238, "learning_rate": 6.240785606005206e-05, "loss": 0.0295, "step": 1275 }, { "epoch": 0.9572393098274569, "grad_norm": 0.21670088171958923, "learning_rate": 6.234441644792256e-05, "loss": 0.0256, "step": 1276 }, { "epoch": 0.9579894973743436, "grad_norm": 0.1584072709083557, "learning_rate": 6.228095566633443e-05, "loss": 0.0393, "step": 1277 }, { "epoch": 0.9587396849212303, "grad_norm": 0.07527758181095123, "learning_rate": 6.221747382411667e-05, "loss": 0.0181, "step": 1278 }, { "epoch": 0.959489872468117, "grad_norm": 0.121820367872715, "learning_rate": 6.215397103013436e-05, "loss": 0.0328, "step": 1279 }, { "epoch": 0.9602400600150037, "grad_norm": 0.09028357267379761, "learning_rate": 6.209044739328858e-05, "loss": 0.0247, "step": 1280 }, { "epoch": 0.9609902475618904, "grad_norm": 0.13452784717082977, "learning_rate": 6.202690302251606e-05, "loss": 0.0375, "step": 1281 }, { "epoch": 0.9617404351087772, "grad_norm": 0.10062450915575027, "learning_rate": 6.196333802678918e-05, "loss": 0.029, "step": 1282 }, { "epoch": 0.9624906226556639, "grad_norm": 0.2688952088356018, "learning_rate": 6.189975251511562e-05, "loss": 0.0416, "step": 1283 }, { "epoch": 0.9632408102025506, "grad_norm": 0.1521732211112976, "learning_rate": 6.18361465965383e-05, "loss": 0.0431, "step": 1284 }, { "epoch": 0.9639909977494373, "grad_norm": 0.11925928294658661, "learning_rate": 6.177252038013509e-05, "loss": 0.0282, "step": 1285 }, { "epoch": 0.9647411852963241, "grad_norm": 0.07919792085886002, "learning_rate": 6.170887397501868e-05, "loss": 0.0203, "step": 1286 }, { "epoch": 0.9654913728432108, "grad_norm": 0.16996318101882935, "learning_rate": 6.16452074903364e-05, "loss": 0.0437, "step": 1287 }, { "epoch": 0.9662415603900976, "grad_norm": 0.17046809196472168, "learning_rate": 6.158152103527e-05, "loss": 0.046, "step": 1288 }, { "epoch": 0.9669917479369843, "grad_norm": 0.14924894273281097, "learning_rate": 6.151781471903548e-05, "loss": 0.0272, "step": 1289 }, { "epoch": 0.967741935483871, "grad_norm": 0.16980625689029694, "learning_rate": 6.14540886508829e-05, "loss": 0.051, "step": 1290 }, { "epoch": 0.9684921230307577, "grad_norm": 0.12601691484451294, "learning_rate": 6.139034294009617e-05, "loss": 0.0333, "step": 1291 }, { "epoch": 0.9692423105776444, "grad_norm": 0.10268208384513855, "learning_rate": 6.132657769599293e-05, "loss": 0.0283, "step": 1292 }, { "epoch": 0.9699924981245311, "grad_norm": 0.16088128089904785, "learning_rate": 6.126279302792429e-05, "loss": 0.0515, "step": 1293 }, { "epoch": 0.9707426856714179, "grad_norm": 0.07889743149280548, "learning_rate": 6.119898904527468e-05, "loss": 0.0165, "step": 1294 }, { "epoch": 0.9714928732183046, "grad_norm": 0.11629171669483185, "learning_rate": 6.113516585746164e-05, "loss": 0.0484, "step": 1295 }, { "epoch": 0.9722430607651913, "grad_norm": 0.11696446686983109, "learning_rate": 6.107132357393563e-05, "loss": 0.0304, "step": 1296 }, { "epoch": 0.972993248312078, "grad_norm": 0.1155184879899025, "learning_rate": 6.100746230417993e-05, "loss": 0.0218, "step": 1297 }, { "epoch": 0.9737434358589647, "grad_norm": 0.13009968400001526, "learning_rate": 6.0943582157710285e-05, "loss": 0.0274, "step": 1298 }, { "epoch": 0.9744936234058514, "grad_norm": 0.11266110092401505, "learning_rate": 6.0879683244074894e-05, "loss": 0.0321, "step": 1299 }, { "epoch": 0.9752438109527382, "grad_norm": 0.0928129181265831, "learning_rate": 6.0815765672854065e-05, "loss": 0.0265, "step": 1300 }, { "epoch": 0.9759939984996249, "grad_norm": 0.10671856254339218, "learning_rate": 6.0751829553660155e-05, "loss": 0.0249, "step": 1301 }, { "epoch": 0.9767441860465116, "grad_norm": 0.09111996740102768, "learning_rate": 6.06878749961373e-05, "loss": 0.0299, "step": 1302 }, { "epoch": 0.9774943735933983, "grad_norm": 0.19780229032039642, "learning_rate": 6.0623902109961295e-05, "loss": 0.0385, "step": 1303 }, { "epoch": 0.978244561140285, "grad_norm": 0.13554957509040833, "learning_rate": 6.055991100483932e-05, "loss": 0.0384, "step": 1304 }, { "epoch": 0.9789947486871718, "grad_norm": 0.13489709794521332, "learning_rate": 6.0495901790509836e-05, "loss": 0.0292, "step": 1305 }, { "epoch": 0.9797449362340586, "grad_norm": 0.11630948632955551, "learning_rate": 6.043187457674231e-05, "loss": 0.025, "step": 1306 }, { "epoch": 0.9804951237809453, "grad_norm": 0.14980089664459229, "learning_rate": 6.0367829473337136e-05, "loss": 0.0324, "step": 1307 }, { "epoch": 0.981245311327832, "grad_norm": 0.09936217218637466, "learning_rate": 6.0303766590125365e-05, "loss": 0.0231, "step": 1308 }, { "epoch": 0.9819954988747187, "grad_norm": 0.10359448194503784, "learning_rate": 6.02396860369685e-05, "loss": 0.0348, "step": 1309 }, { "epoch": 0.9827456864216054, "grad_norm": 0.17159534990787506, "learning_rate": 6.0175587923758416e-05, "loss": 0.0413, "step": 1310 }, { "epoch": 0.9834958739684921, "grad_norm": 0.15349924564361572, "learning_rate": 6.0111472360417044e-05, "loss": 0.0321, "step": 1311 }, { "epoch": 0.9842460615153789, "grad_norm": 0.12869088351726532, "learning_rate": 6.004733945689628e-05, "loss": 0.0318, "step": 1312 }, { "epoch": 0.9849962490622656, "grad_norm": 0.1338404268026352, "learning_rate": 5.998318932317771e-05, "loss": 0.0393, "step": 1313 }, { "epoch": 0.9857464366091523, "grad_norm": 0.12612248957157135, "learning_rate": 5.991902206927252e-05, "loss": 0.0264, "step": 1314 }, { "epoch": 0.986496624156039, "grad_norm": 0.081502266228199, "learning_rate": 5.985483780522122e-05, "loss": 0.0223, "step": 1315 }, { "epoch": 0.9872468117029257, "grad_norm": 0.14809326827526093, "learning_rate": 5.9790636641093523e-05, "loss": 0.0258, "step": 1316 }, { "epoch": 0.9879969992498124, "grad_norm": 0.12299007922410965, "learning_rate": 5.972641868698805e-05, "loss": 0.0258, "step": 1317 }, { "epoch": 0.9887471867966992, "grad_norm": 0.20244580507278442, "learning_rate": 5.966218405303234e-05, "loss": 0.0429, "step": 1318 }, { "epoch": 0.9894973743435859, "grad_norm": 0.09541887789964676, "learning_rate": 5.959793284938242e-05, "loss": 0.0285, "step": 1319 }, { "epoch": 0.9902475618904726, "grad_norm": 0.1416272521018982, "learning_rate": 5.953366518622279e-05, "loss": 0.0379, "step": 1320 }, { "epoch": 0.9909977494373593, "grad_norm": 0.10651285201311111, "learning_rate": 5.946938117376616e-05, "loss": 0.0292, "step": 1321 }, { "epoch": 0.991747936984246, "grad_norm": 0.12336979806423187, "learning_rate": 5.940508092225328e-05, "loss": 0.0266, "step": 1322 }, { "epoch": 0.9924981245311327, "grad_norm": 0.1195278912782669, "learning_rate": 5.9340764541952755e-05, "loss": 0.0298, "step": 1323 }, { "epoch": 0.9932483120780196, "grad_norm": 0.08796902745962143, "learning_rate": 5.9276432143160835e-05, "loss": 0.0184, "step": 1324 }, { "epoch": 0.9939984996249063, "grad_norm": 0.10521414875984192, "learning_rate": 5.921208383620126e-05, "loss": 0.0213, "step": 1325 }, { "epoch": 0.994748687171793, "grad_norm": 0.2021062821149826, "learning_rate": 5.9147719731425034e-05, "loss": 0.0414, "step": 1326 }, { "epoch": 0.9954988747186797, "grad_norm": 0.09227265417575836, "learning_rate": 5.908333993921027e-05, "loss": 0.0245, "step": 1327 }, { "epoch": 0.9962490622655664, "grad_norm": 0.09092900902032852, "learning_rate": 5.901894456996196e-05, "loss": 0.0254, "step": 1328 }, { "epoch": 0.9969992498124531, "grad_norm": 0.18778061866760254, "learning_rate": 5.895453373411182e-05, "loss": 0.0473, "step": 1329 }, { "epoch": 0.9977494373593399, "grad_norm": 0.20227041840553284, "learning_rate": 5.889010754211809e-05, "loss": 0.0473, "step": 1330 }, { "epoch": 0.9984996249062266, "grad_norm": 0.13871866464614868, "learning_rate": 5.882566610446534e-05, "loss": 0.0239, "step": 1331 }, { "epoch": 0.9992498124531133, "grad_norm": 0.11242667585611343, "learning_rate": 5.8761209531664306e-05, "loss": 0.0278, "step": 1332 }, { "epoch": 1.0, "grad_norm": 0.16068680584430695, "learning_rate": 5.869673793425168e-05, "loss": 0.0362, "step": 1333 }, { "epoch": 1.0007501875468867, "grad_norm": 0.09198249876499176, "learning_rate": 5.863225142278985e-05, "loss": 0.027, "step": 1334 }, { "epoch": 1.0015003750937734, "grad_norm": 0.09522329270839691, "learning_rate": 5.856775010786687e-05, "loss": 0.0186, "step": 1335 }, { "epoch": 1.0022505626406601, "grad_norm": 0.10609661787748337, "learning_rate": 5.850323410009614e-05, "loss": 0.0189, "step": 1336 }, { "epoch": 1.0030007501875469, "grad_norm": 0.1119598001241684, "learning_rate": 5.8438703510116256e-05, "loss": 0.0216, "step": 1337 }, { "epoch": 1.0037509377344336, "grad_norm": 0.21976079046726227, "learning_rate": 5.8374158448590823e-05, "loss": 0.0358, "step": 1338 }, { "epoch": 1.0045011252813203, "grad_norm": 0.06545736640691757, "learning_rate": 5.830959902620826e-05, "loss": 0.0168, "step": 1339 }, { "epoch": 1.005251312828207, "grad_norm": 0.11048869788646698, "learning_rate": 5.824502535368164e-05, "loss": 0.0216, "step": 1340 }, { "epoch": 1.0060015003750937, "grad_norm": 0.0982142835855484, "learning_rate": 5.818043754174843e-05, "loss": 0.0216, "step": 1341 }, { "epoch": 1.0067516879219804, "grad_norm": 0.13148756325244904, "learning_rate": 5.81158357011704e-05, "loss": 0.0278, "step": 1342 }, { "epoch": 1.0075018754688672, "grad_norm": 0.05980212241411209, "learning_rate": 5.80512199427333e-05, "loss": 0.0121, "step": 1343 }, { "epoch": 1.0082520630157539, "grad_norm": 0.09239038079977036, "learning_rate": 5.798659037724683e-05, "loss": 0.0203, "step": 1344 }, { "epoch": 1.0090022505626406, "grad_norm": 0.09601394832134247, "learning_rate": 5.792194711554429e-05, "loss": 0.0205, "step": 1345 }, { "epoch": 1.0097524381095273, "grad_norm": 0.08131718635559082, "learning_rate": 5.7857290268482555e-05, "loss": 0.0216, "step": 1346 }, { "epoch": 1.010502625656414, "grad_norm": 0.06746546179056168, "learning_rate": 5.779261994694173e-05, "loss": 0.0093, "step": 1347 }, { "epoch": 1.0112528132033007, "grad_norm": 0.14873553812503815, "learning_rate": 5.772793626182506e-05, "loss": 0.0256, "step": 1348 }, { "epoch": 1.0120030007501875, "grad_norm": 0.09325051307678223, "learning_rate": 5.766323932405866e-05, "loss": 0.0187, "step": 1349 }, { "epoch": 1.0127531882970742, "grad_norm": 0.16514313220977783, "learning_rate": 5.7598529244591436e-05, "loss": 0.0274, "step": 1350 }, { "epoch": 1.0135033758439609, "grad_norm": 0.139047771692276, "learning_rate": 5.7533806134394806e-05, "loss": 0.0194, "step": 1351 }, { "epoch": 1.0142535633908478, "grad_norm": 0.09351517260074615, "learning_rate": 5.746907010446252e-05, "loss": 0.0155, "step": 1352 }, { "epoch": 1.0150037509377345, "grad_norm": 0.09682505577802658, "learning_rate": 5.740432126581049e-05, "loss": 0.0171, "step": 1353 }, { "epoch": 1.0157539384846213, "grad_norm": 0.14819428324699402, "learning_rate": 5.73395597294766e-05, "loss": 0.0103, "step": 1354 }, { "epoch": 1.016504126031508, "grad_norm": 0.10570473223924637, "learning_rate": 5.727478560652053e-05, "loss": 0.0219, "step": 1355 }, { "epoch": 1.0172543135783947, "grad_norm": 0.1157071441411972, "learning_rate": 5.7209999008023496e-05, "loss": 0.025, "step": 1356 }, { "epoch": 1.0180045011252814, "grad_norm": 0.15539030730724335, "learning_rate": 5.7145200045088156e-05, "loss": 0.0244, "step": 1357 }, { "epoch": 1.0187546886721681, "grad_norm": 0.06445297598838806, "learning_rate": 5.7080388828838324e-05, "loss": 0.0138, "step": 1358 }, { "epoch": 1.0195048762190548, "grad_norm": 0.14610378444194794, "learning_rate": 5.701556547041888e-05, "loss": 0.026, "step": 1359 }, { "epoch": 1.0202550637659416, "grad_norm": 0.1265845000743866, "learning_rate": 5.695073008099547e-05, "loss": 0.0182, "step": 1360 }, { "epoch": 1.0210052513128283, "grad_norm": 0.16218756139278412, "learning_rate": 5.688588277175444e-05, "loss": 0.0211, "step": 1361 }, { "epoch": 1.021755438859715, "grad_norm": 0.11030188202857971, "learning_rate": 5.6821023653902517e-05, "loss": 0.0159, "step": 1362 }, { "epoch": 1.0225056264066017, "grad_norm": 0.0763898640871048, "learning_rate": 5.675615283866671e-05, "loss": 0.013, "step": 1363 }, { "epoch": 1.0232558139534884, "grad_norm": 0.15313951671123505, "learning_rate": 5.669127043729406e-05, "loss": 0.0282, "step": 1364 }, { "epoch": 1.0240060015003751, "grad_norm": 0.07773459702730179, "learning_rate": 5.662637656105152e-05, "loss": 0.0177, "step": 1365 }, { "epoch": 1.0247561890472618, "grad_norm": 0.1399940848350525, "learning_rate": 5.6561471321225676e-05, "loss": 0.0252, "step": 1366 }, { "epoch": 1.0255063765941486, "grad_norm": 0.14890314638614655, "learning_rate": 5.649655482912265e-05, "loss": 0.0334, "step": 1367 }, { "epoch": 1.0262565641410353, "grad_norm": 0.10625971853733063, "learning_rate": 5.6431627196067816e-05, "loss": 0.0205, "step": 1368 }, { "epoch": 1.027006751687922, "grad_norm": 0.059316691011190414, "learning_rate": 5.636668853340567e-05, "loss": 0.0101, "step": 1369 }, { "epoch": 1.0277569392348087, "grad_norm": 0.05441593378782272, "learning_rate": 5.6301738952499636e-05, "loss": 0.0118, "step": 1370 }, { "epoch": 1.0285071267816954, "grad_norm": 0.12986020743846893, "learning_rate": 5.623677856473183e-05, "loss": 0.0306, "step": 1371 }, { "epoch": 1.0292573143285821, "grad_norm": 0.08668938279151917, "learning_rate": 5.617180748150295e-05, "loss": 0.017, "step": 1372 }, { "epoch": 1.0300075018754689, "grad_norm": 0.055281803011894226, "learning_rate": 5.6106825814231953e-05, "loss": 0.014, "step": 1373 }, { "epoch": 1.0307576894223556, "grad_norm": 0.14348354935646057, "learning_rate": 5.604183367435606e-05, "loss": 0.0191, "step": 1374 }, { "epoch": 1.0315078769692423, "grad_norm": 0.15625552833080292, "learning_rate": 5.597683117333036e-05, "loss": 0.0277, "step": 1375 }, { "epoch": 1.032258064516129, "grad_norm": 0.0986480638384819, "learning_rate": 5.591181842262776e-05, "loss": 0.0175, "step": 1376 }, { "epoch": 1.0330082520630157, "grad_norm": 0.16133823990821838, "learning_rate": 5.584679553373869e-05, "loss": 0.0175, "step": 1377 }, { "epoch": 1.0337584396099024, "grad_norm": 0.08799955993890762, "learning_rate": 5.578176261817104e-05, "loss": 0.0159, "step": 1378 }, { "epoch": 1.0345086271567892, "grad_norm": 0.09160026162862778, "learning_rate": 5.571671978744983e-05, "loss": 0.0143, "step": 1379 }, { "epoch": 1.0352588147036759, "grad_norm": 0.10948408395051956, "learning_rate": 5.565166715311711e-05, "loss": 0.0188, "step": 1380 }, { "epoch": 1.0360090022505626, "grad_norm": 0.05880796164274216, "learning_rate": 5.558660482673177e-05, "loss": 0.0119, "step": 1381 }, { "epoch": 1.0367591897974493, "grad_norm": 0.1001565083861351, "learning_rate": 5.552153291986927e-05, "loss": 0.0227, "step": 1382 }, { "epoch": 1.037509377344336, "grad_norm": 0.06972013413906097, "learning_rate": 5.5456451544121523e-05, "loss": 0.0175, "step": 1383 }, { "epoch": 1.0382595648912227, "grad_norm": 0.11640258133411407, "learning_rate": 5.5391360811096684e-05, "loss": 0.0156, "step": 1384 }, { "epoch": 1.0390097524381094, "grad_norm": 0.0860258936882019, "learning_rate": 5.5326260832418955e-05, "loss": 0.0173, "step": 1385 }, { "epoch": 1.0397599399849962, "grad_norm": 0.11858560144901276, "learning_rate": 5.526115171972838e-05, "loss": 0.0224, "step": 1386 }, { "epoch": 1.0405101275318829, "grad_norm": 0.10665401071310043, "learning_rate": 5.5196033584680675e-05, "loss": 0.0222, "step": 1387 }, { "epoch": 1.0412603150787696, "grad_norm": 0.08180026710033417, "learning_rate": 5.5130906538947034e-05, "loss": 0.0151, "step": 1388 }, { "epoch": 1.0420105026256563, "grad_norm": 0.10901845246553421, "learning_rate": 5.506577069421395e-05, "loss": 0.0143, "step": 1389 }, { "epoch": 1.042760690172543, "grad_norm": 0.14608041942119598, "learning_rate": 5.5000626162182944e-05, "loss": 0.0217, "step": 1390 }, { "epoch": 1.04351087771943, "grad_norm": 0.1770605593919754, "learning_rate": 5.49354730545705e-05, "loss": 0.0321, "step": 1391 }, { "epoch": 1.0442610652663167, "grad_norm": 0.11851367354393005, "learning_rate": 5.487031148310775e-05, "loss": 0.0147, "step": 1392 }, { "epoch": 1.0450112528132034, "grad_norm": 0.10560742020606995, "learning_rate": 5.480514155954042e-05, "loss": 0.0228, "step": 1393 }, { "epoch": 1.04576144036009, "grad_norm": 0.10256641358137131, "learning_rate": 5.4739963395628456e-05, "loss": 0.0153, "step": 1394 }, { "epoch": 1.0465116279069768, "grad_norm": 0.08817125111818314, "learning_rate": 5.4674777103146045e-05, "loss": 0.0142, "step": 1395 }, { "epoch": 1.0472618154538635, "grad_norm": 0.12297326326370239, "learning_rate": 5.460958279388122e-05, "loss": 0.0196, "step": 1396 }, { "epoch": 1.0480120030007503, "grad_norm": 0.07521190494298935, "learning_rate": 5.4544380579635824e-05, "loss": 0.0167, "step": 1397 }, { "epoch": 1.048762190547637, "grad_norm": 0.09659700840711594, "learning_rate": 5.447917057222523e-05, "loss": 0.0198, "step": 1398 }, { "epoch": 1.0495123780945237, "grad_norm": 0.09551558643579483, "learning_rate": 5.441395288347818e-05, "loss": 0.0156, "step": 1399 }, { "epoch": 1.0502625656414104, "grad_norm": 0.08930996060371399, "learning_rate": 5.434872762523658e-05, "loss": 0.0175, "step": 1400 }, { "epoch": 1.0502625656414104, "eval_loss": 0.03195345029234886, "eval_runtime": 5.1212, "eval_samples_per_second": 10.544, "eval_steps_per_second": 2.734, "step": 1400 }, { "epoch": 1.0510127531882971, "grad_norm": 0.08074672520160675, "learning_rate": 5.4283494909355314e-05, "loss": 0.014, "step": 1401 }, { "epoch": 1.0517629407351838, "grad_norm": 0.17933864891529083, "learning_rate": 5.42182548477021e-05, "loss": 0.0193, "step": 1402 }, { "epoch": 1.0525131282820706, "grad_norm": 0.11653754115104675, "learning_rate": 5.41530075521572e-05, "loss": 0.0163, "step": 1403 }, { "epoch": 1.0532633158289573, "grad_norm": 0.087211973965168, "learning_rate": 5.4087753134613294e-05, "loss": 0.0156, "step": 1404 }, { "epoch": 1.054013503375844, "grad_norm": 0.1614168882369995, "learning_rate": 5.40224917069753e-05, "loss": 0.0166, "step": 1405 }, { "epoch": 1.0547636909227307, "grad_norm": 0.10352689027786255, "learning_rate": 5.3957223381160126e-05, "loss": 0.0173, "step": 1406 }, { "epoch": 1.0555138784696174, "grad_norm": 0.1243826299905777, "learning_rate": 5.389194826909653e-05, "loss": 0.0165, "step": 1407 }, { "epoch": 1.0562640660165041, "grad_norm": 0.31279873847961426, "learning_rate": 5.382666648272489e-05, "loss": 0.0503, "step": 1408 }, { "epoch": 1.0570142535633908, "grad_norm": 0.10250795632600784, "learning_rate": 5.3761378133997044e-05, "loss": 0.0199, "step": 1409 }, { "epoch": 1.0577644411102776, "grad_norm": 0.10815533995628357, "learning_rate": 5.3696083334876105e-05, "loss": 0.0217, "step": 1410 }, { "epoch": 1.0585146286571643, "grad_norm": 0.09339367598295212, "learning_rate": 5.363078219733619e-05, "loss": 0.015, "step": 1411 }, { "epoch": 1.059264816204051, "grad_norm": 0.12286090105772018, "learning_rate": 5.3565474833362353e-05, "loss": 0.0152, "step": 1412 }, { "epoch": 1.0600150037509377, "grad_norm": 0.08419165015220642, "learning_rate": 5.3500161354950274e-05, "loss": 0.0136, "step": 1413 }, { "epoch": 1.0607651912978244, "grad_norm": 0.1648758202791214, "learning_rate": 5.3434841874106124e-05, "loss": 0.0245, "step": 1414 }, { "epoch": 1.0615153788447111, "grad_norm": 0.08925841748714447, "learning_rate": 5.3369516502846396e-05, "loss": 0.0206, "step": 1415 }, { "epoch": 1.0622655663915979, "grad_norm": 0.1235809400677681, "learning_rate": 5.330418535319768e-05, "loss": 0.0239, "step": 1416 }, { "epoch": 1.0630157539384846, "grad_norm": 0.09026822447776794, "learning_rate": 5.323884853719645e-05, "loss": 0.0128, "step": 1417 }, { "epoch": 1.0637659414853713, "grad_norm": 0.07726217061281204, "learning_rate": 5.31735061668889e-05, "loss": 0.0139, "step": 1418 }, { "epoch": 1.064516129032258, "grad_norm": 0.12053043395280838, "learning_rate": 5.3108158354330795e-05, "loss": 0.0218, "step": 1419 }, { "epoch": 1.0652663165791447, "grad_norm": 0.08617470413446426, "learning_rate": 5.304280521158716e-05, "loss": 0.0118, "step": 1420 }, { "epoch": 1.0660165041260314, "grad_norm": 0.17755377292633057, "learning_rate": 5.2977446850732236e-05, "loss": 0.0183, "step": 1421 }, { "epoch": 1.0667666916729182, "grad_norm": 0.12063173204660416, "learning_rate": 5.291208338384913e-05, "loss": 0.0224, "step": 1422 }, { "epoch": 1.0675168792198049, "grad_norm": 0.1364491879940033, "learning_rate": 5.2846714923029795e-05, "loss": 0.0197, "step": 1423 }, { "epoch": 1.0682670667666916, "grad_norm": 0.132155641913414, "learning_rate": 5.278134158037469e-05, "loss": 0.0181, "step": 1424 }, { "epoch": 1.0690172543135783, "grad_norm": 0.0874357596039772, "learning_rate": 5.2715963467992656e-05, "loss": 0.0146, "step": 1425 }, { "epoch": 1.069767441860465, "grad_norm": 0.17030027508735657, "learning_rate": 5.265058069800072e-05, "loss": 0.0358, "step": 1426 }, { "epoch": 1.0705176294073517, "grad_norm": 0.10025857388973236, "learning_rate": 5.258519338252389e-05, "loss": 0.0221, "step": 1427 }, { "epoch": 1.0712678169542387, "grad_norm": 0.10002993047237396, "learning_rate": 5.251980163369499e-05, "loss": 0.0142, "step": 1428 }, { "epoch": 1.0720180045011252, "grad_norm": 0.0852247104048729, "learning_rate": 5.24544055636544e-05, "loss": 0.0102, "step": 1429 }, { "epoch": 1.072768192048012, "grad_norm": 0.08871036022901535, "learning_rate": 5.2389005284549954e-05, "loss": 0.0136, "step": 1430 }, { "epoch": 1.0735183795948988, "grad_norm": 0.10242506861686707, "learning_rate": 5.232360090853671e-05, "loss": 0.022, "step": 1431 }, { "epoch": 1.0742685671417855, "grad_norm": 0.11138740926980972, "learning_rate": 5.225819254777671e-05, "loss": 0.0167, "step": 1432 }, { "epoch": 1.0750187546886723, "grad_norm": 0.1044057235121727, "learning_rate": 5.219278031443886e-05, "loss": 0.0125, "step": 1433 }, { "epoch": 1.075768942235559, "grad_norm": 0.1448916643857956, "learning_rate": 5.21273643206987e-05, "loss": 0.0182, "step": 1434 }, { "epoch": 1.0765191297824457, "grad_norm": 0.11542975157499313, "learning_rate": 5.206194467873822e-05, "loss": 0.0178, "step": 1435 }, { "epoch": 1.0772693173293324, "grad_norm": 0.10346329212188721, "learning_rate": 5.1996521500745645e-05, "loss": 0.0229, "step": 1436 }, { "epoch": 1.0780195048762191, "grad_norm": 0.1779703050851822, "learning_rate": 5.19310948989153e-05, "loss": 0.0219, "step": 1437 }, { "epoch": 1.0787696924231058, "grad_norm": 0.11344017088413239, "learning_rate": 5.186566498544737e-05, "loss": 0.0179, "step": 1438 }, { "epoch": 1.0795198799699925, "grad_norm": 0.08599342405796051, "learning_rate": 5.18002318725477e-05, "loss": 0.0121, "step": 1439 }, { "epoch": 1.0802700675168793, "grad_norm": 0.13371822237968445, "learning_rate": 5.173479567242765e-05, "loss": 0.018, "step": 1440 }, { "epoch": 1.081020255063766, "grad_norm": 0.16816803812980652, "learning_rate": 5.1669356497303835e-05, "loss": 0.0294, "step": 1441 }, { "epoch": 1.0817704426106527, "grad_norm": 0.11909490823745728, "learning_rate": 5.1603914459398016e-05, "loss": 0.0217, "step": 1442 }, { "epoch": 1.0825206301575394, "grad_norm": 0.1320752054452896, "learning_rate": 5.153846967093684e-05, "loss": 0.0255, "step": 1443 }, { "epoch": 1.0832708177044261, "grad_norm": 0.10966454446315765, "learning_rate": 5.1473022244151684e-05, "loss": 0.0141, "step": 1444 }, { "epoch": 1.0840210052513128, "grad_norm": 0.11884035170078278, "learning_rate": 5.140757229127842e-05, "loss": 0.016, "step": 1445 }, { "epoch": 1.0847711927981996, "grad_norm": 0.12369532883167267, "learning_rate": 5.1342119924557275e-05, "loss": 0.0189, "step": 1446 }, { "epoch": 1.0855213803450863, "grad_norm": 0.18551528453826904, "learning_rate": 5.127666525623264e-05, "loss": 0.0324, "step": 1447 }, { "epoch": 1.086271567891973, "grad_norm": 0.16022326052188873, "learning_rate": 5.121120839855279e-05, "loss": 0.0217, "step": 1448 }, { "epoch": 1.0870217554388597, "grad_norm": 0.19785533845424652, "learning_rate": 5.114574946376982e-05, "loss": 0.0234, "step": 1449 }, { "epoch": 1.0877719429857464, "grad_norm": 0.09503129124641418, "learning_rate": 5.1080288564139325e-05, "loss": 0.0131, "step": 1450 }, { "epoch": 1.0885221305326331, "grad_norm": 0.27920621633529663, "learning_rate": 5.101482581192033e-05, "loss": 0.0254, "step": 1451 }, { "epoch": 1.0892723180795199, "grad_norm": 0.11794818937778473, "learning_rate": 5.0949361319374996e-05, "loss": 0.0232, "step": 1452 }, { "epoch": 1.0900225056264066, "grad_norm": 0.1276724487543106, "learning_rate": 5.0883895198768494e-05, "loss": 0.0217, "step": 1453 }, { "epoch": 1.0907726931732933, "grad_norm": 0.08574303984642029, "learning_rate": 5.0818427562368764e-05, "loss": 0.0126, "step": 1454 }, { "epoch": 1.09152288072018, "grad_norm": 0.09915383160114288, "learning_rate": 5.0752958522446356e-05, "loss": 0.0235, "step": 1455 }, { "epoch": 1.0922730682670667, "grad_norm": 0.11475180089473724, "learning_rate": 5.0687488191274215e-05, "loss": 0.0229, "step": 1456 }, { "epoch": 1.0930232558139534, "grad_norm": 0.17217311263084412, "learning_rate": 5.0622016681127526e-05, "loss": 0.0183, "step": 1457 }, { "epoch": 1.0937734433608401, "grad_norm": 0.09383442997932434, "learning_rate": 5.055654410428349e-05, "loss": 0.0169, "step": 1458 }, { "epoch": 1.0945236309077269, "grad_norm": 0.10978903621435165, "learning_rate": 5.0491070573021116e-05, "loss": 0.0179, "step": 1459 }, { "epoch": 1.0952738184546136, "grad_norm": 0.13406389951705933, "learning_rate": 5.0425596199621064e-05, "loss": 0.0268, "step": 1460 }, { "epoch": 1.0960240060015003, "grad_norm": 0.09866870194673538, "learning_rate": 5.036012109636543e-05, "loss": 0.015, "step": 1461 }, { "epoch": 1.096774193548387, "grad_norm": 0.08531273901462555, "learning_rate": 5.0294645375537594e-05, "loss": 0.0162, "step": 1462 }, { "epoch": 1.0975243810952737, "grad_norm": 0.12082972377538681, "learning_rate": 5.022916914942195e-05, "loss": 0.0169, "step": 1463 }, { "epoch": 1.0982745686421604, "grad_norm": 0.1270955353975296, "learning_rate": 5.0163692530303774e-05, "loss": 0.0183, "step": 1464 }, { "epoch": 1.0990247561890472, "grad_norm": 0.17291484773159027, "learning_rate": 5.009821563046903e-05, "loss": 0.0209, "step": 1465 }, { "epoch": 1.0997749437359339, "grad_norm": 0.13547436892986298, "learning_rate": 5.003273856220415e-05, "loss": 0.022, "step": 1466 }, { "epoch": 1.1005251312828208, "grad_norm": 0.10618306696414948, "learning_rate": 4.996726143779586e-05, "loss": 0.0242, "step": 1467 }, { "epoch": 1.1012753188297075, "grad_norm": 0.1009022668004036, "learning_rate": 4.990178436953099e-05, "loss": 0.0156, "step": 1468 }, { "epoch": 1.1020255063765942, "grad_norm": 0.13409952819347382, "learning_rate": 4.9836307469696244e-05, "loss": 0.0177, "step": 1469 }, { "epoch": 1.102775693923481, "grad_norm": 0.07526332139968872, "learning_rate": 4.9770830850578075e-05, "loss": 0.0109, "step": 1470 }, { "epoch": 1.1035258814703677, "grad_norm": 0.09260010719299316, "learning_rate": 4.9705354624462424e-05, "loss": 0.0161, "step": 1471 }, { "epoch": 1.1042760690172544, "grad_norm": 0.11654193699359894, "learning_rate": 4.963987890363458e-05, "loss": 0.0219, "step": 1472 }, { "epoch": 1.105026256564141, "grad_norm": 0.08214106410741806, "learning_rate": 4.957440380037896e-05, "loss": 0.0173, "step": 1473 }, { "epoch": 1.1057764441110278, "grad_norm": 0.13250161707401276, "learning_rate": 4.9508929426978896e-05, "loss": 0.0202, "step": 1474 }, { "epoch": 1.1065266316579145, "grad_norm": 0.061449550092220306, "learning_rate": 4.944345589571651e-05, "loss": 0.0101, "step": 1475 }, { "epoch": 1.1072768192048013, "grad_norm": 0.13920247554779053, "learning_rate": 4.937798331887248e-05, "loss": 0.0229, "step": 1476 }, { "epoch": 1.108027006751688, "grad_norm": 0.0696333572268486, "learning_rate": 4.931251180872579e-05, "loss": 0.0143, "step": 1477 }, { "epoch": 1.1087771942985747, "grad_norm": 0.2159297913312912, "learning_rate": 4.9247041477553656e-05, "loss": 0.035, "step": 1478 }, { "epoch": 1.1095273818454614, "grad_norm": 0.17921189963817596, "learning_rate": 4.9181572437631255e-05, "loss": 0.0281, "step": 1479 }, { "epoch": 1.1102775693923481, "grad_norm": 0.11478715389966965, "learning_rate": 4.911610480123151e-05, "loss": 0.0205, "step": 1480 }, { "epoch": 1.1110277569392348, "grad_norm": 0.15886938571929932, "learning_rate": 4.905063868062501e-05, "loss": 0.0239, "step": 1481 }, { "epoch": 1.1117779444861215, "grad_norm": 0.13714122772216797, "learning_rate": 4.898517418807968e-05, "loss": 0.0223, "step": 1482 }, { "epoch": 1.1125281320330083, "grad_norm": 0.10357560217380524, "learning_rate": 4.891971143586069e-05, "loss": 0.0209, "step": 1483 }, { "epoch": 1.113278319579895, "grad_norm": 0.09297052770853043, "learning_rate": 4.88542505362302e-05, "loss": 0.0175, "step": 1484 }, { "epoch": 1.1140285071267817, "grad_norm": 0.12628571689128876, "learning_rate": 4.878879160144723e-05, "loss": 0.0198, "step": 1485 }, { "epoch": 1.1147786946736684, "grad_norm": 0.10400816053152084, "learning_rate": 4.872333474376739e-05, "loss": 0.0223, "step": 1486 }, { "epoch": 1.1155288822205551, "grad_norm": 0.0815277174115181, "learning_rate": 4.865788007544274e-05, "loss": 0.0126, "step": 1487 }, { "epoch": 1.1162790697674418, "grad_norm": 0.07159218937158585, "learning_rate": 4.859242770872158e-05, "loss": 0.0096, "step": 1488 }, { "epoch": 1.1170292573143286, "grad_norm": 0.1045546755194664, "learning_rate": 4.852697775584833e-05, "loss": 0.0158, "step": 1489 }, { "epoch": 1.1177794448612153, "grad_norm": 0.11663933098316193, "learning_rate": 4.846153032906316e-05, "loss": 0.0209, "step": 1490 }, { "epoch": 1.118529632408102, "grad_norm": 0.14321015775203705, "learning_rate": 4.8396085540601995e-05, "loss": 0.0244, "step": 1491 }, { "epoch": 1.1192798199549887, "grad_norm": 0.08914124220609665, "learning_rate": 4.833064350269617e-05, "loss": 0.018, "step": 1492 }, { "epoch": 1.1200300075018754, "grad_norm": 0.09482667595148087, "learning_rate": 4.826520432757236e-05, "loss": 0.0205, "step": 1493 }, { "epoch": 1.1207801950487621, "grad_norm": 0.11351495236158371, "learning_rate": 4.8199768127452314e-05, "loss": 0.0231, "step": 1494 }, { "epoch": 1.1215303825956489, "grad_norm": 0.17555253207683563, "learning_rate": 4.813433501455264e-05, "loss": 0.0224, "step": 1495 }, { "epoch": 1.1222805701425356, "grad_norm": 0.16453514993190765, "learning_rate": 4.806890510108471e-05, "loss": 0.0253, "step": 1496 }, { "epoch": 1.1230307576894223, "grad_norm": 0.1721792072057724, "learning_rate": 4.800347849925437e-05, "loss": 0.0266, "step": 1497 }, { "epoch": 1.123780945236309, "grad_norm": 0.13679584860801697, "learning_rate": 4.793805532126181e-05, "loss": 0.0189, "step": 1498 }, { "epoch": 1.1245311327831957, "grad_norm": 0.11366323381662369, "learning_rate": 4.787263567930132e-05, "loss": 0.0101, "step": 1499 }, { "epoch": 1.1252813203300824, "grad_norm": 0.17250248789787292, "learning_rate": 4.780721968556115e-05, "loss": 0.0304, "step": 1500 }, { "epoch": 1.1260315078769692, "grad_norm": 0.09911566227674484, "learning_rate": 4.774180745222331e-05, "loss": 0.0169, "step": 1501 }, { "epoch": 1.1267816954238559, "grad_norm": 0.08456264436244965, "learning_rate": 4.7676399091463296e-05, "loss": 0.0142, "step": 1502 }, { "epoch": 1.1275318829707426, "grad_norm": 0.14514417946338654, "learning_rate": 4.7610994715450044e-05, "loss": 0.0175, "step": 1503 }, { "epoch": 1.1282820705176295, "grad_norm": 0.1287779062986374, "learning_rate": 4.754559443634561e-05, "loss": 0.0159, "step": 1504 }, { "epoch": 1.129032258064516, "grad_norm": 0.13349425792694092, "learning_rate": 4.748019836630503e-05, "loss": 0.0299, "step": 1505 }, { "epoch": 1.129782445611403, "grad_norm": 0.11371131986379623, "learning_rate": 4.7414806617476124e-05, "loss": 0.0211, "step": 1506 }, { "epoch": 1.1305326331582897, "grad_norm": 0.13470502197742462, "learning_rate": 4.7349419301999294e-05, "loss": 0.0221, "step": 1507 }, { "epoch": 1.1312828207051764, "grad_norm": 0.10426650941371918, "learning_rate": 4.7284036532007356e-05, "loss": 0.0197, "step": 1508 }, { "epoch": 1.132033008252063, "grad_norm": 0.11969224363565445, "learning_rate": 4.721865841962533e-05, "loss": 0.0212, "step": 1509 }, { "epoch": 1.1327831957989498, "grad_norm": 0.08736421167850494, "learning_rate": 4.715328507697021e-05, "loss": 0.0165, "step": 1510 }, { "epoch": 1.1335333833458365, "grad_norm": 0.12223305553197861, "learning_rate": 4.7087916616150886e-05, "loss": 0.0209, "step": 1511 }, { "epoch": 1.1342835708927232, "grad_norm": 0.1267385184764862, "learning_rate": 4.702255314926779e-05, "loss": 0.0205, "step": 1512 }, { "epoch": 1.13503375843961, "grad_norm": 0.08564191311597824, "learning_rate": 4.695719478841286e-05, "loss": 0.0175, "step": 1513 }, { "epoch": 1.1357839459864967, "grad_norm": 0.12604007124900818, "learning_rate": 4.6891841645669224e-05, "loss": 0.0168, "step": 1514 }, { "epoch": 1.1365341335333834, "grad_norm": 0.08834186941385269, "learning_rate": 4.6826493833111104e-05, "loss": 0.012, "step": 1515 }, { "epoch": 1.13728432108027, "grad_norm": 0.1784517616033554, "learning_rate": 4.676115146280356e-05, "loss": 0.027, "step": 1516 }, { "epoch": 1.1380345086271568, "grad_norm": 0.07656119018793106, "learning_rate": 4.669581464680233e-05, "loss": 0.0157, "step": 1517 }, { "epoch": 1.1387846961740435, "grad_norm": 0.09516746550798416, "learning_rate": 4.66304834971536e-05, "loss": 0.0144, "step": 1518 }, { "epoch": 1.1395348837209303, "grad_norm": 0.08436454832553864, "learning_rate": 4.656515812589389e-05, "loss": 0.0111, "step": 1519 }, { "epoch": 1.140285071267817, "grad_norm": 0.06468910723924637, "learning_rate": 4.6499838645049744e-05, "loss": 0.0107, "step": 1520 }, { "epoch": 1.1410352588147037, "grad_norm": 0.11399751156568527, "learning_rate": 4.643452516663766e-05, "loss": 0.0216, "step": 1521 }, { "epoch": 1.1417854463615904, "grad_norm": 0.13065651059150696, "learning_rate": 4.636921780266381e-05, "loss": 0.022, "step": 1522 }, { "epoch": 1.1425356339084771, "grad_norm": 0.07837221771478653, "learning_rate": 4.63039166651239e-05, "loss": 0.0149, "step": 1523 }, { "epoch": 1.1432858214553638, "grad_norm": 0.10143603384494781, "learning_rate": 4.623862186600297e-05, "loss": 0.0136, "step": 1524 }, { "epoch": 1.1440360090022506, "grad_norm": 0.09773246943950653, "learning_rate": 4.617333351727513e-05, "loss": 0.0174, "step": 1525 }, { "epoch": 1.1447861965491373, "grad_norm": 0.11636144667863846, "learning_rate": 4.61080517309035e-05, "loss": 0.024, "step": 1526 }, { "epoch": 1.145536384096024, "grad_norm": 0.10740485787391663, "learning_rate": 4.604277661883989e-05, "loss": 0.0186, "step": 1527 }, { "epoch": 1.1462865716429107, "grad_norm": 0.085996612906456, "learning_rate": 4.5977508293024726e-05, "loss": 0.0108, "step": 1528 }, { "epoch": 1.1470367591897974, "grad_norm": 0.1368994265794754, "learning_rate": 4.591224686538672e-05, "loss": 0.0194, "step": 1529 }, { "epoch": 1.1477869467366841, "grad_norm": 0.17650572955608368, "learning_rate": 4.584699244784281e-05, "loss": 0.0253, "step": 1530 }, { "epoch": 1.1485371342835708, "grad_norm": 0.19057302176952362, "learning_rate": 4.578174515229789e-05, "loss": 0.0376, "step": 1531 }, { "epoch": 1.1492873218304576, "grad_norm": 0.07737153023481369, "learning_rate": 4.5716505090644684e-05, "loss": 0.015, "step": 1532 }, { "epoch": 1.1500375093773443, "grad_norm": 0.1411726176738739, "learning_rate": 4.5651272374763423e-05, "loss": 0.0221, "step": 1533 }, { "epoch": 1.150787696924231, "grad_norm": 0.10764733701944351, "learning_rate": 4.558604711652183e-05, "loss": 0.0175, "step": 1534 }, { "epoch": 1.1515378844711177, "grad_norm": 0.11171084642410278, "learning_rate": 4.552082942777478e-05, "loss": 0.0182, "step": 1535 }, { "epoch": 1.1522880720180044, "grad_norm": 0.12677061557769775, "learning_rate": 4.545561942036418e-05, "loss": 0.0183, "step": 1536 }, { "epoch": 1.1530382595648911, "grad_norm": 0.12518805265426636, "learning_rate": 4.5390417206118784e-05, "loss": 0.0187, "step": 1537 }, { "epoch": 1.1537884471117779, "grad_norm": 0.11542828381061554, "learning_rate": 4.5325222896853966e-05, "loss": 0.0224, "step": 1538 }, { "epoch": 1.1545386346586646, "grad_norm": 0.14305250346660614, "learning_rate": 4.5260036604371556e-05, "loss": 0.0222, "step": 1539 }, { "epoch": 1.1552888222055513, "grad_norm": 0.12537597119808197, "learning_rate": 4.51948584404596e-05, "loss": 0.0173, "step": 1540 }, { "epoch": 1.1560390097524382, "grad_norm": 0.0862974002957344, "learning_rate": 4.5129688516892264e-05, "loss": 0.0157, "step": 1541 }, { "epoch": 1.1567891972993247, "grad_norm": 0.1084955483675003, "learning_rate": 4.506452694542953e-05, "loss": 0.0198, "step": 1542 }, { "epoch": 1.1575393848462117, "grad_norm": 0.10898890346288681, "learning_rate": 4.499937383781708e-05, "loss": 0.0166, "step": 1543 }, { "epoch": 1.1582895723930982, "grad_norm": 0.07213619351387024, "learning_rate": 4.493422930578605e-05, "loss": 0.0117, "step": 1544 }, { "epoch": 1.159039759939985, "grad_norm": 0.11787864565849304, "learning_rate": 4.486909346105296e-05, "loss": 0.0235, "step": 1545 }, { "epoch": 1.1597899474868718, "grad_norm": 0.12363395094871521, "learning_rate": 4.480396641531932e-05, "loss": 0.0221, "step": 1546 }, { "epoch": 1.1605401350337585, "grad_norm": 0.10760311782360077, "learning_rate": 4.4738848280271626e-05, "loss": 0.0176, "step": 1547 }, { "epoch": 1.1612903225806452, "grad_norm": 0.08016765117645264, "learning_rate": 4.467373916758105e-05, "loss": 0.0098, "step": 1548 }, { "epoch": 1.162040510127532, "grad_norm": 0.19417689740657806, "learning_rate": 4.460863918890333e-05, "loss": 0.0238, "step": 1549 }, { "epoch": 1.1627906976744187, "grad_norm": 0.13339464366436005, "learning_rate": 4.454354845587849e-05, "loss": 0.0126, "step": 1550 }, { "epoch": 1.1635408852213054, "grad_norm": 0.09546097368001938, "learning_rate": 4.4478467080130734e-05, "loss": 0.0176, "step": 1551 }, { "epoch": 1.164291072768192, "grad_norm": 0.12648361921310425, "learning_rate": 4.4413395173268243e-05, "loss": 0.0207, "step": 1552 }, { "epoch": 1.1650412603150788, "grad_norm": 0.14633196592330933, "learning_rate": 4.43483328468829e-05, "loss": 0.0188, "step": 1553 }, { "epoch": 1.1657914478619655, "grad_norm": 0.12665541470050812, "learning_rate": 4.4283280212550194e-05, "loss": 0.0186, "step": 1554 }, { "epoch": 1.1665416354088523, "grad_norm": 0.08633586019277573, "learning_rate": 4.421823738182898e-05, "loss": 0.0118, "step": 1555 }, { "epoch": 1.167291822955739, "grad_norm": 0.09715209156274796, "learning_rate": 4.4153204466261334e-05, "loss": 0.0156, "step": 1556 }, { "epoch": 1.1680420105026257, "grad_norm": 0.09635809808969498, "learning_rate": 4.408818157737227e-05, "loss": 0.0135, "step": 1557 }, { "epoch": 1.1687921980495124, "grad_norm": 0.14372672140598297, "learning_rate": 4.402316882666964e-05, "loss": 0.0222, "step": 1558 }, { "epoch": 1.1695423855963991, "grad_norm": 0.12557637691497803, "learning_rate": 4.395816632564393e-05, "loss": 0.0183, "step": 1559 }, { "epoch": 1.1702925731432858, "grad_norm": 0.13715486228466034, "learning_rate": 4.3893174185768045e-05, "loss": 0.0242, "step": 1560 }, { "epoch": 1.1710427606901725, "grad_norm": 0.14162661135196686, "learning_rate": 4.382819251849707e-05, "loss": 0.0181, "step": 1561 }, { "epoch": 1.1717929482370593, "grad_norm": 0.08794243633747101, "learning_rate": 4.376322143526818e-05, "loss": 0.0149, "step": 1562 }, { "epoch": 1.172543135783946, "grad_norm": 0.13961324095726013, "learning_rate": 4.3698261047500376e-05, "loss": 0.0206, "step": 1563 }, { "epoch": 1.1732933233308327, "grad_norm": 0.1918809711933136, "learning_rate": 4.3633311466594345e-05, "loss": 0.019, "step": 1564 }, { "epoch": 1.1740435108777194, "grad_norm": 0.12562909722328186, "learning_rate": 4.3568372803932195e-05, "loss": 0.0167, "step": 1565 }, { "epoch": 1.1747936984246061, "grad_norm": 0.18379457294940948, "learning_rate": 4.3503445170877354e-05, "loss": 0.0223, "step": 1566 }, { "epoch": 1.1755438859714928, "grad_norm": 0.09681471437215805, "learning_rate": 4.343852867877433e-05, "loss": 0.0178, "step": 1567 }, { "epoch": 1.1762940735183796, "grad_norm": 0.17194296419620514, "learning_rate": 4.3373623438948496e-05, "loss": 0.0291, "step": 1568 }, { "epoch": 1.1770442610652663, "grad_norm": 0.08721692860126495, "learning_rate": 4.330872956270596e-05, "loss": 0.0164, "step": 1569 }, { "epoch": 1.177794448612153, "grad_norm": 0.13167378306388855, "learning_rate": 4.324384716133332e-05, "loss": 0.0189, "step": 1570 }, { "epoch": 1.1785446361590397, "grad_norm": 0.14486682415008545, "learning_rate": 4.317897634609751e-05, "loss": 0.0174, "step": 1571 }, { "epoch": 1.1792948237059264, "grad_norm": 0.2041822224855423, "learning_rate": 4.3114117228245565e-05, "loss": 0.0306, "step": 1572 }, { "epoch": 1.1800450112528131, "grad_norm": 0.10399215668439865, "learning_rate": 4.304926991900453e-05, "loss": 0.0162, "step": 1573 }, { "epoch": 1.1807951987996999, "grad_norm": 0.11682548373937607, "learning_rate": 4.298443452958113e-05, "loss": 0.0169, "step": 1574 }, { "epoch": 1.1815453863465866, "grad_norm": 0.10943344980478287, "learning_rate": 4.291961117116168e-05, "loss": 0.0154, "step": 1575 }, { "epoch": 1.1822955738934733, "grad_norm": 0.07967270910739899, "learning_rate": 4.285479995491185e-05, "loss": 0.0108, "step": 1576 }, { "epoch": 1.18304576144036, "grad_norm": 0.11803645640611649, "learning_rate": 4.279000099197651e-05, "loss": 0.0173, "step": 1577 }, { "epoch": 1.1837959489872467, "grad_norm": 0.06087150797247887, "learning_rate": 4.272521439347947e-05, "loss": 0.0098, "step": 1578 }, { "epoch": 1.1845461365341334, "grad_norm": 0.08632830530405045, "learning_rate": 4.26604402705234e-05, "loss": 0.0122, "step": 1579 }, { "epoch": 1.1852963240810204, "grad_norm": 0.09882350265979767, "learning_rate": 4.259567873418952e-05, "loss": 0.0162, "step": 1580 }, { "epoch": 1.1860465116279069, "grad_norm": 0.1528325378894806, "learning_rate": 4.25309298955375e-05, "loss": 0.0219, "step": 1581 }, { "epoch": 1.1867966991747938, "grad_norm": 0.10851482301950455, "learning_rate": 4.246619386560521e-05, "loss": 0.011, "step": 1582 }, { "epoch": 1.1875468867216805, "grad_norm": 0.05488813295960426, "learning_rate": 4.240147075540858e-05, "loss": 0.0105, "step": 1583 }, { "epoch": 1.1882970742685672, "grad_norm": 0.07339299470186234, "learning_rate": 4.233676067594137e-05, "loss": 0.0129, "step": 1584 }, { "epoch": 1.189047261815454, "grad_norm": 0.13282492756843567, "learning_rate": 4.227206373817497e-05, "loss": 0.0246, "step": 1585 }, { "epoch": 1.1897974493623407, "grad_norm": 0.1161782294511795, "learning_rate": 4.220738005305827e-05, "loss": 0.0171, "step": 1586 }, { "epoch": 1.1905476369092274, "grad_norm": 0.11853188276290894, "learning_rate": 4.214270973151745e-05, "loss": 0.0259, "step": 1587 }, { "epoch": 1.191297824456114, "grad_norm": 0.06727686524391174, "learning_rate": 4.207805288445571e-05, "loss": 0.0107, "step": 1588 }, { "epoch": 1.1920480120030008, "grad_norm": 0.14418961107730865, "learning_rate": 4.201340962275318e-05, "loss": 0.0303, "step": 1589 }, { "epoch": 1.1927981995498875, "grad_norm": 0.06462909281253815, "learning_rate": 4.194878005726671e-05, "loss": 0.0101, "step": 1590 }, { "epoch": 1.1935483870967742, "grad_norm": 0.11350913345813751, "learning_rate": 4.1884164298829615e-05, "loss": 0.0128, "step": 1591 }, { "epoch": 1.194298574643661, "grad_norm": 0.11162211000919342, "learning_rate": 4.181956245825158e-05, "loss": 0.014, "step": 1592 }, { "epoch": 1.1950487621905477, "grad_norm": 0.13950911164283752, "learning_rate": 4.1754974646318365e-05, "loss": 0.019, "step": 1593 }, { "epoch": 1.1957989497374344, "grad_norm": 0.17123188078403473, "learning_rate": 4.1690400973791756e-05, "loss": 0.0229, "step": 1594 }, { "epoch": 1.196549137284321, "grad_norm": 0.09660036861896515, "learning_rate": 4.1625841551409195e-05, "loss": 0.0164, "step": 1595 }, { "epoch": 1.1972993248312078, "grad_norm": 0.19667930901050568, "learning_rate": 4.156129648988376e-05, "loss": 0.0205, "step": 1596 }, { "epoch": 1.1980495123780945, "grad_norm": 0.047219861298799515, "learning_rate": 4.149676589990388e-05, "loss": 0.008, "step": 1597 }, { "epoch": 1.1987996999249813, "grad_norm": 0.17346802353858948, "learning_rate": 4.143224989213315e-05, "loss": 0.0238, "step": 1598 }, { "epoch": 1.199549887471868, "grad_norm": 0.11115290224552155, "learning_rate": 4.136774857721017e-05, "loss": 0.016, "step": 1599 }, { "epoch": 1.2003000750187547, "grad_norm": 0.13619063794612885, "learning_rate": 4.130326206574834e-05, "loss": 0.0164, "step": 1600 }, { "epoch": 1.2003000750187547, "eval_loss": 0.03162379190325737, "eval_runtime": 5.1406, "eval_samples_per_second": 10.505, "eval_steps_per_second": 2.723, "step": 1600 }, { "epoch": 1.2010502625656414, "grad_norm": 0.10646115243434906, "learning_rate": 4.1238790468335685e-05, "loss": 0.0122, "step": 1601 }, { "epoch": 1.2018004501125281, "grad_norm": 0.11590476334095001, "learning_rate": 4.117433389553466e-05, "loss": 0.017, "step": 1602 }, { "epoch": 1.2025506376594148, "grad_norm": 0.20600537955760956, "learning_rate": 4.1109892457881924e-05, "loss": 0.0302, "step": 1603 }, { "epoch": 1.2033008252063015, "grad_norm": 0.10678470134735107, "learning_rate": 4.1045466265888195e-05, "loss": 0.0148, "step": 1604 }, { "epoch": 1.2040510127531883, "grad_norm": 0.12323981523513794, "learning_rate": 4.0981055430038055e-05, "loss": 0.0178, "step": 1605 }, { "epoch": 1.204801200300075, "grad_norm": 0.09657885879278183, "learning_rate": 4.091666006078974e-05, "loss": 0.0119, "step": 1606 }, { "epoch": 1.2055513878469617, "grad_norm": 0.1022537425160408, "learning_rate": 4.085228026857498e-05, "loss": 0.012, "step": 1607 }, { "epoch": 1.2063015753938484, "grad_norm": 0.1730087846517563, "learning_rate": 4.0787916163798743e-05, "loss": 0.0217, "step": 1608 }, { "epoch": 1.2070517629407351, "grad_norm": 0.1198115199804306, "learning_rate": 4.0723567856839184e-05, "loss": 0.0172, "step": 1609 }, { "epoch": 1.2078019504876218, "grad_norm": 0.13980574905872345, "learning_rate": 4.0659235458047264e-05, "loss": 0.0192, "step": 1610 }, { "epoch": 1.2085521380345086, "grad_norm": 0.08618589490652084, "learning_rate": 4.0594919077746734e-05, "loss": 0.0109, "step": 1611 }, { "epoch": 1.2093023255813953, "grad_norm": 0.14428244531154633, "learning_rate": 4.053061882623386e-05, "loss": 0.0215, "step": 1612 }, { "epoch": 1.210052513128282, "grad_norm": 0.10157791525125504, "learning_rate": 4.0466334813777216e-05, "loss": 0.0175, "step": 1613 }, { "epoch": 1.2108027006751687, "grad_norm": 0.0878187045454979, "learning_rate": 4.040206715061758e-05, "loss": 0.0162, "step": 1614 }, { "epoch": 1.2115528882220554, "grad_norm": 0.18254134058952332, "learning_rate": 4.033781594696767e-05, "loss": 0.0277, "step": 1615 }, { "epoch": 1.2123030757689421, "grad_norm": 0.10520658642053604, "learning_rate": 4.027358131301194e-05, "loss": 0.0191, "step": 1616 }, { "epoch": 1.213053263315829, "grad_norm": 0.2057482898235321, "learning_rate": 4.0209363358906495e-05, "loss": 0.015, "step": 1617 }, { "epoch": 1.2138034508627156, "grad_norm": 0.14618700742721558, "learning_rate": 4.014516219477878e-05, "loss": 0.0178, "step": 1618 }, { "epoch": 1.2145536384096025, "grad_norm": 0.08962513506412506, "learning_rate": 4.008097793072749e-05, "loss": 0.0177, "step": 1619 }, { "epoch": 1.215303825956489, "grad_norm": 0.10952772945165634, "learning_rate": 4.00168106768223e-05, "loss": 0.0196, "step": 1620 }, { "epoch": 1.216054013503376, "grad_norm": 0.09128102660179138, "learning_rate": 3.9952660543103734e-05, "loss": 0.0151, "step": 1621 }, { "epoch": 1.2168042010502627, "grad_norm": 0.16668932139873505, "learning_rate": 3.988852763958297e-05, "loss": 0.0185, "step": 1622 }, { "epoch": 1.2175543885971494, "grad_norm": 0.13736192882061005, "learning_rate": 3.9824412076241595e-05, "loss": 0.0163, "step": 1623 }, { "epoch": 1.218304576144036, "grad_norm": 0.1512220799922943, "learning_rate": 3.9760313963031516e-05, "loss": 0.0238, "step": 1624 }, { "epoch": 1.2190547636909228, "grad_norm": 0.08548334985971451, "learning_rate": 3.9696233409874654e-05, "loss": 0.016, "step": 1625 }, { "epoch": 1.2198049512378095, "grad_norm": 0.10499925911426544, "learning_rate": 3.963217052666287e-05, "loss": 0.0154, "step": 1626 }, { "epoch": 1.2205551387846962, "grad_norm": 0.08598527312278748, "learning_rate": 3.956812542325769e-05, "loss": 0.0145, "step": 1627 }, { "epoch": 1.221305326331583, "grad_norm": 0.22342023253440857, "learning_rate": 3.950409820949018e-05, "loss": 0.0288, "step": 1628 }, { "epoch": 1.2220555138784697, "grad_norm": 0.20031727850437164, "learning_rate": 3.9440088995160676e-05, "loss": 0.0262, "step": 1629 }, { "epoch": 1.2228057014253564, "grad_norm": 0.11276500672101974, "learning_rate": 3.937609789003871e-05, "loss": 0.0149, "step": 1630 }, { "epoch": 1.223555888972243, "grad_norm": 0.05976317077875137, "learning_rate": 3.93121250038627e-05, "loss": 0.0109, "step": 1631 }, { "epoch": 1.2243060765191298, "grad_norm": 0.1105174720287323, "learning_rate": 3.924817044633985e-05, "loss": 0.0146, "step": 1632 }, { "epoch": 1.2250562640660165, "grad_norm": 0.16175299882888794, "learning_rate": 3.9184234327145954e-05, "loss": 0.0199, "step": 1633 }, { "epoch": 1.2258064516129032, "grad_norm": 0.14765483140945435, "learning_rate": 3.912031675592512e-05, "loss": 0.0169, "step": 1634 }, { "epoch": 1.22655663915979, "grad_norm": 0.10759598761796951, "learning_rate": 3.905641784228972e-05, "loss": 0.0164, "step": 1635 }, { "epoch": 1.2273068267066767, "grad_norm": 0.0886867418885231, "learning_rate": 3.899253769582008e-05, "loss": 0.0134, "step": 1636 }, { "epoch": 1.2280570142535634, "grad_norm": 0.14302989840507507, "learning_rate": 3.8928676426064376e-05, "loss": 0.016, "step": 1637 }, { "epoch": 1.22880720180045, "grad_norm": 0.15930666029453278, "learning_rate": 3.886483414253838e-05, "loss": 0.0137, "step": 1638 }, { "epoch": 1.2295573893473368, "grad_norm": 0.10902878642082214, "learning_rate": 3.880101095472535e-05, "loss": 0.0149, "step": 1639 }, { "epoch": 1.2303075768942235, "grad_norm": 0.11319921165704727, "learning_rate": 3.873720697207572e-05, "loss": 0.0202, "step": 1640 }, { "epoch": 1.2310577644411103, "grad_norm": 0.08670421689748764, "learning_rate": 3.867342230400707e-05, "loss": 0.0115, "step": 1641 }, { "epoch": 1.231807951987997, "grad_norm": 0.07328299432992935, "learning_rate": 3.860965705990383e-05, "loss": 0.0113, "step": 1642 }, { "epoch": 1.2325581395348837, "grad_norm": 0.10198184847831726, "learning_rate": 3.8545911349117114e-05, "loss": 0.0153, "step": 1643 }, { "epoch": 1.2333083270817704, "grad_norm": 0.1404053270816803, "learning_rate": 3.848218528096452e-05, "loss": 0.0196, "step": 1644 }, { "epoch": 1.2340585146286571, "grad_norm": 0.12451205402612686, "learning_rate": 3.841847896473001e-05, "loss": 0.0157, "step": 1645 }, { "epoch": 1.2348087021755438, "grad_norm": 0.09027544409036636, "learning_rate": 3.83547925096636e-05, "loss": 0.0143, "step": 1646 }, { "epoch": 1.2355588897224306, "grad_norm": 0.15218143165111542, "learning_rate": 3.829112602498132e-05, "loss": 0.0265, "step": 1647 }, { "epoch": 1.2363090772693173, "grad_norm": 0.13812431693077087, "learning_rate": 3.822747961986493e-05, "loss": 0.0179, "step": 1648 }, { "epoch": 1.237059264816204, "grad_norm": 0.07685697078704834, "learning_rate": 3.816385340346171e-05, "loss": 0.0111, "step": 1649 }, { "epoch": 1.2378094523630907, "grad_norm": 0.12320845574140549, "learning_rate": 3.81002474848844e-05, "loss": 0.0163, "step": 1650 }, { "epoch": 1.2385596399099774, "grad_norm": 0.09731096774339676, "learning_rate": 3.803666197321084e-05, "loss": 0.0166, "step": 1651 }, { "epoch": 1.2393098274568641, "grad_norm": 0.1925540417432785, "learning_rate": 3.797309697748396e-05, "loss": 0.0228, "step": 1652 }, { "epoch": 1.2400600150037508, "grad_norm": 0.08675350993871689, "learning_rate": 3.7909552606711454e-05, "loss": 0.012, "step": 1653 }, { "epoch": 1.2408102025506376, "grad_norm": 0.13772478699684143, "learning_rate": 3.784602896986566e-05, "loss": 0.0158, "step": 1654 }, { "epoch": 1.2415603900975243, "grad_norm": 0.12192375957965851, "learning_rate": 3.778252617588334e-05, "loss": 0.0109, "step": 1655 }, { "epoch": 1.2423105776444112, "grad_norm": 0.1430065929889679, "learning_rate": 3.771904433366557e-05, "loss": 0.0238, "step": 1656 }, { "epoch": 1.2430607651912977, "grad_norm": 0.09612610936164856, "learning_rate": 3.7655583552077446e-05, "loss": 0.0168, "step": 1657 }, { "epoch": 1.2438109527381846, "grad_norm": 0.10175983607769012, "learning_rate": 3.7592143939947955e-05, "loss": 0.0121, "step": 1658 }, { "epoch": 1.2445611402850711, "grad_norm": 0.09988542646169662, "learning_rate": 3.7528725606069774e-05, "loss": 0.0156, "step": 1659 }, { "epoch": 1.245311327831958, "grad_norm": 0.3816523849964142, "learning_rate": 3.746532865919913e-05, "loss": 0.038, "step": 1660 }, { "epoch": 1.2460615153788448, "grad_norm": 0.1465483009815216, "learning_rate": 3.740195320805551e-05, "loss": 0.0234, "step": 1661 }, { "epoch": 1.2468117029257315, "grad_norm": 0.1917344033718109, "learning_rate": 3.733859936132158e-05, "loss": 0.0182, "step": 1662 }, { "epoch": 1.2475618904726182, "grad_norm": 0.1488763391971588, "learning_rate": 3.727526722764297e-05, "loss": 0.0208, "step": 1663 }, { "epoch": 1.248312078019505, "grad_norm": 0.11044822633266449, "learning_rate": 3.7211956915628035e-05, "loss": 0.0175, "step": 1664 }, { "epoch": 1.2490622655663917, "grad_norm": 0.10706579685211182, "learning_rate": 3.7148668533847744e-05, "loss": 0.0158, "step": 1665 }, { "epoch": 1.2498124531132784, "grad_norm": 0.10008851438760757, "learning_rate": 3.7085402190835406e-05, "loss": 0.0148, "step": 1666 }, { "epoch": 1.250562640660165, "grad_norm": 0.10831057280302048, "learning_rate": 3.702215799508659e-05, "loss": 0.0175, "step": 1667 }, { "epoch": 1.2513128282070518, "grad_norm": 0.10080386698246002, "learning_rate": 3.695893605505887e-05, "loss": 0.0142, "step": 1668 }, { "epoch": 1.2520630157539385, "grad_norm": 0.10447536408901215, "learning_rate": 3.689573647917162e-05, "loss": 0.0181, "step": 1669 }, { "epoch": 1.2528132033008252, "grad_norm": 0.1283860206604004, "learning_rate": 3.683255937580592e-05, "loss": 0.0165, "step": 1670 }, { "epoch": 1.253563390847712, "grad_norm": 0.14649006724357605, "learning_rate": 3.6769404853304276e-05, "loss": 0.0125, "step": 1671 }, { "epoch": 1.2543135783945987, "grad_norm": 0.09096325188875198, "learning_rate": 3.670627301997047e-05, "loss": 0.0175, "step": 1672 }, { "epoch": 1.2550637659414854, "grad_norm": 0.11983542889356613, "learning_rate": 3.664316398406939e-05, "loss": 0.0154, "step": 1673 }, { "epoch": 1.255813953488372, "grad_norm": 0.077834352850914, "learning_rate": 3.658007785382679e-05, "loss": 0.0104, "step": 1674 }, { "epoch": 1.2565641410352588, "grad_norm": 0.12809742987155914, "learning_rate": 3.65170147374292e-05, "loss": 0.0214, "step": 1675 }, { "epoch": 1.2573143285821455, "grad_norm": 0.11058270931243896, "learning_rate": 3.645397474302363e-05, "loss": 0.015, "step": 1676 }, { "epoch": 1.2580645161290323, "grad_norm": 0.1564393937587738, "learning_rate": 3.639095797871748e-05, "loss": 0.0179, "step": 1677 }, { "epoch": 1.258814703675919, "grad_norm": 0.11511152237653732, "learning_rate": 3.63279645525783e-05, "loss": 0.0194, "step": 1678 }, { "epoch": 1.2595648912228057, "grad_norm": 0.05084642395377159, "learning_rate": 3.626499457263359e-05, "loss": 0.0075, "step": 1679 }, { "epoch": 1.2603150787696924, "grad_norm": 0.16043655574321747, "learning_rate": 3.620204814687069e-05, "loss": 0.0181, "step": 1680 }, { "epoch": 1.2610652663165791, "grad_norm": 0.09519901871681213, "learning_rate": 3.61391253832365e-05, "loss": 0.0131, "step": 1681 }, { "epoch": 1.2618154538634658, "grad_norm": 0.0991373062133789, "learning_rate": 3.607622638963739e-05, "loss": 0.0153, "step": 1682 }, { "epoch": 1.2625656414103525, "grad_norm": 0.15621377527713776, "learning_rate": 3.601335127393889e-05, "loss": 0.0159, "step": 1683 }, { "epoch": 1.2633158289572393, "grad_norm": 0.16243353486061096, "learning_rate": 3.59505001439657e-05, "loss": 0.0229, "step": 1684 }, { "epoch": 1.264066016504126, "grad_norm": 0.169635608792305, "learning_rate": 3.588767310750127e-05, "loss": 0.0233, "step": 1685 }, { "epoch": 1.2648162040510127, "grad_norm": 0.13255098462104797, "learning_rate": 3.5824870272287815e-05, "loss": 0.0227, "step": 1686 }, { "epoch": 1.2655663915978994, "grad_norm": 0.15259072184562683, "learning_rate": 3.576209174602597e-05, "loss": 0.0198, "step": 1687 }, { "epoch": 1.2663165791447861, "grad_norm": 0.15271423757076263, "learning_rate": 3.569933763637477e-05, "loss": 0.0186, "step": 1688 }, { "epoch": 1.2670667666916728, "grad_norm": 0.0755571573972702, "learning_rate": 3.56366080509513e-05, "loss": 0.0114, "step": 1689 }, { "epoch": 1.2678169542385596, "grad_norm": 0.14046405255794525, "learning_rate": 3.557390309733065e-05, "loss": 0.0191, "step": 1690 }, { "epoch": 1.2685671417854465, "grad_norm": 0.10311423987150192, "learning_rate": 3.551122288304561e-05, "loss": 0.0175, "step": 1691 }, { "epoch": 1.269317329332333, "grad_norm": 0.12403605133295059, "learning_rate": 3.544856751558659e-05, "loss": 0.0264, "step": 1692 }, { "epoch": 1.27006751687922, "grad_norm": 0.088347427546978, "learning_rate": 3.538593710240139e-05, "loss": 0.0105, "step": 1693 }, { "epoch": 1.2708177044261064, "grad_norm": 0.1014111265540123, "learning_rate": 3.532333175089498e-05, "loss": 0.0131, "step": 1694 }, { "epoch": 1.2715678919729934, "grad_norm": 0.1430293172597885, "learning_rate": 3.526075156842938e-05, "loss": 0.0187, "step": 1695 }, { "epoch": 1.2723180795198799, "grad_norm": 0.13322706520557404, "learning_rate": 3.519819666232345e-05, "loss": 0.0152, "step": 1696 }, { "epoch": 1.2730682670667668, "grad_norm": 0.09196791052818298, "learning_rate": 3.5135667139852654e-05, "loss": 0.0143, "step": 1697 }, { "epoch": 1.2738184546136533, "grad_norm": 0.11764800548553467, "learning_rate": 3.507316310824902e-05, "loss": 0.0174, "step": 1698 }, { "epoch": 1.2745686421605402, "grad_norm": 0.0816640555858612, "learning_rate": 3.50106846747008e-05, "loss": 0.0118, "step": 1699 }, { "epoch": 1.275318829707427, "grad_norm": 0.1807868927717209, "learning_rate": 3.4948231946352314e-05, "loss": 0.0217, "step": 1700 }, { "epoch": 1.2760690172543137, "grad_norm": 0.08453043550252914, "learning_rate": 3.488580503030389e-05, "loss": 0.0161, "step": 1701 }, { "epoch": 1.2768192048012004, "grad_norm": 0.11556815356016159, "learning_rate": 3.482340403361151e-05, "loss": 0.0186, "step": 1702 }, { "epoch": 1.277569392348087, "grad_norm": 0.0988595187664032, "learning_rate": 3.4761029063286745e-05, "loss": 0.015, "step": 1703 }, { "epoch": 1.2783195798949738, "grad_norm": 0.11000659316778183, "learning_rate": 3.4698680226296526e-05, "loss": 0.0134, "step": 1704 }, { "epoch": 1.2790697674418605, "grad_norm": 0.16800323128700256, "learning_rate": 3.4636357629562986e-05, "loss": 0.0298, "step": 1705 }, { "epoch": 1.2798199549887472, "grad_norm": 0.1235019639134407, "learning_rate": 3.457406137996321e-05, "loss": 0.0222, "step": 1706 }, { "epoch": 1.280570142535634, "grad_norm": 0.09599635750055313, "learning_rate": 3.4511791584329154e-05, "loss": 0.0165, "step": 1707 }, { "epoch": 1.2813203300825207, "grad_norm": 0.10348288714885712, "learning_rate": 3.4449548349447394e-05, "loss": 0.0161, "step": 1708 }, { "epoch": 1.2820705176294074, "grad_norm": 0.11185697466135025, "learning_rate": 3.438733178205892e-05, "loss": 0.0107, "step": 1709 }, { "epoch": 1.282820705176294, "grad_norm": 0.13753622770309448, "learning_rate": 3.4325141988859046e-05, "loss": 0.0243, "step": 1710 }, { "epoch": 1.2835708927231808, "grad_norm": 0.07685783505439758, "learning_rate": 3.426297907649711e-05, "loss": 0.0114, "step": 1711 }, { "epoch": 1.2843210802700675, "grad_norm": 0.142874076962471, "learning_rate": 3.4200843151576414e-05, "loss": 0.0253, "step": 1712 }, { "epoch": 1.2850712678169542, "grad_norm": 0.12762536108493805, "learning_rate": 3.413873432065394e-05, "loss": 0.0266, "step": 1713 }, { "epoch": 1.285821455363841, "grad_norm": 0.07396502792835236, "learning_rate": 3.407665269024024e-05, "loss": 0.011, "step": 1714 }, { "epoch": 1.2865716429107277, "grad_norm": 0.33753538131713867, "learning_rate": 3.401459836679917e-05, "loss": 0.0264, "step": 1715 }, { "epoch": 1.2873218304576144, "grad_norm": 0.10064946115016937, "learning_rate": 3.39525714567478e-05, "loss": 0.0182, "step": 1716 }, { "epoch": 1.288072018004501, "grad_norm": 0.16794368624687195, "learning_rate": 3.389057206645614e-05, "loss": 0.0284, "step": 1717 }, { "epoch": 1.2888222055513878, "grad_norm": 0.15257765352725983, "learning_rate": 3.382860030224708e-05, "loss": 0.023, "step": 1718 }, { "epoch": 1.2895723930982745, "grad_norm": 0.12148189544677734, "learning_rate": 3.3766656270396074e-05, "loss": 0.0232, "step": 1719 }, { "epoch": 1.2903225806451613, "grad_norm": 0.09580284357070923, "learning_rate": 3.3704740077131036e-05, "loss": 0.0144, "step": 1720 }, { "epoch": 1.291072768192048, "grad_norm": 0.08760684728622437, "learning_rate": 3.3642851828632155e-05, "loss": 0.0126, "step": 1721 }, { "epoch": 1.2918229557389347, "grad_norm": 0.11275685578584671, "learning_rate": 3.3580991631031656e-05, "loss": 0.0157, "step": 1722 }, { "epoch": 1.2925731432858214, "grad_norm": 0.09197841584682465, "learning_rate": 3.3519159590413715e-05, "loss": 0.0127, "step": 1723 }, { "epoch": 1.2933233308327081, "grad_norm": 0.11137830466032028, "learning_rate": 3.345735581281417e-05, "loss": 0.019, "step": 1724 }, { "epoch": 1.2940735183795948, "grad_norm": 0.09153483808040619, "learning_rate": 3.339558040422042e-05, "loss": 0.0128, "step": 1725 }, { "epoch": 1.2948237059264815, "grad_norm": 0.10245629400014877, "learning_rate": 3.333383347057123e-05, "loss": 0.0195, "step": 1726 }, { "epoch": 1.2955738934733683, "grad_norm": 0.18266040086746216, "learning_rate": 3.3272115117756476e-05, "loss": 0.0235, "step": 1727 }, { "epoch": 1.296324081020255, "grad_norm": 0.15308219194412231, "learning_rate": 3.3210425451617074e-05, "loss": 0.0152, "step": 1728 }, { "epoch": 1.2970742685671417, "grad_norm": 0.0983470156788826, "learning_rate": 3.314876457794474e-05, "loss": 0.0138, "step": 1729 }, { "epoch": 1.2978244561140286, "grad_norm": 0.1688724309206009, "learning_rate": 3.3087132602481774e-05, "loss": 0.0165, "step": 1730 }, { "epoch": 1.2985746436609151, "grad_norm": 0.08947484195232391, "learning_rate": 3.302552963092096e-05, "loss": 0.0138, "step": 1731 }, { "epoch": 1.299324831207802, "grad_norm": 0.14829127490520477, "learning_rate": 3.296395576890532e-05, "loss": 0.0171, "step": 1732 }, { "epoch": 1.3000750187546886, "grad_norm": 0.12239532917737961, "learning_rate": 3.290241112202797e-05, "loss": 0.0201, "step": 1733 }, { "epoch": 1.3008252063015755, "grad_norm": 0.16367493569850922, "learning_rate": 3.284089579583192e-05, "loss": 0.0195, "step": 1734 }, { "epoch": 1.301575393848462, "grad_norm": 0.11469226330518723, "learning_rate": 3.2779409895809886e-05, "loss": 0.0161, "step": 1735 }, { "epoch": 1.302325581395349, "grad_norm": 0.10691020637750626, "learning_rate": 3.2717953527404155e-05, "loss": 0.0128, "step": 1736 }, { "epoch": 1.3030757689422354, "grad_norm": 0.1380341500043869, "learning_rate": 3.265652679600631e-05, "loss": 0.022, "step": 1737 }, { "epoch": 1.3038259564891224, "grad_norm": 0.10156739503145218, "learning_rate": 3.25951298069572e-05, "loss": 0.0173, "step": 1738 }, { "epoch": 1.304576144036009, "grad_norm": 0.0986672043800354, "learning_rate": 3.253376266554655e-05, "loss": 0.016, "step": 1739 }, { "epoch": 1.3053263315828958, "grad_norm": 0.14217688143253326, "learning_rate": 3.247242547701301e-05, "loss": 0.02, "step": 1740 }, { "epoch": 1.3060765191297825, "grad_norm": 0.15207204222679138, "learning_rate": 3.241111834654382e-05, "loss": 0.0253, "step": 1741 }, { "epoch": 1.3068267066766692, "grad_norm": 0.13661247491836548, "learning_rate": 3.234984137927464e-05, "loss": 0.0193, "step": 1742 }, { "epoch": 1.307576894223556, "grad_norm": 0.10947258770465851, "learning_rate": 3.228859468028946e-05, "loss": 0.0199, "step": 1743 }, { "epoch": 1.3083270817704427, "grad_norm": 0.2023872286081314, "learning_rate": 3.222737835462034e-05, "loss": 0.027, "step": 1744 }, { "epoch": 1.3090772693173294, "grad_norm": 0.0802428126335144, "learning_rate": 3.216619250724724e-05, "loss": 0.012, "step": 1745 }, { "epoch": 1.309827456864216, "grad_norm": 0.11808689683675766, "learning_rate": 3.2105037243097866e-05, "loss": 0.0168, "step": 1746 }, { "epoch": 1.3105776444111028, "grad_norm": 0.18175119161605835, "learning_rate": 3.2043912667047465e-05, "loss": 0.0261, "step": 1747 }, { "epoch": 1.3113278319579895, "grad_norm": 0.08211642503738403, "learning_rate": 3.198281888391869e-05, "loss": 0.0104, "step": 1748 }, { "epoch": 1.3120780195048762, "grad_norm": 0.14124037325382233, "learning_rate": 3.192175599848133e-05, "loss": 0.0222, "step": 1749 }, { "epoch": 1.312828207051763, "grad_norm": 0.11715236306190491, "learning_rate": 3.1860724115452234e-05, "loss": 0.014, "step": 1750 }, { "epoch": 1.3135783945986497, "grad_norm": 0.07071872800588608, "learning_rate": 3.179972333949509e-05, "loss": 0.012, "step": 1751 }, { "epoch": 1.3143285821455364, "grad_norm": 0.0900421068072319, "learning_rate": 3.173875377522019e-05, "loss": 0.0142, "step": 1752 }, { "epoch": 1.315078769692423, "grad_norm": 0.18912558257579803, "learning_rate": 3.167781552718435e-05, "loss": 0.0248, "step": 1753 }, { "epoch": 1.3158289572393098, "grad_norm": 0.09656878560781479, "learning_rate": 3.161690869989068e-05, "loss": 0.0149, "step": 1754 }, { "epoch": 1.3165791447861965, "grad_norm": 0.10428894311189651, "learning_rate": 3.155603339778837e-05, "loss": 0.0132, "step": 1755 }, { "epoch": 1.3173293323330832, "grad_norm": 0.15280795097351074, "learning_rate": 3.149518972527257e-05, "loss": 0.0189, "step": 1756 }, { "epoch": 1.31807951987997, "grad_norm": 0.1433146446943283, "learning_rate": 3.1434377786684197e-05, "loss": 0.0233, "step": 1757 }, { "epoch": 1.3188297074268567, "grad_norm": 0.10560217499732971, "learning_rate": 3.137359768630972e-05, "loss": 0.0184, "step": 1758 }, { "epoch": 1.3195798949737434, "grad_norm": 0.14866486191749573, "learning_rate": 3.131284952838106e-05, "loss": 0.0131, "step": 1759 }, { "epoch": 1.32033008252063, "grad_norm": 0.11577482521533966, "learning_rate": 3.125213341707528e-05, "loss": 0.0203, "step": 1760 }, { "epoch": 1.3210802700675168, "grad_norm": 0.13666844367980957, "learning_rate": 3.1191449456514575e-05, "loss": 0.0202, "step": 1761 }, { "epoch": 1.3218304576144035, "grad_norm": 0.09610434621572495, "learning_rate": 3.113079775076593e-05, "loss": 0.0126, "step": 1762 }, { "epoch": 1.3225806451612903, "grad_norm": 0.13390542566776276, "learning_rate": 3.107017840384107e-05, "loss": 0.0155, "step": 1763 }, { "epoch": 1.323330832708177, "grad_norm": 0.09720296412706375, "learning_rate": 3.100959151969619e-05, "loss": 0.0157, "step": 1764 }, { "epoch": 1.3240810202550637, "grad_norm": 0.16918721795082092, "learning_rate": 3.0949037202231826e-05, "loss": 0.024, "step": 1765 }, { "epoch": 1.3248312078019504, "grad_norm": 0.09024802595376968, "learning_rate": 3.08885155552927e-05, "loss": 0.0121, "step": 1766 }, { "epoch": 1.3255813953488373, "grad_norm": 0.0722404420375824, "learning_rate": 3.082802668266743e-05, "loss": 0.01, "step": 1767 }, { "epoch": 1.3263315828957238, "grad_norm": 0.11001674830913544, "learning_rate": 3.076757068808852e-05, "loss": 0.0154, "step": 1768 }, { "epoch": 1.3270817704426108, "grad_norm": 0.2680615782737732, "learning_rate": 3.070714767523203e-05, "loss": 0.0486, "step": 1769 }, { "epoch": 1.3278319579894973, "grad_norm": 0.11515659838914871, "learning_rate": 3.0646757747717475e-05, "loss": 0.0131, "step": 1770 }, { "epoch": 1.3285821455363842, "grad_norm": 0.11640623956918716, "learning_rate": 3.0586401009107636e-05, "loss": 0.0153, "step": 1771 }, { "epoch": 1.3293323330832707, "grad_norm": 0.22977732121944427, "learning_rate": 3.0526077562908386e-05, "loss": 0.0315, "step": 1772 }, { "epoch": 1.3300825206301576, "grad_norm": 0.09609303623437881, "learning_rate": 3.0465787512568466e-05, "loss": 0.0137, "step": 1773 }, { "epoch": 1.3308327081770441, "grad_norm": 0.10839638113975525, "learning_rate": 3.040553096147942e-05, "loss": 0.0203, "step": 1774 }, { "epoch": 1.331582895723931, "grad_norm": 0.12790155410766602, "learning_rate": 3.0345308012975255e-05, "loss": 0.0195, "step": 1775 }, { "epoch": 1.3323330832708178, "grad_norm": 0.08059082180261612, "learning_rate": 3.0285118770332428e-05, "loss": 0.0101, "step": 1776 }, { "epoch": 1.3330832708177045, "grad_norm": 0.1413569152355194, "learning_rate": 3.022496333676954e-05, "loss": 0.0194, "step": 1777 }, { "epoch": 1.3338334583645912, "grad_norm": 0.10592398792505264, "learning_rate": 3.0164841815447263e-05, "loss": 0.0156, "step": 1778 }, { "epoch": 1.334583645911478, "grad_norm": 0.15778115391731262, "learning_rate": 3.0104754309468066e-05, "loss": 0.0194, "step": 1779 }, { "epoch": 1.3353338334583646, "grad_norm": 0.2557612359523773, "learning_rate": 3.00447009218761e-05, "loss": 0.03, "step": 1780 }, { "epoch": 1.3360840210052514, "grad_norm": 0.1358753889799118, "learning_rate": 2.9984681755657017e-05, "loss": 0.0197, "step": 1781 }, { "epoch": 1.336834208552138, "grad_norm": 0.11550354212522507, "learning_rate": 2.9924696913737792e-05, "loss": 0.0217, "step": 1782 }, { "epoch": 1.3375843960990248, "grad_norm": 0.15435385704040527, "learning_rate": 2.986474649898651e-05, "loss": 0.021, "step": 1783 }, { "epoch": 1.3383345836459115, "grad_norm": 0.08404883742332458, "learning_rate": 2.9804830614212242e-05, "loss": 0.0146, "step": 1784 }, { "epoch": 1.3390847711927982, "grad_norm": 0.122297503054142, "learning_rate": 2.9744949362164798e-05, "loss": 0.0157, "step": 1785 }, { "epoch": 1.339834958739685, "grad_norm": 0.08815350383520126, "learning_rate": 2.9685102845534658e-05, "loss": 0.0126, "step": 1786 }, { "epoch": 1.3405851462865717, "grad_norm": 0.1538134217262268, "learning_rate": 2.9625291166952702e-05, "loss": 0.0319, "step": 1787 }, { "epoch": 1.3413353338334584, "grad_norm": 0.12486133724451065, "learning_rate": 2.956551442899005e-05, "loss": 0.0216, "step": 1788 }, { "epoch": 1.342085521380345, "grad_norm": 0.12948335707187653, "learning_rate": 2.9505772734157948e-05, "loss": 0.0187, "step": 1789 }, { "epoch": 1.3428357089272318, "grad_norm": 0.09788129478693008, "learning_rate": 2.9446066184907495e-05, "loss": 0.0125, "step": 1790 }, { "epoch": 1.3435858964741185, "grad_norm": 0.1548995077610016, "learning_rate": 2.9386394883629565e-05, "loss": 0.0227, "step": 1791 }, { "epoch": 1.3443360840210052, "grad_norm": 0.11524860560894012, "learning_rate": 2.932675893265454e-05, "loss": 0.0174, "step": 1792 }, { "epoch": 1.345086271567892, "grad_norm": 0.1371203064918518, "learning_rate": 2.926715843425223e-05, "loss": 0.0247, "step": 1793 }, { "epoch": 1.3458364591147787, "grad_norm": 0.10300806164741516, "learning_rate": 2.9207593490631592e-05, "loss": 0.0173, "step": 1794 }, { "epoch": 1.3465866466616654, "grad_norm": 0.17150884866714478, "learning_rate": 2.914806420394064e-05, "loss": 0.0284, "step": 1795 }, { "epoch": 1.347336834208552, "grad_norm": 0.13951390981674194, "learning_rate": 2.908857067626629e-05, "loss": 0.0158, "step": 1796 }, { "epoch": 1.3480870217554388, "grad_norm": 0.12770432233810425, "learning_rate": 2.902911300963403e-05, "loss": 0.0231, "step": 1797 }, { "epoch": 1.3488372093023255, "grad_norm": 0.09566380083560944, "learning_rate": 2.8969691306007918e-05, "loss": 0.0135, "step": 1798 }, { "epoch": 1.3495873968492123, "grad_norm": 0.09345205873250961, "learning_rate": 2.891030566729032e-05, "loss": 0.018, "step": 1799 }, { "epoch": 1.350337584396099, "grad_norm": 0.11738814413547516, "learning_rate": 2.8850956195321795e-05, "loss": 0.0178, "step": 1800 }, { "epoch": 1.350337584396099, "eval_loss": 0.0302781630307436, "eval_runtime": 5.1365, "eval_samples_per_second": 10.513, "eval_steps_per_second": 2.726, "step": 1800 }, { "epoch": 1.3510877719429857, "grad_norm": 0.16350629925727844, "learning_rate": 2.8791642991880784e-05, "loss": 0.0222, "step": 1801 }, { "epoch": 1.3518379594898724, "grad_norm": 0.09744306653738022, "learning_rate": 2.873236615868362e-05, "loss": 0.0184, "step": 1802 }, { "epoch": 1.3525881470367591, "grad_norm": 0.14381620287895203, "learning_rate": 2.8673125797384243e-05, "loss": 0.0166, "step": 1803 }, { "epoch": 1.3533383345836458, "grad_norm": 0.09115853160619736, "learning_rate": 2.8613922009574024e-05, "loss": 0.0122, "step": 1804 }, { "epoch": 1.3540885221305325, "grad_norm": 0.1326061189174652, "learning_rate": 2.8554754896781656e-05, "loss": 0.0187, "step": 1805 }, { "epoch": 1.3548387096774195, "grad_norm": 0.0877179428935051, "learning_rate": 2.8495624560472866e-05, "loss": 0.0111, "step": 1806 }, { "epoch": 1.355588897224306, "grad_norm": 0.13962623476982117, "learning_rate": 2.843653110205039e-05, "loss": 0.0191, "step": 1807 }, { "epoch": 1.356339084771193, "grad_norm": 0.11053589731454849, "learning_rate": 2.8377474622853683e-05, "loss": 0.0168, "step": 1808 }, { "epoch": 1.3570892723180794, "grad_norm": 0.09899614006280899, "learning_rate": 2.8318455224158786e-05, "loss": 0.0142, "step": 1809 }, { "epoch": 1.3578394598649663, "grad_norm": 0.11597689986228943, "learning_rate": 2.8259473007178163e-05, "loss": 0.0149, "step": 1810 }, { "epoch": 1.3585896474118528, "grad_norm": 0.08869005739688873, "learning_rate": 2.8200528073060507e-05, "loss": 0.0128, "step": 1811 }, { "epoch": 1.3593398349587398, "grad_norm": 0.12468276917934418, "learning_rate": 2.814162052289058e-05, "loss": 0.0171, "step": 1812 }, { "epoch": 1.3600900225056263, "grad_norm": 0.06973183900117874, "learning_rate": 2.8082750457689033e-05, "loss": 0.0079, "step": 1813 }, { "epoch": 1.3608402100525132, "grad_norm": 0.3414054811000824, "learning_rate": 2.8023917978412207e-05, "loss": 0.0179, "step": 1814 }, { "epoch": 1.3615903975994, "grad_norm": 0.10484994947910309, "learning_rate": 2.7965123185952023e-05, "loss": 0.0163, "step": 1815 }, { "epoch": 1.3623405851462866, "grad_norm": 0.1568952351808548, "learning_rate": 2.7906366181135775e-05, "loss": 0.0205, "step": 1816 }, { "epoch": 1.3630907726931734, "grad_norm": 0.14470714330673218, "learning_rate": 2.7847647064725924e-05, "loss": 0.0161, "step": 1817 }, { "epoch": 1.36384096024006, "grad_norm": 0.08876904845237732, "learning_rate": 2.778896593741999e-05, "loss": 0.0091, "step": 1818 }, { "epoch": 1.3645911477869468, "grad_norm": 0.07300567626953125, "learning_rate": 2.77303228998503e-05, "loss": 0.0091, "step": 1819 }, { "epoch": 1.3653413353338335, "grad_norm": 0.20413988828659058, "learning_rate": 2.7671718052583908e-05, "loss": 0.0339, "step": 1820 }, { "epoch": 1.3660915228807202, "grad_norm": 0.1549023687839508, "learning_rate": 2.7613151496122347e-05, "loss": 0.0264, "step": 1821 }, { "epoch": 1.366841710427607, "grad_norm": 0.1032576784491539, "learning_rate": 2.7554623330901524e-05, "loss": 0.0138, "step": 1822 }, { "epoch": 1.3675918979744937, "grad_norm": 0.19302760064601898, "learning_rate": 2.749613365729141e-05, "loss": 0.0302, "step": 1823 }, { "epoch": 1.3683420855213804, "grad_norm": 0.13430748879909515, "learning_rate": 2.7437682575596104e-05, "loss": 0.0225, "step": 1824 }, { "epoch": 1.369092273068267, "grad_norm": 0.11612657457590103, "learning_rate": 2.7379270186053428e-05, "loss": 0.0167, "step": 1825 }, { "epoch": 1.3698424606151538, "grad_norm": 0.09877978265285492, "learning_rate": 2.7320896588834903e-05, "loss": 0.0144, "step": 1826 }, { "epoch": 1.3705926481620405, "grad_norm": 0.12179384380578995, "learning_rate": 2.7262561884045457e-05, "loss": 0.0138, "step": 1827 }, { "epoch": 1.3713428357089272, "grad_norm": 0.08559141308069229, "learning_rate": 2.720426617172339e-05, "loss": 0.0124, "step": 1828 }, { "epoch": 1.372093023255814, "grad_norm": 0.16383309662342072, "learning_rate": 2.71460095518401e-05, "loss": 0.0227, "step": 1829 }, { "epoch": 1.3728432108027007, "grad_norm": 0.14575624465942383, "learning_rate": 2.708779212429996e-05, "loss": 0.0218, "step": 1830 }, { "epoch": 1.3735933983495874, "grad_norm": 0.08648881316184998, "learning_rate": 2.702961398894014e-05, "loss": 0.013, "step": 1831 }, { "epoch": 1.374343585896474, "grad_norm": 0.21686382591724396, "learning_rate": 2.6971475245530375e-05, "loss": 0.0299, "step": 1832 }, { "epoch": 1.3750937734433608, "grad_norm": 0.13174933195114136, "learning_rate": 2.6913375993772915e-05, "loss": 0.0183, "step": 1833 }, { "epoch": 1.3758439609902475, "grad_norm": 0.12637682259082794, "learning_rate": 2.6855316333302237e-05, "loss": 0.0179, "step": 1834 }, { "epoch": 1.3765941485371342, "grad_norm": 0.1379723697900772, "learning_rate": 2.6797296363684977e-05, "loss": 0.0157, "step": 1835 }, { "epoch": 1.377344336084021, "grad_norm": 0.1410985141992569, "learning_rate": 2.6739316184419622e-05, "loss": 0.0162, "step": 1836 }, { "epoch": 1.3780945236309077, "grad_norm": 0.12312553822994232, "learning_rate": 2.6681375894936472e-05, "loss": 0.016, "step": 1837 }, { "epoch": 1.3788447111777944, "grad_norm": 0.1267862617969513, "learning_rate": 2.662347559459746e-05, "loss": 0.0192, "step": 1838 }, { "epoch": 1.379594898724681, "grad_norm": 0.07263024896383286, "learning_rate": 2.6565615382695896e-05, "loss": 0.0117, "step": 1839 }, { "epoch": 1.3803450862715678, "grad_norm": 0.0848407968878746, "learning_rate": 2.6507795358456307e-05, "loss": 0.0157, "step": 1840 }, { "epoch": 1.3810952738184545, "grad_norm": 0.09617772698402405, "learning_rate": 2.6450015621034362e-05, "loss": 0.0133, "step": 1841 }, { "epoch": 1.3818454613653413, "grad_norm": 0.09032897651195526, "learning_rate": 2.6392276269516613e-05, "loss": 0.0114, "step": 1842 }, { "epoch": 1.3825956489122282, "grad_norm": 0.1260712742805481, "learning_rate": 2.63345774029204e-05, "loss": 0.0219, "step": 1843 }, { "epoch": 1.3833458364591147, "grad_norm": 0.1832832247018814, "learning_rate": 2.6276919120193543e-05, "loss": 0.0275, "step": 1844 }, { "epoch": 1.3840960240060016, "grad_norm": 0.1576533019542694, "learning_rate": 2.621930152021434e-05, "loss": 0.0249, "step": 1845 }, { "epoch": 1.3848462115528881, "grad_norm": 0.15020745992660522, "learning_rate": 2.6161724701791306e-05, "loss": 0.017, "step": 1846 }, { "epoch": 1.385596399099775, "grad_norm": 0.11638973653316498, "learning_rate": 2.6104188763663018e-05, "loss": 0.0142, "step": 1847 }, { "epoch": 1.3863465866466615, "grad_norm": 0.12805978953838348, "learning_rate": 2.604669380449795e-05, "loss": 0.0151, "step": 1848 }, { "epoch": 1.3870967741935485, "grad_norm": 0.12945251166820526, "learning_rate": 2.598923992289427e-05, "loss": 0.0153, "step": 1849 }, { "epoch": 1.387846961740435, "grad_norm": 0.07249733805656433, "learning_rate": 2.5931827217379746e-05, "loss": 0.0098, "step": 1850 }, { "epoch": 1.388597149287322, "grad_norm": 0.08274883031845093, "learning_rate": 2.5874455786411505e-05, "loss": 0.0126, "step": 1851 }, { "epoch": 1.3893473368342086, "grad_norm": 0.0918225646018982, "learning_rate": 2.5817125728375912e-05, "loss": 0.0118, "step": 1852 }, { "epoch": 1.3900975243810954, "grad_norm": 0.08083860576152802, "learning_rate": 2.5759837141588362e-05, "loss": 0.011, "step": 1853 }, { "epoch": 1.390847711927982, "grad_norm": 0.2597929537296295, "learning_rate": 2.5702590124293147e-05, "loss": 0.0213, "step": 1854 }, { "epoch": 1.3915978994748688, "grad_norm": 0.11949342489242554, "learning_rate": 2.5645384774663262e-05, "loss": 0.0195, "step": 1855 }, { "epoch": 1.3923480870217555, "grad_norm": 0.054844655096530914, "learning_rate": 2.5588221190800264e-05, "loss": 0.0106, "step": 1856 }, { "epoch": 1.3930982745686422, "grad_norm": 0.2637576460838318, "learning_rate": 2.5531099470734038e-05, "loss": 0.0293, "step": 1857 }, { "epoch": 1.393848462115529, "grad_norm": 0.17199398577213287, "learning_rate": 2.5474019712422724e-05, "loss": 0.0171, "step": 1858 }, { "epoch": 1.3945986496624156, "grad_norm": 0.16428735852241516, "learning_rate": 2.541698201375249e-05, "loss": 0.0173, "step": 1859 }, { "epoch": 1.3953488372093024, "grad_norm": 0.10029322654008865, "learning_rate": 2.5359986472537373e-05, "loss": 0.0118, "step": 1860 }, { "epoch": 1.396099024756189, "grad_norm": 0.10694640129804611, "learning_rate": 2.530303318651913e-05, "loss": 0.0144, "step": 1861 }, { "epoch": 1.3968492123030758, "grad_norm": 0.12824860215187073, "learning_rate": 2.5246122253366998e-05, "loss": 0.0137, "step": 1862 }, { "epoch": 1.3975993998499625, "grad_norm": 0.10325340926647186, "learning_rate": 2.5189253770677644e-05, "loss": 0.0112, "step": 1863 }, { "epoch": 1.3983495873968492, "grad_norm": 0.09274782985448837, "learning_rate": 2.5132427835974926e-05, "loss": 0.0129, "step": 1864 }, { "epoch": 1.399099774943736, "grad_norm": 0.06569540500640869, "learning_rate": 2.507564454670971e-05, "loss": 0.0068, "step": 1865 }, { "epoch": 1.3998499624906227, "grad_norm": 0.07178516685962677, "learning_rate": 2.5018904000259757e-05, "loss": 0.0126, "step": 1866 }, { "epoch": 1.4006001500375094, "grad_norm": 0.09361020475625992, "learning_rate": 2.4962206293929512e-05, "loss": 0.0111, "step": 1867 }, { "epoch": 1.401350337584396, "grad_norm": 0.09147719293832779, "learning_rate": 2.490555152494996e-05, "loss": 0.0093, "step": 1868 }, { "epoch": 1.4021005251312828, "grad_norm": 0.12089736014604568, "learning_rate": 2.4848939790478463e-05, "loss": 0.012, "step": 1869 }, { "epoch": 1.4028507126781695, "grad_norm": 0.07719265669584274, "learning_rate": 2.4792371187598544e-05, "loss": 0.0102, "step": 1870 }, { "epoch": 1.4036009002250562, "grad_norm": 0.07259789109230042, "learning_rate": 2.4735845813319804e-05, "loss": 0.0105, "step": 1871 }, { "epoch": 1.404351087771943, "grad_norm": 0.1539996713399887, "learning_rate": 2.4679363764577683e-05, "loss": 0.0188, "step": 1872 }, { "epoch": 1.4051012753188297, "grad_norm": 0.10606228560209274, "learning_rate": 2.462292513823336e-05, "loss": 0.0168, "step": 1873 }, { "epoch": 1.4058514628657164, "grad_norm": 0.09349866211414337, "learning_rate": 2.4566530031073486e-05, "loss": 0.013, "step": 1874 }, { "epoch": 1.406601650412603, "grad_norm": 0.20843681693077087, "learning_rate": 2.451017853981013e-05, "loss": 0.0161, "step": 1875 }, { "epoch": 1.4073518379594898, "grad_norm": 0.12823422253131866, "learning_rate": 2.4453870761080554e-05, "loss": 0.0222, "step": 1876 }, { "epoch": 1.4081020255063765, "grad_norm": 0.14354470372200012, "learning_rate": 2.4397606791447052e-05, "loss": 0.0188, "step": 1877 }, { "epoch": 1.4088522130532632, "grad_norm": 0.1241222470998764, "learning_rate": 2.4341386727396793e-05, "loss": 0.0204, "step": 1878 }, { "epoch": 1.40960240060015, "grad_norm": 0.11772765964269638, "learning_rate": 2.4285210665341646e-05, "loss": 0.0146, "step": 1879 }, { "epoch": 1.4103525881470367, "grad_norm": 0.14254790544509888, "learning_rate": 2.422907870161803e-05, "loss": 0.0168, "step": 1880 }, { "epoch": 1.4111027756939234, "grad_norm": 0.22304561734199524, "learning_rate": 2.4172990932486733e-05, "loss": 0.0146, "step": 1881 }, { "epoch": 1.4118529632408103, "grad_norm": 0.16511446237564087, "learning_rate": 2.4116947454132782e-05, "loss": 0.0187, "step": 1882 }, { "epoch": 1.4126031507876968, "grad_norm": 0.12960758805274963, "learning_rate": 2.4060948362665176e-05, "loss": 0.0145, "step": 1883 }, { "epoch": 1.4133533383345838, "grad_norm": 0.11755431443452835, "learning_rate": 2.4004993754116867e-05, "loss": 0.0171, "step": 1884 }, { "epoch": 1.4141035258814703, "grad_norm": 0.08797797560691833, "learning_rate": 2.39490837244445e-05, "loss": 0.011, "step": 1885 }, { "epoch": 1.4148537134283572, "grad_norm": 0.08972885459661484, "learning_rate": 2.389321836952828e-05, "loss": 0.012, "step": 1886 }, { "epoch": 1.4156039009752437, "grad_norm": 0.1484125852584839, "learning_rate": 2.383739778517176e-05, "loss": 0.0164, "step": 1887 }, { "epoch": 1.4163540885221306, "grad_norm": 0.20713378489017487, "learning_rate": 2.3781622067101767e-05, "loss": 0.0283, "step": 1888 }, { "epoch": 1.4171042760690171, "grad_norm": 0.13764242827892303, "learning_rate": 2.372589131096816e-05, "loss": 0.0225, "step": 1889 }, { "epoch": 1.417854463615904, "grad_norm": 0.12255063652992249, "learning_rate": 2.36702056123437e-05, "loss": 0.0153, "step": 1890 }, { "epoch": 1.4186046511627908, "grad_norm": 0.09976071864366531, "learning_rate": 2.3614565066723892e-05, "loss": 0.0147, "step": 1891 }, { "epoch": 1.4193548387096775, "grad_norm": 0.14297650754451752, "learning_rate": 2.355896976952674e-05, "loss": 0.0195, "step": 1892 }, { "epoch": 1.4201050262565642, "grad_norm": 0.1632770597934723, "learning_rate": 2.350341981609276e-05, "loss": 0.0217, "step": 1893 }, { "epoch": 1.420855213803451, "grad_norm": 0.13608597218990326, "learning_rate": 2.344791530168465e-05, "loss": 0.0178, "step": 1894 }, { "epoch": 1.4216054013503376, "grad_norm": 0.09987060725688934, "learning_rate": 2.339245632148715e-05, "loss": 0.0168, "step": 1895 }, { "epoch": 1.4223555888972244, "grad_norm": 0.15993699431419373, "learning_rate": 2.3337042970606965e-05, "loss": 0.0261, "step": 1896 }, { "epoch": 1.423105776444111, "grad_norm": 0.11352405697107315, "learning_rate": 2.3281675344072545e-05, "loss": 0.0166, "step": 1897 }, { "epoch": 1.4238559639909978, "grad_norm": 0.09080108255147934, "learning_rate": 2.3226353536833907e-05, "loss": 0.0145, "step": 1898 }, { "epoch": 1.4246061515378845, "grad_norm": 0.11767710000276566, "learning_rate": 2.317107764376253e-05, "loss": 0.0185, "step": 1899 }, { "epoch": 1.4253563390847712, "grad_norm": 0.10477675497531891, "learning_rate": 2.3115847759651082e-05, "loss": 0.0147, "step": 1900 }, { "epoch": 1.426106526631658, "grad_norm": 0.1232522651553154, "learning_rate": 2.3060663979213404e-05, "loss": 0.0192, "step": 1901 }, { "epoch": 1.4268567141785446, "grad_norm": 0.1216619461774826, "learning_rate": 2.300552639708423e-05, "loss": 0.0199, "step": 1902 }, { "epoch": 1.4276069017254314, "grad_norm": 0.0820419117808342, "learning_rate": 2.2950435107819124e-05, "loss": 0.0103, "step": 1903 }, { "epoch": 1.428357089272318, "grad_norm": 0.13588947057724, "learning_rate": 2.2895390205894164e-05, "loss": 0.0185, "step": 1904 }, { "epoch": 1.4291072768192048, "grad_norm": 0.30323174595832825, "learning_rate": 2.2840391785705967e-05, "loss": 0.0401, "step": 1905 }, { "epoch": 1.4298574643660915, "grad_norm": 0.15225709974765778, "learning_rate": 2.278543994157139e-05, "loss": 0.0165, "step": 1906 }, { "epoch": 1.4306076519129782, "grad_norm": 0.10808411240577698, "learning_rate": 2.2730534767727483e-05, "loss": 0.0143, "step": 1907 }, { "epoch": 1.431357839459865, "grad_norm": 0.1031549796462059, "learning_rate": 2.267567635833116e-05, "loss": 0.0224, "step": 1908 }, { "epoch": 1.4321080270067517, "grad_norm": 0.10170621424913406, "learning_rate": 2.2620864807459213e-05, "loss": 0.014, "step": 1909 }, { "epoch": 1.4328582145536384, "grad_norm": 0.10480276495218277, "learning_rate": 2.2566100209108048e-05, "loss": 0.0147, "step": 1910 }, { "epoch": 1.433608402100525, "grad_norm": 0.17218300700187683, "learning_rate": 2.2511382657193565e-05, "loss": 0.0171, "step": 1911 }, { "epoch": 1.4343585896474118, "grad_norm": 0.10617750883102417, "learning_rate": 2.2456712245550993e-05, "loss": 0.0175, "step": 1912 }, { "epoch": 1.4351087771942985, "grad_norm": 0.10631793737411499, "learning_rate": 2.2402089067934668e-05, "loss": 0.016, "step": 1913 }, { "epoch": 1.4358589647411852, "grad_norm": 0.15100601315498352, "learning_rate": 2.2347513218017974e-05, "loss": 0.0236, "step": 1914 }, { "epoch": 1.436609152288072, "grad_norm": 0.15311028063297272, "learning_rate": 2.2292984789393122e-05, "loss": 0.0237, "step": 1915 }, { "epoch": 1.4373593398349587, "grad_norm": 0.0808829814195633, "learning_rate": 2.2238503875571028e-05, "loss": 0.0093, "step": 1916 }, { "epoch": 1.4381095273818454, "grad_norm": 0.18772268295288086, "learning_rate": 2.218407056998104e-05, "loss": 0.0278, "step": 1917 }, { "epoch": 1.438859714928732, "grad_norm": 0.06562066823244095, "learning_rate": 2.2129684965970948e-05, "loss": 0.0077, "step": 1918 }, { "epoch": 1.439609902475619, "grad_norm": 0.1397140920162201, "learning_rate": 2.2075347156806697e-05, "loss": 0.0193, "step": 1919 }, { "epoch": 1.4403600900225055, "grad_norm": 0.1287301629781723, "learning_rate": 2.2021057235672288e-05, "loss": 0.0231, "step": 1920 }, { "epoch": 1.4411102775693925, "grad_norm": 0.10357276350259781, "learning_rate": 2.1966815295669585e-05, "loss": 0.0176, "step": 1921 }, { "epoch": 1.441860465116279, "grad_norm": 0.12166761606931686, "learning_rate": 2.1912621429818177e-05, "loss": 0.0171, "step": 1922 }, { "epoch": 1.442610652663166, "grad_norm": 0.08617536723613739, "learning_rate": 2.18584757310552e-05, "loss": 0.0111, "step": 1923 }, { "epoch": 1.4433608402100524, "grad_norm": 0.14840100705623627, "learning_rate": 2.1804378292235224e-05, "loss": 0.0181, "step": 1924 }, { "epoch": 1.4441110277569393, "grad_norm": 0.1149296835064888, "learning_rate": 2.1750329206129988e-05, "loss": 0.0198, "step": 1925 }, { "epoch": 1.4448612153038258, "grad_norm": 0.13646076619625092, "learning_rate": 2.1696328565428364e-05, "loss": 0.0169, "step": 1926 }, { "epoch": 1.4456114028507128, "grad_norm": 0.09913797676563263, "learning_rate": 2.1642376462736148e-05, "loss": 0.0109, "step": 1927 }, { "epoch": 1.4463615903975993, "grad_norm": 0.22945545613765717, "learning_rate": 2.158847299057587e-05, "loss": 0.0235, "step": 1928 }, { "epoch": 1.4471117779444862, "grad_norm": 0.0962451621890068, "learning_rate": 2.1534618241386705e-05, "loss": 0.0177, "step": 1929 }, { "epoch": 1.447861965491373, "grad_norm": 0.08848632872104645, "learning_rate": 2.14808123075242e-05, "loss": 0.0122, "step": 1930 }, { "epoch": 1.4486121530382596, "grad_norm": 0.13440725207328796, "learning_rate": 2.1427055281260255e-05, "loss": 0.0169, "step": 1931 }, { "epoch": 1.4493623405851463, "grad_norm": 0.08332593739032745, "learning_rate": 2.1373347254782882e-05, "loss": 0.0145, "step": 1932 }, { "epoch": 1.450112528132033, "grad_norm": 0.12370090931653976, "learning_rate": 2.1319688320196048e-05, "loss": 0.0157, "step": 1933 }, { "epoch": 1.4508627156789198, "grad_norm": 0.13575379550457, "learning_rate": 2.1266078569519542e-05, "loss": 0.0129, "step": 1934 }, { "epoch": 1.4516129032258065, "grad_norm": 0.10451852530241013, "learning_rate": 2.121251809468882e-05, "loss": 0.0181, "step": 1935 }, { "epoch": 1.4523630907726932, "grad_norm": 0.10986686497926712, "learning_rate": 2.1159006987554807e-05, "loss": 0.0178, "step": 1936 }, { "epoch": 1.45311327831958, "grad_norm": 0.08338434994220734, "learning_rate": 2.1105545339883808e-05, "loss": 0.0108, "step": 1937 }, { "epoch": 1.4538634658664666, "grad_norm": 0.14483703672885895, "learning_rate": 2.1052133243357253e-05, "loss": 0.0191, "step": 1938 }, { "epoch": 1.4546136534133534, "grad_norm": 0.12387536466121674, "learning_rate": 2.0998770789571636e-05, "loss": 0.0153, "step": 1939 }, { "epoch": 1.45536384096024, "grad_norm": 0.10853541642427444, "learning_rate": 2.0945458070038315e-05, "loss": 0.0115, "step": 1940 }, { "epoch": 1.4561140285071268, "grad_norm": 0.14933103322982788, "learning_rate": 2.0892195176183354e-05, "loss": 0.0164, "step": 1941 }, { "epoch": 1.4568642160540135, "grad_norm": 0.131834477186203, "learning_rate": 2.083898219934739e-05, "loss": 0.0176, "step": 1942 }, { "epoch": 1.4576144036009002, "grad_norm": 0.14950533211231232, "learning_rate": 2.0785819230785398e-05, "loss": 0.0226, "step": 1943 }, { "epoch": 1.458364591147787, "grad_norm": 0.0855354368686676, "learning_rate": 2.073270636166666e-05, "loss": 0.016, "step": 1944 }, { "epoch": 1.4591147786946737, "grad_norm": 0.13274472951889038, "learning_rate": 2.0679643683074513e-05, "loss": 0.0165, "step": 1945 }, { "epoch": 1.4598649662415604, "grad_norm": 0.10879681259393692, "learning_rate": 2.0626631286006236e-05, "loss": 0.0165, "step": 1946 }, { "epoch": 1.460615153788447, "grad_norm": 0.12825417518615723, "learning_rate": 2.0573669261372847e-05, "loss": 0.0211, "step": 1947 }, { "epoch": 1.4613653413353338, "grad_norm": 0.10240969061851501, "learning_rate": 2.052075769999899e-05, "loss": 0.0141, "step": 1948 }, { "epoch": 1.4621155288822205, "grad_norm": 0.08049630373716354, "learning_rate": 2.046789669262283e-05, "loss": 0.0122, "step": 1949 }, { "epoch": 1.4628657164291072, "grad_norm": 0.17087234556674957, "learning_rate": 2.0415086329895784e-05, "loss": 0.0184, "step": 1950 }, { "epoch": 1.463615903975994, "grad_norm": 0.09374770522117615, "learning_rate": 2.0362326702382384e-05, "loss": 0.0119, "step": 1951 }, { "epoch": 1.4643660915228807, "grad_norm": 0.10966373980045319, "learning_rate": 2.0309617900560218e-05, "loss": 0.0148, "step": 1952 }, { "epoch": 1.4651162790697674, "grad_norm": 0.1399017721414566, "learning_rate": 2.0256960014819692e-05, "loss": 0.02, "step": 1953 }, { "epoch": 1.465866466616654, "grad_norm": 0.10857044905424118, "learning_rate": 2.020435313546391e-05, "loss": 0.0161, "step": 1954 }, { "epoch": 1.4666166541635408, "grad_norm": 0.22650955617427826, "learning_rate": 2.0151797352708457e-05, "loss": 0.0407, "step": 1955 }, { "epoch": 1.4673668417104275, "grad_norm": 0.12916965782642365, "learning_rate": 2.0099292756681343e-05, "loss": 0.0206, "step": 1956 }, { "epoch": 1.4681170292573142, "grad_norm": 0.15416133403778076, "learning_rate": 2.0046839437422772e-05, "loss": 0.021, "step": 1957 }, { "epoch": 1.4688672168042012, "grad_norm": 0.08638902008533478, "learning_rate": 1.999443748488503e-05, "loss": 0.0104, "step": 1958 }, { "epoch": 1.4696174043510877, "grad_norm": 0.08310846239328384, "learning_rate": 1.9942086988932323e-05, "loss": 0.0119, "step": 1959 }, { "epoch": 1.4703675918979746, "grad_norm": 0.08909523487091064, "learning_rate": 1.9889788039340558e-05, "loss": 0.0173, "step": 1960 }, { "epoch": 1.471117779444861, "grad_norm": 0.1264401376247406, "learning_rate": 1.9837540725797305e-05, "loss": 0.0184, "step": 1961 }, { "epoch": 1.471867966991748, "grad_norm": 0.1094694435596466, "learning_rate": 1.9785345137901533e-05, "loss": 0.0153, "step": 1962 }, { "epoch": 1.4726181545386345, "grad_norm": 0.08005423098802567, "learning_rate": 1.9733201365163607e-05, "loss": 0.0126, "step": 1963 }, { "epoch": 1.4733683420855215, "grad_norm": 0.1491316258907318, "learning_rate": 1.968110949700489e-05, "loss": 0.025, "step": 1964 }, { "epoch": 1.474118529632408, "grad_norm": 0.10460489988327026, "learning_rate": 1.962906962275784e-05, "loss": 0.0161, "step": 1965 }, { "epoch": 1.474868717179295, "grad_norm": 0.07633949816226959, "learning_rate": 1.9577081831665707e-05, "loss": 0.0112, "step": 1966 }, { "epoch": 1.4756189047261816, "grad_norm": 0.12210875749588013, "learning_rate": 1.9525146212882456e-05, "loss": 0.0244, "step": 1967 }, { "epoch": 1.4763690922730683, "grad_norm": 0.10730652511119843, "learning_rate": 1.9473262855472517e-05, "loss": 0.0162, "step": 1968 }, { "epoch": 1.477119279819955, "grad_norm": 0.1215217113494873, "learning_rate": 1.942143184841077e-05, "loss": 0.0162, "step": 1969 }, { "epoch": 1.4778694673668418, "grad_norm": 0.0921521708369255, "learning_rate": 1.9369653280582273e-05, "loss": 0.015, "step": 1970 }, { "epoch": 1.4786196549137285, "grad_norm": 0.19362576305866241, "learning_rate": 1.931792724078218e-05, "loss": 0.0275, "step": 1971 }, { "epoch": 1.4793698424606152, "grad_norm": 0.12856422364711761, "learning_rate": 1.9266253817715575e-05, "loss": 0.017, "step": 1972 }, { "epoch": 1.480120030007502, "grad_norm": 0.17855559289455414, "learning_rate": 1.921463309999724e-05, "loss": 0.0222, "step": 1973 }, { "epoch": 1.4808702175543886, "grad_norm": 0.10688813775777817, "learning_rate": 1.9163065176151662e-05, "loss": 0.0149, "step": 1974 }, { "epoch": 1.4816204051012754, "grad_norm": 0.10548462718725204, "learning_rate": 1.9111550134612738e-05, "loss": 0.0143, "step": 1975 }, { "epoch": 1.482370592648162, "grad_norm": 0.07540253549814224, "learning_rate": 1.9060088063723696e-05, "loss": 0.0094, "step": 1976 }, { "epoch": 1.4831207801950488, "grad_norm": 0.09331195056438446, "learning_rate": 1.900867905173692e-05, "loss": 0.015, "step": 1977 }, { "epoch": 1.4838709677419355, "grad_norm": 0.09781509637832642, "learning_rate": 1.8957323186813803e-05, "loss": 0.0107, "step": 1978 }, { "epoch": 1.4846211552888222, "grad_norm": 0.12924246490001678, "learning_rate": 1.8906020557024597e-05, "loss": 0.0132, "step": 1979 }, { "epoch": 1.485371342835709, "grad_norm": 0.26540517807006836, "learning_rate": 1.885477125034827e-05, "loss": 0.0255, "step": 1980 }, { "epoch": 1.4861215303825956, "grad_norm": 0.17854297161102295, "learning_rate": 1.8803575354672315e-05, "loss": 0.0234, "step": 1981 }, { "epoch": 1.4868717179294824, "grad_norm": 0.1174192726612091, "learning_rate": 1.8752432957792654e-05, "loss": 0.0136, "step": 1982 }, { "epoch": 1.487621905476369, "grad_norm": 0.14164678752422333, "learning_rate": 1.8701344147413474e-05, "loss": 0.0221, "step": 1983 }, { "epoch": 1.4883720930232558, "grad_norm": 0.08099871128797531, "learning_rate": 1.8650309011147053e-05, "loss": 0.0129, "step": 1984 }, { "epoch": 1.4891222805701425, "grad_norm": 0.09161733090877533, "learning_rate": 1.8599327636513636e-05, "loss": 0.0118, "step": 1985 }, { "epoch": 1.4898724681170292, "grad_norm": 0.1466006189584732, "learning_rate": 1.8548400110941228e-05, "loss": 0.0109, "step": 1986 }, { "epoch": 1.490622655663916, "grad_norm": 0.10579754412174225, "learning_rate": 1.8497526521765534e-05, "loss": 0.0101, "step": 1987 }, { "epoch": 1.4913728432108027, "grad_norm": 0.06394397467374802, "learning_rate": 1.844670695622976e-05, "loss": 0.0073, "step": 1988 }, { "epoch": 1.4921230307576894, "grad_norm": 0.1046217605471611, "learning_rate": 1.8395941501484464e-05, "loss": 0.0163, "step": 1989 }, { "epoch": 1.492873218304576, "grad_norm": 0.09319301694631577, "learning_rate": 1.8345230244587354e-05, "loss": 0.0138, "step": 1990 }, { "epoch": 1.4936234058514628, "grad_norm": 0.10420255362987518, "learning_rate": 1.829457327250329e-05, "loss": 0.0124, "step": 1991 }, { "epoch": 1.4943735933983495, "grad_norm": 0.12257860600948334, "learning_rate": 1.8243970672103982e-05, "loss": 0.0143, "step": 1992 }, { "epoch": 1.4951237809452362, "grad_norm": 0.15633484721183777, "learning_rate": 1.8193422530167914e-05, "loss": 0.0183, "step": 1993 }, { "epoch": 1.495873968492123, "grad_norm": 0.09158031642436981, "learning_rate": 1.8142928933380142e-05, "loss": 0.0149, "step": 1994 }, { "epoch": 1.49662415603901, "grad_norm": 0.10753414034843445, "learning_rate": 1.8092489968332233e-05, "loss": 0.0138, "step": 1995 }, { "epoch": 1.4973743435858964, "grad_norm": 0.09815339744091034, "learning_rate": 1.804210572152204e-05, "loss": 0.0118, "step": 1996 }, { "epoch": 1.4981245311327833, "grad_norm": 0.13312570750713348, "learning_rate": 1.7991776279353604e-05, "loss": 0.0186, "step": 1997 }, { "epoch": 1.4988747186796698, "grad_norm": 0.08981100469827652, "learning_rate": 1.794150172813693e-05, "loss": 0.0095, "step": 1998 }, { "epoch": 1.4996249062265568, "grad_norm": 0.18983112275600433, "learning_rate": 1.7891282154087934e-05, "loss": 0.0228, "step": 1999 }, { "epoch": 1.5003750937734432, "grad_norm": 0.10713919252157211, "learning_rate": 1.7841117643328246e-05, "loss": 0.0154, "step": 2000 }, { "epoch": 1.5003750937734432, "eval_loss": 0.03212602809071541, "eval_runtime": 5.1295, "eval_samples_per_second": 10.527, "eval_steps_per_second": 2.729, "step": 2000 }, { "epoch": 1.5011252813203302, "grad_norm": 0.105213463306427, "learning_rate": 1.779100828188506e-05, "loss": 0.0187, "step": 2001 }, { "epoch": 1.5018754688672167, "grad_norm": 0.1180376186966896, "learning_rate": 1.774095415569102e-05, "loss": 0.0128, "step": 2002 }, { "epoch": 1.5026256564141036, "grad_norm": 0.09247060120105743, "learning_rate": 1.7690955350583976e-05, "loss": 0.0089, "step": 2003 }, { "epoch": 1.50337584396099, "grad_norm": 0.22491805255413055, "learning_rate": 1.764101195230696e-05, "loss": 0.0351, "step": 2004 }, { "epoch": 1.504126031507877, "grad_norm": 0.6523817181587219, "learning_rate": 1.7591124046508045e-05, "loss": 0.0568, "step": 2005 }, { "epoch": 1.5048762190547635, "grad_norm": 0.11308412998914719, "learning_rate": 1.7541291718740012e-05, "loss": 0.0138, "step": 2006 }, { "epoch": 1.5056264066016505, "grad_norm": 0.10251988470554352, "learning_rate": 1.7491515054460418e-05, "loss": 0.0118, "step": 2007 }, { "epoch": 1.506376594148537, "grad_norm": 0.09428345412015915, "learning_rate": 1.7441794139031337e-05, "loss": 0.0191, "step": 2008 }, { "epoch": 1.507126781695424, "grad_norm": 0.15063363313674927, "learning_rate": 1.7392129057719246e-05, "loss": 0.0192, "step": 2009 }, { "epoch": 1.5078769692423106, "grad_norm": 0.14728331565856934, "learning_rate": 1.7342519895694886e-05, "loss": 0.0209, "step": 2010 }, { "epoch": 1.5086271567891973, "grad_norm": 0.14803294837474823, "learning_rate": 1.7292966738033057e-05, "loss": 0.0186, "step": 2011 }, { "epoch": 1.509377344336084, "grad_norm": 0.11730030179023743, "learning_rate": 1.7243469669712546e-05, "loss": 0.0185, "step": 2012 }, { "epoch": 1.5101275318829708, "grad_norm": 0.10365916788578033, "learning_rate": 1.7194028775615966e-05, "loss": 0.0143, "step": 2013 }, { "epoch": 1.5108777194298575, "grad_norm": 0.12164322286844254, "learning_rate": 1.714464414052958e-05, "loss": 0.0144, "step": 2014 }, { "epoch": 1.5116279069767442, "grad_norm": 0.1376684308052063, "learning_rate": 1.7095315849143184e-05, "loss": 0.0236, "step": 2015 }, { "epoch": 1.512378094523631, "grad_norm": 0.11822548508644104, "learning_rate": 1.704604398604991e-05, "loss": 0.016, "step": 2016 }, { "epoch": 1.5131282820705176, "grad_norm": 0.11870657652616501, "learning_rate": 1.6996828635746165e-05, "loss": 0.0202, "step": 2017 }, { "epoch": 1.5138784696174044, "grad_norm": 0.098456971347332, "learning_rate": 1.6947669882631434e-05, "loss": 0.0128, "step": 2018 }, { "epoch": 1.514628657164291, "grad_norm": 0.125031977891922, "learning_rate": 1.6898567811008135e-05, "loss": 0.0134, "step": 2019 }, { "epoch": 1.5153788447111778, "grad_norm": 0.08734133839607239, "learning_rate": 1.684952250508149e-05, "loss": 0.0117, "step": 2020 }, { "epoch": 1.5161290322580645, "grad_norm": 0.09543147683143616, "learning_rate": 1.6800534048959364e-05, "loss": 0.012, "step": 2021 }, { "epoch": 1.5168792198049512, "grad_norm": 0.11019328981637955, "learning_rate": 1.6751602526652133e-05, "loss": 0.0151, "step": 2022 }, { "epoch": 1.517629407351838, "grad_norm": 0.16680921614170074, "learning_rate": 1.6702728022072562e-05, "loss": 0.0181, "step": 2023 }, { "epoch": 1.5183795948987246, "grad_norm": 0.09131893515586853, "learning_rate": 1.665391061903558e-05, "loss": 0.0126, "step": 2024 }, { "epoch": 1.5191297824456114, "grad_norm": 0.13133695721626282, "learning_rate": 1.660515040125824e-05, "loss": 0.0241, "step": 2025 }, { "epoch": 1.519879969992498, "grad_norm": 0.09224233776330948, "learning_rate": 1.6556447452359512e-05, "loss": 0.0148, "step": 2026 }, { "epoch": 1.5206301575393848, "grad_norm": 0.1913694441318512, "learning_rate": 1.6507801855860177e-05, "loss": 0.0211, "step": 2027 }, { "epoch": 1.5213803450862715, "grad_norm": 0.09763851761817932, "learning_rate": 1.645921369518261e-05, "loss": 0.0106, "step": 2028 }, { "epoch": 1.5221305326331582, "grad_norm": 0.13741938769817352, "learning_rate": 1.6410683053650737e-05, "loss": 0.0161, "step": 2029 }, { "epoch": 1.5228807201800452, "grad_norm": 0.16641104221343994, "learning_rate": 1.636221001448983e-05, "loss": 0.0226, "step": 2030 }, { "epoch": 1.5236309077269317, "grad_norm": 0.14530512690544128, "learning_rate": 1.631379466082638e-05, "loss": 0.0234, "step": 2031 }, { "epoch": 1.5243810952738186, "grad_norm": 0.1273575872182846, "learning_rate": 1.626543707568795e-05, "loss": 0.0153, "step": 2032 }, { "epoch": 1.525131282820705, "grad_norm": 0.18259398639202118, "learning_rate": 1.6217137342003036e-05, "loss": 0.0178, "step": 2033 }, { "epoch": 1.525881470367592, "grad_norm": 0.1915973424911499, "learning_rate": 1.616889554260092e-05, "loss": 0.0224, "step": 2034 }, { "epoch": 1.5266316579144785, "grad_norm": 0.14290551841259003, "learning_rate": 1.6120711760211548e-05, "loss": 0.0205, "step": 2035 }, { "epoch": 1.5273818454613655, "grad_norm": 0.10475607216358185, "learning_rate": 1.607258607746537e-05, "loss": 0.0187, "step": 2036 }, { "epoch": 1.528132033008252, "grad_norm": 0.1033739373087883, "learning_rate": 1.602451857689316e-05, "loss": 0.0129, "step": 2037 }, { "epoch": 1.528882220555139, "grad_norm": 0.09066800773143768, "learning_rate": 1.5976509340925977e-05, "loss": 0.0094, "step": 2038 }, { "epoch": 1.5296324081020254, "grad_norm": 0.12105967849493027, "learning_rate": 1.5928558451894914e-05, "loss": 0.0142, "step": 2039 }, { "epoch": 1.5303825956489123, "grad_norm": 0.38493460416793823, "learning_rate": 1.588066599203106e-05, "loss": 0.0288, "step": 2040 }, { "epoch": 1.5311327831957988, "grad_norm": 0.08361639082431793, "learning_rate": 1.583283204346521e-05, "loss": 0.0118, "step": 2041 }, { "epoch": 1.5318829707426858, "grad_norm": 0.13456936180591583, "learning_rate": 1.5785056688227916e-05, "loss": 0.0133, "step": 2042 }, { "epoch": 1.5326331582895723, "grad_norm": 0.16199876368045807, "learning_rate": 1.5737340008249202e-05, "loss": 0.0225, "step": 2043 }, { "epoch": 1.5333833458364592, "grad_norm": 0.0884062722325325, "learning_rate": 1.5689682085358465e-05, "loss": 0.0107, "step": 2044 }, { "epoch": 1.5341335333833457, "grad_norm": 0.06545273959636688, "learning_rate": 1.564208300128438e-05, "loss": 0.0084, "step": 2045 }, { "epoch": 1.5348837209302326, "grad_norm": 0.08026087284088135, "learning_rate": 1.5594542837654625e-05, "loss": 0.0094, "step": 2046 }, { "epoch": 1.5356339084771191, "grad_norm": 0.11514996737241745, "learning_rate": 1.554706167599596e-05, "loss": 0.0161, "step": 2047 }, { "epoch": 1.536384096024006, "grad_norm": 0.14320102334022522, "learning_rate": 1.5499639597733902e-05, "loss": 0.0168, "step": 2048 }, { "epoch": 1.5371342835708928, "grad_norm": 0.11476108431816101, "learning_rate": 1.54522766841926e-05, "loss": 0.0163, "step": 2049 }, { "epoch": 1.5378844711177795, "grad_norm": 0.17724426090717316, "learning_rate": 1.540497301659482e-05, "loss": 0.0182, "step": 2050 }, { "epoch": 1.5386346586646662, "grad_norm": 0.15506739914417267, "learning_rate": 1.5357728676061685e-05, "loss": 0.0185, "step": 2051 }, { "epoch": 1.539384846211553, "grad_norm": 0.19287851452827454, "learning_rate": 1.5310543743612582e-05, "loss": 0.0222, "step": 2052 }, { "epoch": 1.5401350337584396, "grad_norm": 0.14303511381149292, "learning_rate": 1.526341830016505e-05, "loss": 0.0209, "step": 2053 }, { "epoch": 1.5408852213053263, "grad_norm": 0.12963083386421204, "learning_rate": 1.5216352426534548e-05, "loss": 0.0198, "step": 2054 }, { "epoch": 1.541635408852213, "grad_norm": 0.1562402844429016, "learning_rate": 1.5169346203434425e-05, "loss": 0.0189, "step": 2055 }, { "epoch": 1.5423855963990998, "grad_norm": 0.15967603027820587, "learning_rate": 1.5122399711475732e-05, "loss": 0.0253, "step": 2056 }, { "epoch": 1.5431357839459865, "grad_norm": 0.16278547048568726, "learning_rate": 1.50755130311671e-05, "loss": 0.0206, "step": 2057 }, { "epoch": 1.5438859714928732, "grad_norm": 0.13421553373336792, "learning_rate": 1.502868624291452e-05, "loss": 0.019, "step": 2058 }, { "epoch": 1.54463615903976, "grad_norm": 0.12211119383573532, "learning_rate": 1.4981919427021357e-05, "loss": 0.0176, "step": 2059 }, { "epoch": 1.5453863465866466, "grad_norm": 0.11205483227968216, "learning_rate": 1.493521266368807e-05, "loss": 0.0144, "step": 2060 }, { "epoch": 1.5461365341335334, "grad_norm": 0.10355428606271744, "learning_rate": 1.4888566033012201e-05, "loss": 0.0181, "step": 2061 }, { "epoch": 1.54688672168042, "grad_norm": 0.07181017100811005, "learning_rate": 1.4841979614988094e-05, "loss": 0.0092, "step": 2062 }, { "epoch": 1.5476369092273068, "grad_norm": 0.09114021062850952, "learning_rate": 1.4795453489506878e-05, "loss": 0.0159, "step": 2063 }, { "epoch": 1.5483870967741935, "grad_norm": 0.10014685988426208, "learning_rate": 1.4748987736356273e-05, "loss": 0.0149, "step": 2064 }, { "epoch": 1.5491372843210802, "grad_norm": 0.10421111434698105, "learning_rate": 1.4702582435220475e-05, "loss": 0.0174, "step": 2065 }, { "epoch": 1.549887471867967, "grad_norm": 0.17178568243980408, "learning_rate": 1.4656237665680017e-05, "loss": 0.0273, "step": 2066 }, { "epoch": 1.5506376594148539, "grad_norm": 0.0894736498594284, "learning_rate": 1.4609953507211593e-05, "loss": 0.0137, "step": 2067 }, { "epoch": 1.5513878469617404, "grad_norm": 0.09273448586463928, "learning_rate": 1.4563730039187984e-05, "loss": 0.0157, "step": 2068 }, { "epoch": 1.5521380345086273, "grad_norm": 0.06843544542789459, "learning_rate": 1.4517567340877886e-05, "loss": 0.0092, "step": 2069 }, { "epoch": 1.5528882220555138, "grad_norm": 0.09910368174314499, "learning_rate": 1.4471465491445802e-05, "loss": 0.0124, "step": 2070 }, { "epoch": 1.5536384096024007, "grad_norm": 0.20299047231674194, "learning_rate": 1.4425424569951822e-05, "loss": 0.0179, "step": 2071 }, { "epoch": 1.5543885971492872, "grad_norm": 0.1219157725572586, "learning_rate": 1.4379444655351626e-05, "loss": 0.0229, "step": 2072 }, { "epoch": 1.5551387846961742, "grad_norm": 0.056685127317905426, "learning_rate": 1.4333525826496224e-05, "loss": 0.0047, "step": 2073 }, { "epoch": 1.5558889722430607, "grad_norm": 0.11681665480136871, "learning_rate": 1.4287668162131896e-05, "loss": 0.0177, "step": 2074 }, { "epoch": 1.5566391597899476, "grad_norm": 0.12039943039417267, "learning_rate": 1.4241871740900014e-05, "loss": 0.0154, "step": 2075 }, { "epoch": 1.557389347336834, "grad_norm": 0.12308166176080704, "learning_rate": 1.4196136641336932e-05, "loss": 0.0185, "step": 2076 }, { "epoch": 1.558139534883721, "grad_norm": 0.12046025693416595, "learning_rate": 1.4150462941873843e-05, "loss": 0.0149, "step": 2077 }, { "epoch": 1.5588897224306075, "grad_norm": 0.09745654463768005, "learning_rate": 1.410485072083666e-05, "loss": 0.0133, "step": 2078 }, { "epoch": 1.5596399099774945, "grad_norm": 0.09617792069911957, "learning_rate": 1.4059300056445823e-05, "loss": 0.0163, "step": 2079 }, { "epoch": 1.560390097524381, "grad_norm": 0.13568072021007538, "learning_rate": 1.4013811026816243e-05, "loss": 0.0187, "step": 2080 }, { "epoch": 1.561140285071268, "grad_norm": 0.15544694662094116, "learning_rate": 1.3968383709957133e-05, "loss": 0.0136, "step": 2081 }, { "epoch": 1.5618904726181544, "grad_norm": 0.0854220986366272, "learning_rate": 1.3923018183771868e-05, "loss": 0.0126, "step": 2082 }, { "epoch": 1.5626406601650413, "grad_norm": 0.16740089654922485, "learning_rate": 1.3877714526057872e-05, "loss": 0.0207, "step": 2083 }, { "epoch": 1.5633908477119278, "grad_norm": 0.20988257229328156, "learning_rate": 1.3832472814506425e-05, "loss": 0.0261, "step": 2084 }, { "epoch": 1.5641410352588148, "grad_norm": 0.13327625393867493, "learning_rate": 1.3787293126702622e-05, "loss": 0.0171, "step": 2085 }, { "epoch": 1.5648912228057015, "grad_norm": 0.22465142607688904, "learning_rate": 1.3742175540125179e-05, "loss": 0.0305, "step": 2086 }, { "epoch": 1.5656414103525882, "grad_norm": 0.11496560275554657, "learning_rate": 1.3697120132146318e-05, "loss": 0.0179, "step": 2087 }, { "epoch": 1.566391597899475, "grad_norm": 0.08796335756778717, "learning_rate": 1.3652126980031627e-05, "loss": 0.0097, "step": 2088 }, { "epoch": 1.5671417854463616, "grad_norm": 0.09953059256076813, "learning_rate": 1.3607196160939927e-05, "loss": 0.0112, "step": 2089 }, { "epoch": 1.5678919729932483, "grad_norm": 0.10293276607990265, "learning_rate": 1.3562327751923149e-05, "loss": 0.0169, "step": 2090 }, { "epoch": 1.568642160540135, "grad_norm": 0.15932396054267883, "learning_rate": 1.351752182992621e-05, "loss": 0.0227, "step": 2091 }, { "epoch": 1.5693923480870218, "grad_norm": 0.06464660167694092, "learning_rate": 1.3472778471786829e-05, "loss": 0.0069, "step": 2092 }, { "epoch": 1.5701425356339085, "grad_norm": 0.11081813275814056, "learning_rate": 1.3428097754235475e-05, "loss": 0.0131, "step": 2093 }, { "epoch": 1.5708927231807952, "grad_norm": 0.1540287286043167, "learning_rate": 1.3383479753895174e-05, "loss": 0.0267, "step": 2094 }, { "epoch": 1.571642910727682, "grad_norm": 0.13682395219802856, "learning_rate": 1.33389245472814e-05, "loss": 0.0134, "step": 2095 }, { "epoch": 1.5723930982745686, "grad_norm": 0.1723647564649582, "learning_rate": 1.3294432210801966e-05, "loss": 0.0251, "step": 2096 }, { "epoch": 1.5731432858214554, "grad_norm": 0.12762096524238586, "learning_rate": 1.3250002820756819e-05, "loss": 0.0188, "step": 2097 }, { "epoch": 1.573893473368342, "grad_norm": 0.14758145809173584, "learning_rate": 1.3205636453338e-05, "loss": 0.0173, "step": 2098 }, { "epoch": 1.5746436609152288, "grad_norm": 0.0950731411576271, "learning_rate": 1.316133318462946e-05, "loss": 0.0129, "step": 2099 }, { "epoch": 1.5753938484621155, "grad_norm": 0.09899888187646866, "learning_rate": 1.3117093090606958e-05, "loss": 0.0132, "step": 2100 }, { "epoch": 1.5761440360090022, "grad_norm": 0.15141884982585907, "learning_rate": 1.3072916247137861e-05, "loss": 0.0157, "step": 2101 }, { "epoch": 1.576894223555889, "grad_norm": 0.09063942730426788, "learning_rate": 1.302880272998112e-05, "loss": 0.0119, "step": 2102 }, { "epoch": 1.5776444111027756, "grad_norm": 0.14490434527397156, "learning_rate": 1.29847526147871e-05, "loss": 0.0164, "step": 2103 }, { "epoch": 1.5783945986496624, "grad_norm": 0.09947270154953003, "learning_rate": 1.2940765977097402e-05, "loss": 0.0122, "step": 2104 }, { "epoch": 1.579144786196549, "grad_norm": 0.24933743476867676, "learning_rate": 1.2896842892344751e-05, "loss": 0.0188, "step": 2105 }, { "epoch": 1.579894973743436, "grad_norm": 0.10064645111560822, "learning_rate": 1.2852983435852928e-05, "loss": 0.0127, "step": 2106 }, { "epoch": 1.5806451612903225, "grad_norm": 0.11761155724525452, "learning_rate": 1.2809187682836588e-05, "loss": 0.0121, "step": 2107 }, { "epoch": 1.5813953488372094, "grad_norm": 0.12103329598903656, "learning_rate": 1.2765455708401142e-05, "loss": 0.0156, "step": 2108 }, { "epoch": 1.582145536384096, "grad_norm": 0.24466180801391602, "learning_rate": 1.2721787587542595e-05, "loss": 0.0367, "step": 2109 }, { "epoch": 1.5828957239309829, "grad_norm": 0.11228577047586441, "learning_rate": 1.2678183395147485e-05, "loss": 0.0176, "step": 2110 }, { "epoch": 1.5836459114778694, "grad_norm": 0.13010624051094055, "learning_rate": 1.2634643205992707e-05, "loss": 0.0134, "step": 2111 }, { "epoch": 1.5843960990247563, "grad_norm": 0.12314150482416153, "learning_rate": 1.2591167094745404e-05, "loss": 0.0215, "step": 2112 }, { "epoch": 1.5851462865716428, "grad_norm": 0.09590944647789001, "learning_rate": 1.2547755135962841e-05, "loss": 0.0179, "step": 2113 }, { "epoch": 1.5858964741185297, "grad_norm": 0.1403607428073883, "learning_rate": 1.2504407404092217e-05, "loss": 0.0255, "step": 2114 }, { "epoch": 1.5866466616654162, "grad_norm": 0.09067095071077347, "learning_rate": 1.2461123973470634e-05, "loss": 0.0129, "step": 2115 }, { "epoch": 1.5873968492123032, "grad_norm": 0.13620354235172272, "learning_rate": 1.2417904918324913e-05, "loss": 0.018, "step": 2116 }, { "epoch": 1.5881470367591897, "grad_norm": 0.0839877724647522, "learning_rate": 1.237475031277151e-05, "loss": 0.0108, "step": 2117 }, { "epoch": 1.5888972243060766, "grad_norm": 0.0976109579205513, "learning_rate": 1.2331660230816288e-05, "loss": 0.0129, "step": 2118 }, { "epoch": 1.589647411852963, "grad_norm": 0.15283679962158203, "learning_rate": 1.2288634746354505e-05, "loss": 0.0134, "step": 2119 }, { "epoch": 1.59039759939985, "grad_norm": 0.09862658381462097, "learning_rate": 1.2245673933170626e-05, "loss": 0.0117, "step": 2120 }, { "epoch": 1.5911477869467365, "grad_norm": 0.09258013963699341, "learning_rate": 1.2202777864938236e-05, "loss": 0.015, "step": 2121 }, { "epoch": 1.5918979744936235, "grad_norm": 0.15144041180610657, "learning_rate": 1.2159946615219836e-05, "loss": 0.0141, "step": 2122 }, { "epoch": 1.59264816204051, "grad_norm": 0.12255115807056427, "learning_rate": 1.211718025746682e-05, "loss": 0.0095, "step": 2123 }, { "epoch": 1.593398349587397, "grad_norm": 0.1431581974029541, "learning_rate": 1.2074478865019273e-05, "loss": 0.0139, "step": 2124 }, { "epoch": 1.5941485371342836, "grad_norm": 0.09224507957696915, "learning_rate": 1.2031842511105885e-05, "loss": 0.0121, "step": 2125 }, { "epoch": 1.5948987246811703, "grad_norm": 0.1134839728474617, "learning_rate": 1.1989271268843815e-05, "loss": 0.0119, "step": 2126 }, { "epoch": 1.595648912228057, "grad_norm": 0.13770152628421783, "learning_rate": 1.1946765211238526e-05, "loss": 0.0167, "step": 2127 }, { "epoch": 1.5963990997749438, "grad_norm": 0.14627668261528015, "learning_rate": 1.1904324411183731e-05, "loss": 0.0174, "step": 2128 }, { "epoch": 1.5971492873218305, "grad_norm": 0.12559792399406433, "learning_rate": 1.1861948941461226e-05, "loss": 0.0138, "step": 2129 }, { "epoch": 1.5978994748687172, "grad_norm": 0.1679619550704956, "learning_rate": 1.1819638874740769e-05, "loss": 0.0179, "step": 2130 }, { "epoch": 1.598649662415604, "grad_norm": 0.11241298913955688, "learning_rate": 1.1777394283579956e-05, "loss": 0.0115, "step": 2131 }, { "epoch": 1.5993998499624906, "grad_norm": 0.09657392650842667, "learning_rate": 1.1735215240424102e-05, "loss": 0.0107, "step": 2132 }, { "epoch": 1.6001500375093773, "grad_norm": 0.12358458340167999, "learning_rate": 1.1693101817606117e-05, "loss": 0.0204, "step": 2133 }, { "epoch": 1.600900225056264, "grad_norm": 0.09444307535886765, "learning_rate": 1.165105408734638e-05, "loss": 0.0109, "step": 2134 }, { "epoch": 1.6016504126031508, "grad_norm": 0.1118428185582161, "learning_rate": 1.1609072121752584e-05, "loss": 0.0201, "step": 2135 }, { "epoch": 1.6024006001500375, "grad_norm": 0.10624516010284424, "learning_rate": 1.1567155992819678e-05, "loss": 0.016, "step": 2136 }, { "epoch": 1.6031507876969242, "grad_norm": 0.20431579649448395, "learning_rate": 1.15253057724297e-05, "loss": 0.0252, "step": 2137 }, { "epoch": 1.603900975243811, "grad_norm": 0.14086978137493134, "learning_rate": 1.1483521532351654e-05, "loss": 0.0166, "step": 2138 }, { "epoch": 1.6046511627906976, "grad_norm": 0.10527478903532028, "learning_rate": 1.144180334424141e-05, "loss": 0.014, "step": 2139 }, { "epoch": 1.6054013503375844, "grad_norm": 0.10598310083150864, "learning_rate": 1.1400151279641525e-05, "loss": 0.0137, "step": 2140 }, { "epoch": 1.606151537884471, "grad_norm": 0.13043269515037537, "learning_rate": 1.1358565409981203e-05, "loss": 0.014, "step": 2141 }, { "epoch": 1.6069017254313578, "grad_norm": 0.1680910736322403, "learning_rate": 1.1317045806576121e-05, "loss": 0.0215, "step": 2142 }, { "epoch": 1.6076519129782447, "grad_norm": 0.16358719766139984, "learning_rate": 1.12755925406283e-05, "loss": 0.0223, "step": 2143 }, { "epoch": 1.6084021005251312, "grad_norm": 0.12614835798740387, "learning_rate": 1.1234205683226012e-05, "loss": 0.0148, "step": 2144 }, { "epoch": 1.6091522880720182, "grad_norm": 0.05313693732023239, "learning_rate": 1.1192885305343648e-05, "loss": 0.0079, "step": 2145 }, { "epoch": 1.6099024756189046, "grad_norm": 0.1304258108139038, "learning_rate": 1.1151631477841584e-05, "loss": 0.0211, "step": 2146 }, { "epoch": 1.6106526631657916, "grad_norm": 0.16122934222221375, "learning_rate": 1.1110444271466086e-05, "loss": 0.0167, "step": 2147 }, { "epoch": 1.611402850712678, "grad_norm": 0.08754117786884308, "learning_rate": 1.1069323756849126e-05, "loss": 0.0136, "step": 2148 }, { "epoch": 1.612153038259565, "grad_norm": 0.1655844748020172, "learning_rate": 1.102827000450835e-05, "loss": 0.0199, "step": 2149 }, { "epoch": 1.6129032258064515, "grad_norm": 0.1335715353488922, "learning_rate": 1.0987283084846905e-05, "loss": 0.0127, "step": 2150 }, { "epoch": 1.6136534133533385, "grad_norm": 0.0860769972205162, "learning_rate": 1.0946363068153343e-05, "loss": 0.0114, "step": 2151 }, { "epoch": 1.614403600900225, "grad_norm": 0.1350153535604477, "learning_rate": 1.0905510024601423e-05, "loss": 0.0161, "step": 2152 }, { "epoch": 1.6151537884471119, "grad_norm": 0.11918716132640839, "learning_rate": 1.0864724024250106e-05, "loss": 0.0147, "step": 2153 }, { "epoch": 1.6159039759939984, "grad_norm": 0.0758005753159523, "learning_rate": 1.0824005137043375e-05, "loss": 0.0108, "step": 2154 }, { "epoch": 1.6166541635408853, "grad_norm": 0.11765742301940918, "learning_rate": 1.0783353432810106e-05, "loss": 0.0175, "step": 2155 }, { "epoch": 1.6174043510877718, "grad_norm": 0.24080726504325867, "learning_rate": 1.0742768981263984e-05, "loss": 0.0417, "step": 2156 }, { "epoch": 1.6181545386346587, "grad_norm": 0.07509870827198029, "learning_rate": 1.070225185200331e-05, "loss": 0.0099, "step": 2157 }, { "epoch": 1.6189047261815452, "grad_norm": 0.12462197244167328, "learning_rate": 1.0661802114511005e-05, "loss": 0.0222, "step": 2158 }, { "epoch": 1.6196549137284322, "grad_norm": 0.1365789920091629, "learning_rate": 1.062141983815439e-05, "loss": 0.0189, "step": 2159 }, { "epoch": 1.6204051012753187, "grad_norm": 0.1191381961107254, "learning_rate": 1.0581105092185062e-05, "loss": 0.0191, "step": 2160 }, { "epoch": 1.6211552888222056, "grad_norm": 0.07630721479654312, "learning_rate": 1.0540857945738852e-05, "loss": 0.0103, "step": 2161 }, { "epoch": 1.6219054763690923, "grad_norm": 0.08390341699123383, "learning_rate": 1.0500678467835662e-05, "loss": 0.0105, "step": 2162 }, { "epoch": 1.622655663915979, "grad_norm": 0.10651591420173645, "learning_rate": 1.0460566727379335e-05, "loss": 0.0187, "step": 2163 }, { "epoch": 1.6234058514628658, "grad_norm": 0.122169628739357, "learning_rate": 1.0420522793157567e-05, "loss": 0.0166, "step": 2164 }, { "epoch": 1.6241560390097525, "grad_norm": 0.1749759167432785, "learning_rate": 1.038054673384174e-05, "loss": 0.0227, "step": 2165 }, { "epoch": 1.6249062265566392, "grad_norm": 0.10475295782089233, "learning_rate": 1.0340638617986864e-05, "loss": 0.01, "step": 2166 }, { "epoch": 1.625656414103526, "grad_norm": 0.09955000877380371, "learning_rate": 1.030079851403144e-05, "loss": 0.0124, "step": 2167 }, { "epoch": 1.6264066016504126, "grad_norm": 0.15151360630989075, "learning_rate": 1.0261026490297315e-05, "loss": 0.0179, "step": 2168 }, { "epoch": 1.6271567891972993, "grad_norm": 0.11382818967103958, "learning_rate": 1.022132261498961e-05, "loss": 0.0176, "step": 2169 }, { "epoch": 1.627906976744186, "grad_norm": 0.15799665451049805, "learning_rate": 1.0181686956196529e-05, "loss": 0.0214, "step": 2170 }, { "epoch": 1.6286571642910728, "grad_norm": 0.11197953671216965, "learning_rate": 1.0142119581889332e-05, "loss": 0.0144, "step": 2171 }, { "epoch": 1.6294073518379595, "grad_norm": 0.19829680025577545, "learning_rate": 1.0102620559922204e-05, "loss": 0.023, "step": 2172 }, { "epoch": 1.6301575393848462, "grad_norm": 0.11381664127111435, "learning_rate": 1.0063189958032043e-05, "loss": 0.0145, "step": 2173 }, { "epoch": 1.630907726931733, "grad_norm": 0.2269834578037262, "learning_rate": 1.0023827843838457e-05, "loss": 0.0211, "step": 2174 }, { "epoch": 1.6316579144786196, "grad_norm": 0.11999978125095367, "learning_rate": 9.984534284843594e-06, "loss": 0.0166, "step": 2175 }, { "epoch": 1.6324081020255063, "grad_norm": 0.12092548608779907, "learning_rate": 9.945309348432047e-06, "loss": 0.0197, "step": 2176 }, { "epoch": 1.633158289572393, "grad_norm": 0.10755470395088196, "learning_rate": 9.906153101870725e-06, "loss": 0.0127, "step": 2177 }, { "epoch": 1.6339084771192798, "grad_norm": 0.17812775075435638, "learning_rate": 9.867065612308713e-06, "loss": 0.0176, "step": 2178 }, { "epoch": 1.6346586646661665, "grad_norm": 0.08771397173404694, "learning_rate": 9.82804694677722e-06, "loss": 0.0158, "step": 2179 }, { "epoch": 1.6354088522130532, "grad_norm": 0.14670372009277344, "learning_rate": 9.78909717218941e-06, "loss": 0.0181, "step": 2180 }, { "epoch": 1.63615903975994, "grad_norm": 0.0950799211859703, "learning_rate": 9.75021635534033e-06, "loss": 0.0127, "step": 2181 }, { "epoch": 1.6369092273068269, "grad_norm": 0.10451541095972061, "learning_rate": 9.711404562906717e-06, "loss": 0.016, "step": 2182 }, { "epoch": 1.6376594148537134, "grad_norm": 0.21459363400936127, "learning_rate": 9.672661861447002e-06, "loss": 0.029, "step": 2183 }, { "epoch": 1.6384096024006003, "grad_norm": 0.161620631814003, "learning_rate": 9.633988317401087e-06, "loss": 0.019, "step": 2184 }, { "epoch": 1.6391597899474868, "grad_norm": 0.17189188301563263, "learning_rate": 9.595383997090302e-06, "loss": 0.0243, "step": 2185 }, { "epoch": 1.6399099774943737, "grad_norm": 0.08551929891109467, "learning_rate": 9.556848966717247e-06, "loss": 0.0118, "step": 2186 }, { "epoch": 1.6406601650412602, "grad_norm": 0.12630008161067963, "learning_rate": 9.518383292365713e-06, "loss": 0.0155, "step": 2187 }, { "epoch": 1.6414103525881472, "grad_norm": 0.08629219233989716, "learning_rate": 9.479987040000538e-06, "loss": 0.012, "step": 2188 }, { "epoch": 1.6421605401350337, "grad_norm": 0.12059628218412399, "learning_rate": 9.441660275467512e-06, "loss": 0.0151, "step": 2189 }, { "epoch": 1.6429107276819206, "grad_norm": 0.13245142996311188, "learning_rate": 9.403403064493282e-06, "loss": 0.0161, "step": 2190 }, { "epoch": 1.643660915228807, "grad_norm": 0.1327851563692093, "learning_rate": 9.365215472685163e-06, "loss": 0.0209, "step": 2191 }, { "epoch": 1.644411102775694, "grad_norm": 0.14042188227176666, "learning_rate": 9.32709756553114e-06, "loss": 0.0175, "step": 2192 }, { "epoch": 1.6451612903225805, "grad_norm": 0.1419353187084198, "learning_rate": 9.289049408399659e-06, "loss": 0.0141, "step": 2193 }, { "epoch": 1.6459114778694675, "grad_norm": 0.10731997340917587, "learning_rate": 9.251071066539579e-06, "loss": 0.0153, "step": 2194 }, { "epoch": 1.646661665416354, "grad_norm": 0.13512557744979858, "learning_rate": 9.21316260507999e-06, "loss": 0.0164, "step": 2195 }, { "epoch": 1.6474118529632409, "grad_norm": 0.14420804381370544, "learning_rate": 9.175324089030185e-06, "loss": 0.0187, "step": 2196 }, { "epoch": 1.6481620405101274, "grad_norm": 0.1138468086719513, "learning_rate": 9.137555583279495e-06, "loss": 0.0138, "step": 2197 }, { "epoch": 1.6489122280570143, "grad_norm": 0.26051953434944153, "learning_rate": 9.099857152597185e-06, "loss": 0.0229, "step": 2198 }, { "epoch": 1.6496624156039008, "grad_norm": 0.13202644884586334, "learning_rate": 9.062228861632354e-06, "loss": 0.0155, "step": 2199 }, { "epoch": 1.6504126031507877, "grad_norm": 0.11023962497711182, "learning_rate": 9.024670774913812e-06, "loss": 0.0172, "step": 2200 }, { "epoch": 1.6504126031507877, "eval_loss": 0.03166023641824722, "eval_runtime": 5.1277, "eval_samples_per_second": 10.531, "eval_steps_per_second": 2.73, "step": 2200 }, { "epoch": 1.6511627906976745, "grad_norm": 0.1747378408908844, "learning_rate": 8.987182956849983e-06, "loss": 0.0208, "step": 2201 }, { "epoch": 1.6519129782445612, "grad_norm": 0.104463130235672, "learning_rate": 8.949765471728789e-06, "loss": 0.0149, "step": 2202 }, { "epoch": 1.652663165791448, "grad_norm": 0.25243860483169556, "learning_rate": 8.912418383717513e-06, "loss": 0.0316, "step": 2203 }, { "epoch": 1.6534133533383346, "grad_norm": 0.1255129724740982, "learning_rate": 8.875141756862749e-06, "loss": 0.0157, "step": 2204 }, { "epoch": 1.6541635408852213, "grad_norm": 0.13603879511356354, "learning_rate": 8.837935655090241e-06, "loss": 0.0159, "step": 2205 }, { "epoch": 1.654913728432108, "grad_norm": 0.1283695250749588, "learning_rate": 8.800800142204779e-06, "loss": 0.0151, "step": 2206 }, { "epoch": 1.6556639159789948, "grad_norm": 0.0896884873509407, "learning_rate": 8.763735281890133e-06, "loss": 0.01, "step": 2207 }, { "epoch": 1.6564141035258815, "grad_norm": 0.08420361578464508, "learning_rate": 8.726741137708866e-06, "loss": 0.0089, "step": 2208 }, { "epoch": 1.6571642910727682, "grad_norm": 0.09029056876897812, "learning_rate": 8.689817773102293e-06, "loss": 0.0095, "step": 2209 }, { "epoch": 1.657914478619655, "grad_norm": 0.14621016383171082, "learning_rate": 8.65296525139036e-06, "loss": 0.0178, "step": 2210 }, { "epoch": 1.6586646661665416, "grad_norm": 0.19486641883850098, "learning_rate": 8.616183635771525e-06, "loss": 0.0121, "step": 2211 }, { "epoch": 1.6594148537134283, "grad_norm": 0.12832102179527283, "learning_rate": 8.579472989322602e-06, "loss": 0.0133, "step": 2212 }, { "epoch": 1.660165041260315, "grad_norm": 0.06961192190647125, "learning_rate": 8.542833374998744e-06, "loss": 0.0086, "step": 2213 }, { "epoch": 1.6609152288072018, "grad_norm": 0.23715099692344666, "learning_rate": 8.5062648556333e-06, "loss": 0.0268, "step": 2214 }, { "epoch": 1.6616654163540885, "grad_norm": 0.09155306220054626, "learning_rate": 8.469767493937681e-06, "loss": 0.0163, "step": 2215 }, { "epoch": 1.6624156039009752, "grad_norm": 0.17014844715595245, "learning_rate": 8.43334135250125e-06, "loss": 0.0207, "step": 2216 }, { "epoch": 1.663165791447862, "grad_norm": 0.14714054763317108, "learning_rate": 8.39698649379126e-06, "loss": 0.0197, "step": 2217 }, { "epoch": 1.6639159789947486, "grad_norm": 0.18474572896957397, "learning_rate": 8.360702980152713e-06, "loss": 0.0242, "step": 2218 }, { "epoch": 1.6646661665416356, "grad_norm": 0.20110833644866943, "learning_rate": 8.32449087380826e-06, "loss": 0.0233, "step": 2219 }, { "epoch": 1.665416354088522, "grad_norm": 0.09897205233573914, "learning_rate": 8.288350236858117e-06, "loss": 0.0157, "step": 2220 }, { "epoch": 1.666166541635409, "grad_norm": 0.1320415437221527, "learning_rate": 8.252281131279887e-06, "loss": 0.0159, "step": 2221 }, { "epoch": 1.6669167291822955, "grad_norm": 0.13419120013713837, "learning_rate": 8.21628361892855e-06, "loss": 0.0202, "step": 2222 }, { "epoch": 1.6676669167291824, "grad_norm": 0.11188473552465439, "learning_rate": 8.180357761536296e-06, "loss": 0.0142, "step": 2223 }, { "epoch": 1.668417104276069, "grad_norm": 0.15787629783153534, "learning_rate": 8.14450362071244e-06, "loss": 0.0256, "step": 2224 }, { "epoch": 1.6691672918229559, "grad_norm": 0.14580890536308289, "learning_rate": 8.10872125794328e-06, "loss": 0.0241, "step": 2225 }, { "epoch": 1.6699174793698424, "grad_norm": 0.08850288391113281, "learning_rate": 8.073010734592057e-06, "loss": 0.0089, "step": 2226 }, { "epoch": 1.6706676669167293, "grad_norm": 0.14631128311157227, "learning_rate": 8.037372111898789e-06, "loss": 0.0124, "step": 2227 }, { "epoch": 1.6714178544636158, "grad_norm": 0.14625024795532227, "learning_rate": 8.001805450980249e-06, "loss": 0.0152, "step": 2228 }, { "epoch": 1.6721680420105027, "grad_norm": 0.0920761376619339, "learning_rate": 7.966310812829709e-06, "loss": 0.0133, "step": 2229 }, { "epoch": 1.6729182295573892, "grad_norm": 0.2126186639070511, "learning_rate": 7.930888258316998e-06, "loss": 0.0243, "step": 2230 }, { "epoch": 1.6736684171042762, "grad_norm": 0.16171617805957794, "learning_rate": 7.89553784818831e-06, "loss": 0.028, "step": 2231 }, { "epoch": 1.6744186046511627, "grad_norm": 0.09965240210294724, "learning_rate": 7.860259643066126e-06, "loss": 0.0111, "step": 2232 }, { "epoch": 1.6751687921980496, "grad_norm": 0.13482119143009186, "learning_rate": 7.82505370344907e-06, "loss": 0.0223, "step": 2233 }, { "epoch": 1.675918979744936, "grad_norm": 0.07899262011051178, "learning_rate": 7.789920089711871e-06, "loss": 0.0096, "step": 2234 }, { "epoch": 1.676669167291823, "grad_norm": 0.06677675992250443, "learning_rate": 7.754858862105224e-06, "loss": 0.0071, "step": 2235 }, { "epoch": 1.6774193548387095, "grad_norm": 0.1100536659359932, "learning_rate": 7.71987008075568e-06, "loss": 0.0104, "step": 2236 }, { "epoch": 1.6781695423855965, "grad_norm": 0.12148049473762512, "learning_rate": 7.684953805665562e-06, "loss": 0.0115, "step": 2237 }, { "epoch": 1.6789197299324832, "grad_norm": 0.08337541669607162, "learning_rate": 7.65011009671282e-06, "loss": 0.0124, "step": 2238 }, { "epoch": 1.67966991747937, "grad_norm": 0.1657513678073883, "learning_rate": 7.615339013651001e-06, "loss": 0.023, "step": 2239 }, { "epoch": 1.6804201050262566, "grad_norm": 0.07674112915992737, "learning_rate": 7.580640616109081e-06, "loss": 0.0087, "step": 2240 }, { "epoch": 1.6811702925731433, "grad_norm": 0.17678917944431305, "learning_rate": 7.546014963591397e-06, "loss": 0.0206, "step": 2241 }, { "epoch": 1.68192048012003, "grad_norm": 0.14089573919773102, "learning_rate": 7.511462115477536e-06, "loss": 0.0194, "step": 2242 }, { "epoch": 1.6826706676669168, "grad_norm": 0.17028793692588806, "learning_rate": 7.476982131022231e-06, "loss": 0.0174, "step": 2243 }, { "epoch": 1.6834208552138035, "grad_norm": 0.10286597907543182, "learning_rate": 7.442575069355256e-06, "loss": 0.0115, "step": 2244 }, { "epoch": 1.6841710427606902, "grad_norm": 0.19301265478134155, "learning_rate": 7.408240989481347e-06, "loss": 0.0204, "step": 2245 }, { "epoch": 1.684921230307577, "grad_norm": 0.15151479840278625, "learning_rate": 7.373979950280046e-06, "loss": 0.0174, "step": 2246 }, { "epoch": 1.6856714178544636, "grad_norm": 0.0994754210114479, "learning_rate": 7.33979201050568e-06, "loss": 0.0105, "step": 2247 }, { "epoch": 1.6864216054013503, "grad_norm": 0.19209209084510803, "learning_rate": 7.3056772287871886e-06, "loss": 0.0195, "step": 2248 }, { "epoch": 1.687171792948237, "grad_norm": 0.15901431441307068, "learning_rate": 7.2716356636280684e-06, "loss": 0.0182, "step": 2249 }, { "epoch": 1.6879219804951238, "grad_norm": 0.10334349423646927, "learning_rate": 7.237667373406259e-06, "loss": 0.0116, "step": 2250 }, { "epoch": 1.6886721680420105, "grad_norm": 0.10797673463821411, "learning_rate": 7.203772416374016e-06, "loss": 0.0178, "step": 2251 }, { "epoch": 1.6894223555888972, "grad_norm": 0.13973666727542877, "learning_rate": 7.1699508506578636e-06, "loss": 0.0171, "step": 2252 }, { "epoch": 1.690172543135784, "grad_norm": 0.13739119470119476, "learning_rate": 7.136202734258457e-06, "loss": 0.018, "step": 2253 }, { "epoch": 1.6909227306826706, "grad_norm": 0.19144482910633087, "learning_rate": 7.1025281250505006e-06, "loss": 0.024, "step": 2254 }, { "epoch": 1.6916729182295573, "grad_norm": 0.12330687791109085, "learning_rate": 7.0689270807826e-06, "loss": 0.0158, "step": 2255 }, { "epoch": 1.692423105776444, "grad_norm": 0.10102571547031403, "learning_rate": 7.035399659077268e-06, "loss": 0.0167, "step": 2256 }, { "epoch": 1.6931732933233308, "grad_norm": 0.15109023451805115, "learning_rate": 7.00194591743073e-06, "loss": 0.0246, "step": 2257 }, { "epoch": 1.6939234808702177, "grad_norm": 0.0868748277425766, "learning_rate": 6.96856591321286e-06, "loss": 0.0093, "step": 2258 }, { "epoch": 1.6946736684171042, "grad_norm": 0.11420181393623352, "learning_rate": 6.9352597036670575e-06, "loss": 0.0186, "step": 2259 }, { "epoch": 1.6954238559639911, "grad_norm": 0.10863950848579407, "learning_rate": 6.902027345910211e-06, "loss": 0.0136, "step": 2260 }, { "epoch": 1.6961740435108776, "grad_norm": 0.13581328094005585, "learning_rate": 6.868868896932534e-06, "loss": 0.0224, "step": 2261 }, { "epoch": 1.6969242310577646, "grad_norm": 0.13632053136825562, "learning_rate": 6.835784413597512e-06, "loss": 0.0204, "step": 2262 }, { "epoch": 1.697674418604651, "grad_norm": 0.1366109400987625, "learning_rate": 6.802773952641761e-06, "loss": 0.0173, "step": 2263 }, { "epoch": 1.698424606151538, "grad_norm": 0.08851693570613861, "learning_rate": 6.769837570674975e-06, "loss": 0.013, "step": 2264 }, { "epoch": 1.6991747936984245, "grad_norm": 0.11936666071414948, "learning_rate": 6.7369753241798114e-06, "loss": 0.0136, "step": 2265 }, { "epoch": 1.6999249812453114, "grad_norm": 0.0834725946187973, "learning_rate": 6.70418726951178e-06, "loss": 0.0108, "step": 2266 }, { "epoch": 1.700675168792198, "grad_norm": 0.12028200924396515, "learning_rate": 6.671473462899181e-06, "loss": 0.014, "step": 2267 }, { "epoch": 1.7014253563390849, "grad_norm": 0.20901398360729218, "learning_rate": 6.638833960442948e-06, "loss": 0.0205, "step": 2268 }, { "epoch": 1.7021755438859714, "grad_norm": 0.09815794229507446, "learning_rate": 6.606268818116618e-06, "loss": 0.0138, "step": 2269 }, { "epoch": 1.7029257314328583, "grad_norm": 0.17781807482242584, "learning_rate": 6.573778091766219e-06, "loss": 0.0199, "step": 2270 }, { "epoch": 1.7036759189797448, "grad_norm": 0.10289303958415985, "learning_rate": 6.541361837110149e-06, "loss": 0.012, "step": 2271 }, { "epoch": 1.7044261065266317, "grad_norm": 0.08833058178424835, "learning_rate": 6.509020109739078e-06, "loss": 0.0134, "step": 2272 }, { "epoch": 1.7051762940735182, "grad_norm": 0.12388486415147781, "learning_rate": 6.476752965115884e-06, "loss": 0.0121, "step": 2273 }, { "epoch": 1.7059264816204052, "grad_norm": 0.1161249652504921, "learning_rate": 6.444560458575544e-06, "loss": 0.0154, "step": 2274 }, { "epoch": 1.7066766691672917, "grad_norm": 0.11769194155931473, "learning_rate": 6.412442645325057e-06, "loss": 0.0133, "step": 2275 }, { "epoch": 1.7074268567141786, "grad_norm": 0.08280669152736664, "learning_rate": 6.38039958044328e-06, "loss": 0.0098, "step": 2276 }, { "epoch": 1.7081770442610653, "grad_norm": 0.10545045137405396, "learning_rate": 6.3484313188809265e-06, "loss": 0.0156, "step": 2277 }, { "epoch": 1.708927231807952, "grad_norm": 0.20702876150608063, "learning_rate": 6.316537915460418e-06, "loss": 0.0305, "step": 2278 }, { "epoch": 1.7096774193548387, "grad_norm": 0.09845060855150223, "learning_rate": 6.284719424875796e-06, "loss": 0.0136, "step": 2279 }, { "epoch": 1.7104276069017255, "grad_norm": 0.10509305447340012, "learning_rate": 6.252975901692659e-06, "loss": 0.0132, "step": 2280 }, { "epoch": 1.7111777944486122, "grad_norm": 0.08033392578363419, "learning_rate": 6.221307400347992e-06, "loss": 0.0082, "step": 2281 }, { "epoch": 1.711927981995499, "grad_norm": 0.11224459111690521, "learning_rate": 6.1897139751501796e-06, "loss": 0.0183, "step": 2282 }, { "epoch": 1.7126781695423856, "grad_norm": 0.09149748831987381, "learning_rate": 6.158195680278816e-06, "loss": 0.0117, "step": 2283 }, { "epoch": 1.7134283570892723, "grad_norm": 0.14243397116661072, "learning_rate": 6.126752569784694e-06, "loss": 0.0186, "step": 2284 }, { "epoch": 1.714178544636159, "grad_norm": 0.11700420081615448, "learning_rate": 6.095384697589635e-06, "loss": 0.0132, "step": 2285 }, { "epoch": 1.7149287321830458, "grad_norm": 0.10646030306816101, "learning_rate": 6.064092117486464e-06, "loss": 0.0152, "step": 2286 }, { "epoch": 1.7156789197299325, "grad_norm": 0.14706531167030334, "learning_rate": 6.032874883138867e-06, "loss": 0.0159, "step": 2287 }, { "epoch": 1.7164291072768192, "grad_norm": 0.12105236947536469, "learning_rate": 6.001733048081337e-06, "loss": 0.0196, "step": 2288 }, { "epoch": 1.717179294823706, "grad_norm": 0.17891399562358856, "learning_rate": 5.970666665719033e-06, "loss": 0.0197, "step": 2289 }, { "epoch": 1.7179294823705926, "grad_norm": 0.16054308414459229, "learning_rate": 5.939675789327759e-06, "loss": 0.0205, "step": 2290 }, { "epoch": 1.7186796699174793, "grad_norm": 0.1569271832704544, "learning_rate": 5.908760472053809e-06, "loss": 0.0238, "step": 2291 }, { "epoch": 1.719429857464366, "grad_norm": 0.13982658088207245, "learning_rate": 5.877920766913919e-06, "loss": 0.0146, "step": 2292 }, { "epoch": 1.7201800450112528, "grad_norm": 0.1269511878490448, "learning_rate": 5.847156726795133e-06, "loss": 0.0174, "step": 2293 }, { "epoch": 1.7209302325581395, "grad_norm": 0.12216448783874512, "learning_rate": 5.816468404454755e-06, "loss": 0.0225, "step": 2294 }, { "epoch": 1.7216804201050264, "grad_norm": 0.10995414108037949, "learning_rate": 5.7858558525202336e-06, "loss": 0.011, "step": 2295 }, { "epoch": 1.722430607651913, "grad_norm": 0.10215462744235992, "learning_rate": 5.755319123489083e-06, "loss": 0.0138, "step": 2296 }, { "epoch": 1.7231807951987999, "grad_norm": 0.07460669428110123, "learning_rate": 5.724858269728789e-06, "loss": 0.0107, "step": 2297 }, { "epoch": 1.7239309827456863, "grad_norm": 0.13698826730251312, "learning_rate": 5.694473343476714e-06, "loss": 0.0137, "step": 2298 }, { "epoch": 1.7246811702925733, "grad_norm": 0.11854610592126846, "learning_rate": 5.664164396840016e-06, "loss": 0.0171, "step": 2299 }, { "epoch": 1.7254313578394598, "grad_norm": 0.06490737944841385, "learning_rate": 5.633931481795552e-06, "loss": 0.0068, "step": 2300 }, { "epoch": 1.7261815453863467, "grad_norm": 0.08114010840654373, "learning_rate": 5.603774650189808e-06, "loss": 0.0108, "step": 2301 }, { "epoch": 1.7269317329332332, "grad_norm": 0.13306188583374023, "learning_rate": 5.573693953738751e-06, "loss": 0.0199, "step": 2302 }, { "epoch": 1.7276819204801201, "grad_norm": 0.09569033980369568, "learning_rate": 5.543689444027839e-06, "loss": 0.0108, "step": 2303 }, { "epoch": 1.7284321080270066, "grad_norm": 0.08656707406044006, "learning_rate": 5.513761172511833e-06, "loss": 0.0111, "step": 2304 }, { "epoch": 1.7291822955738936, "grad_norm": 0.14147700369358063, "learning_rate": 5.483909190514797e-06, "loss": 0.0163, "step": 2305 }, { "epoch": 1.72993248312078, "grad_norm": 0.0972064808011055, "learning_rate": 5.4541335492299115e-06, "loss": 0.0119, "step": 2306 }, { "epoch": 1.730682670667667, "grad_norm": 0.14779311418533325, "learning_rate": 5.424434299719483e-06, "loss": 0.0194, "step": 2307 }, { "epoch": 1.7314328582145535, "grad_norm": 0.09764081239700317, "learning_rate": 5.394811492914803e-06, "loss": 0.0146, "step": 2308 }, { "epoch": 1.7321830457614404, "grad_norm": 0.08887841552495956, "learning_rate": 5.365265179616063e-06, "loss": 0.0102, "step": 2309 }, { "epoch": 1.732933233308327, "grad_norm": 0.141043022274971, "learning_rate": 5.3357954104922895e-06, "loss": 0.0152, "step": 2310 }, { "epoch": 1.7336834208552139, "grad_norm": 0.16252657771110535, "learning_rate": 5.306402236081209e-06, "loss": 0.0349, "step": 2311 }, { "epoch": 1.7344336084021004, "grad_norm": 0.09353138506412506, "learning_rate": 5.277085706789248e-06, "loss": 0.0127, "step": 2312 }, { "epoch": 1.7351837959489873, "grad_norm": 0.09399595111608505, "learning_rate": 5.247845872891371e-06, "loss": 0.0103, "step": 2313 }, { "epoch": 1.735933983495874, "grad_norm": 0.10673192143440247, "learning_rate": 5.218682784530993e-06, "loss": 0.0151, "step": 2314 }, { "epoch": 1.7366841710427607, "grad_norm": 0.2325008064508438, "learning_rate": 5.1895964917199445e-06, "loss": 0.0121, "step": 2315 }, { "epoch": 1.7374343585896475, "grad_norm": 0.1543661206960678, "learning_rate": 5.160587044338355e-06, "loss": 0.0186, "step": 2316 }, { "epoch": 1.7381845461365342, "grad_norm": 0.11611554026603699, "learning_rate": 5.131654492134574e-06, "loss": 0.0134, "step": 2317 }, { "epoch": 1.7389347336834209, "grad_norm": 0.09432833641767502, "learning_rate": 5.102798884725091e-06, "loss": 0.0108, "step": 2318 }, { "epoch": 1.7396849212303076, "grad_norm": 0.1359938085079193, "learning_rate": 5.074020271594404e-06, "loss": 0.0198, "step": 2319 }, { "epoch": 1.7404351087771943, "grad_norm": 0.1561252623796463, "learning_rate": 5.045318702095014e-06, "loss": 0.0232, "step": 2320 }, { "epoch": 1.741185296324081, "grad_norm": 0.13938003778457642, "learning_rate": 5.016694225447288e-06, "loss": 0.0197, "step": 2321 }, { "epoch": 1.7419354838709677, "grad_norm": 0.13493551313877106, "learning_rate": 4.988146890739381e-06, "loss": 0.0214, "step": 2322 }, { "epoch": 1.7426856714178545, "grad_norm": 0.09438906610012054, "learning_rate": 4.959676746927172e-06, "loss": 0.0125, "step": 2323 }, { "epoch": 1.7434358589647412, "grad_norm": 0.26109787821769714, "learning_rate": 4.931283842834139e-06, "loss": 0.0148, "step": 2324 }, { "epoch": 1.744186046511628, "grad_norm": 0.12471003085374832, "learning_rate": 4.902968227151311e-06, "loss": 0.0185, "step": 2325 }, { "epoch": 1.7449362340585146, "grad_norm": 0.08934643864631653, "learning_rate": 4.874729948437218e-06, "loss": 0.0118, "step": 2326 }, { "epoch": 1.7456864216054013, "grad_norm": 0.10696186125278473, "learning_rate": 4.846569055117684e-06, "loss": 0.0135, "step": 2327 }, { "epoch": 1.746436609152288, "grad_norm": 0.14127911627292633, "learning_rate": 4.818485595485889e-06, "loss": 0.0198, "step": 2328 }, { "epoch": 1.7471867966991748, "grad_norm": 0.12543734908103943, "learning_rate": 4.790479617702198e-06, "loss": 0.0146, "step": 2329 }, { "epoch": 1.7479369842460615, "grad_norm": 0.1253226101398468, "learning_rate": 4.762551169794105e-06, "loss": 0.0136, "step": 2330 }, { "epoch": 1.7486871717929482, "grad_norm": 0.15639212727546692, "learning_rate": 4.734700299656158e-06, "loss": 0.0154, "step": 2331 }, { "epoch": 1.749437359339835, "grad_norm": 0.12211638689041138, "learning_rate": 4.706927055049837e-06, "loss": 0.0151, "step": 2332 }, { "epoch": 1.7501875468867216, "grad_norm": 0.06649266928434372, "learning_rate": 4.6792314836035304e-06, "loss": 0.0099, "step": 2333 }, { "epoch": 1.7509377344336086, "grad_norm": 0.07582502067089081, "learning_rate": 4.651613632812413e-06, "loss": 0.0105, "step": 2334 }, { "epoch": 1.751687921980495, "grad_norm": 0.1324513703584671, "learning_rate": 4.624073550038399e-06, "loss": 0.0168, "step": 2335 }, { "epoch": 1.752438109527382, "grad_norm": 0.14669281244277954, "learning_rate": 4.596611282509989e-06, "loss": 0.0198, "step": 2336 }, { "epoch": 1.7531882970742685, "grad_norm": 0.0815398246049881, "learning_rate": 4.56922687732228e-06, "loss": 0.0161, "step": 2337 }, { "epoch": 1.7539384846211554, "grad_norm": 0.10307783633470535, "learning_rate": 4.5419203814368376e-06, "loss": 0.0128, "step": 2338 }, { "epoch": 1.754688672168042, "grad_norm": 0.07888856530189514, "learning_rate": 4.514691841681601e-06, "loss": 0.0143, "step": 2339 }, { "epoch": 1.7554388597149289, "grad_norm": 0.14705821871757507, "learning_rate": 4.487541304750848e-06, "loss": 0.0205, "step": 2340 }, { "epoch": 1.7561890472618154, "grad_norm": 0.11970402300357819, "learning_rate": 4.4604688172050605e-06, "loss": 0.0152, "step": 2341 }, { "epoch": 1.7569392348087023, "grad_norm": 0.1779605597257614, "learning_rate": 4.433474425470902e-06, "loss": 0.0208, "step": 2342 }, { "epoch": 1.7576894223555888, "grad_norm": 0.09036190807819366, "learning_rate": 4.406558175841097e-06, "loss": 0.0115, "step": 2343 }, { "epoch": 1.7584396099024757, "grad_norm": 0.10841822624206543, "learning_rate": 4.379720114474351e-06, "loss": 0.0153, "step": 2344 }, { "epoch": 1.7591897974493622, "grad_norm": 0.1275036334991455, "learning_rate": 4.352960287395303e-06, "loss": 0.0179, "step": 2345 }, { "epoch": 1.7599399849962492, "grad_norm": 0.08594145625829697, "learning_rate": 4.3262787404944165e-06, "loss": 0.011, "step": 2346 }, { "epoch": 1.7606901725431356, "grad_norm": 0.05019707977771759, "learning_rate": 4.299675519527929e-06, "loss": 0.0056, "step": 2347 }, { "epoch": 1.7614403600900226, "grad_norm": 0.1445784568786621, "learning_rate": 4.273150670117743e-06, "loss": 0.0165, "step": 2348 }, { "epoch": 1.762190547636909, "grad_norm": 0.15771323442459106, "learning_rate": 4.246704237751342e-06, "loss": 0.0145, "step": 2349 }, { "epoch": 1.762940735183796, "grad_norm": 0.1449221819639206, "learning_rate": 4.220336267781777e-06, "loss": 0.0235, "step": 2350 }, { "epoch": 1.7636909227306825, "grad_norm": 0.09601754695177078, "learning_rate": 4.19404680542751e-06, "loss": 0.0154, "step": 2351 }, { "epoch": 1.7644411102775694, "grad_norm": 0.0986536517739296, "learning_rate": 4.167835895772382e-06, "loss": 0.0111, "step": 2352 }, { "epoch": 1.7651912978244562, "grad_norm": 0.104599729180336, "learning_rate": 4.141703583765522e-06, "loss": 0.0103, "step": 2353 }, { "epoch": 1.7659414853713429, "grad_norm": 0.08715003728866577, "learning_rate": 4.11564991422127e-06, "loss": 0.0106, "step": 2354 }, { "epoch": 1.7666916729182296, "grad_norm": 0.1697830855846405, "learning_rate": 4.0896749318191095e-06, "loss": 0.0191, "step": 2355 }, { "epoch": 1.7674418604651163, "grad_norm": 0.1439453512430191, "learning_rate": 4.06377868110358e-06, "loss": 0.0157, "step": 2356 }, { "epoch": 1.768192048012003, "grad_norm": 0.08522556722164154, "learning_rate": 4.037961206484186e-06, "loss": 0.0118, "step": 2357 }, { "epoch": 1.7689422355588897, "grad_norm": 0.1324508637189865, "learning_rate": 4.0122225522353675e-06, "loss": 0.0156, "step": 2358 }, { "epoch": 1.7696924231057765, "grad_norm": 0.12487998604774475, "learning_rate": 3.986562762496376e-06, "loss": 0.0177, "step": 2359 }, { "epoch": 1.7704426106526632, "grad_norm": 0.22810116410255432, "learning_rate": 3.9609818812712255e-06, "loss": 0.0299, "step": 2360 }, { "epoch": 1.77119279819955, "grad_norm": 0.07804132252931595, "learning_rate": 3.935479952428611e-06, "loss": 0.0093, "step": 2361 }, { "epoch": 1.7719429857464366, "grad_norm": 0.18137666583061218, "learning_rate": 3.91005701970183e-06, "loss": 0.0205, "step": 2362 }, { "epoch": 1.7726931732933233, "grad_norm": 0.14881862699985504, "learning_rate": 3.8847131266886935e-06, "loss": 0.0154, "step": 2363 }, { "epoch": 1.77344336084021, "grad_norm": 0.12017820030450821, "learning_rate": 3.859448316851505e-06, "loss": 0.0178, "step": 2364 }, { "epoch": 1.7741935483870968, "grad_norm": 0.13187788426876068, "learning_rate": 3.834262633516916e-06, "loss": 0.0136, "step": 2365 }, { "epoch": 1.7749437359339835, "grad_norm": 0.10610248148441315, "learning_rate": 3.8091561198758897e-06, "loss": 0.0148, "step": 2366 }, { "epoch": 1.7756939234808702, "grad_norm": 0.11325306445360184, "learning_rate": 3.784128818983618e-06, "loss": 0.0167, "step": 2367 }, { "epoch": 1.776444111027757, "grad_norm": 0.08559075742959976, "learning_rate": 3.7591807737594743e-06, "loss": 0.0129, "step": 2368 }, { "epoch": 1.7771942985746436, "grad_norm": 0.10802838951349258, "learning_rate": 3.734312026986897e-06, "loss": 0.0129, "step": 2369 }, { "epoch": 1.7779444861215303, "grad_norm": 0.13777166604995728, "learning_rate": 3.7095226213133272e-06, "loss": 0.0232, "step": 2370 }, { "epoch": 1.7786946736684173, "grad_norm": 0.09481120854616165, "learning_rate": 3.6848125992501592e-06, "loss": 0.0155, "step": 2371 }, { "epoch": 1.7794448612153038, "grad_norm": 0.13231073319911957, "learning_rate": 3.6601820031726517e-06, "loss": 0.0143, "step": 2372 }, { "epoch": 1.7801950487621907, "grad_norm": 0.11420634388923645, "learning_rate": 3.6356308753198454e-06, "loss": 0.0179, "step": 2373 }, { "epoch": 1.7809452363090772, "grad_norm": 0.07495707273483276, "learning_rate": 3.6111592577945217e-06, "loss": 0.0083, "step": 2374 }, { "epoch": 1.7816954238559641, "grad_norm": 0.09842219948768616, "learning_rate": 3.586767192563073e-06, "loss": 0.0151, "step": 2375 }, { "epoch": 1.7824456114028506, "grad_norm": 0.12309426814317703, "learning_rate": 3.562454721455505e-06, "loss": 0.0169, "step": 2376 }, { "epoch": 1.7831957989497376, "grad_norm": 0.0984698012471199, "learning_rate": 3.538221886165299e-06, "loss": 0.0131, "step": 2377 }, { "epoch": 1.783945986496624, "grad_norm": 0.12303908169269562, "learning_rate": 3.514068728249398e-06, "loss": 0.013, "step": 2378 }, { "epoch": 1.784696174043511, "grad_norm": 0.17969577014446259, "learning_rate": 3.489995289128073e-06, "loss": 0.0373, "step": 2379 }, { "epoch": 1.7854463615903975, "grad_norm": 0.10440315306186676, "learning_rate": 3.4660016100849126e-06, "loss": 0.015, "step": 2380 }, { "epoch": 1.7861965491372844, "grad_norm": 0.09079710394144058, "learning_rate": 3.442087732266697e-06, "loss": 0.0143, "step": 2381 }, { "epoch": 1.786946736684171, "grad_norm": 0.15569381415843964, "learning_rate": 3.418253696683399e-06, "loss": 0.0183, "step": 2382 }, { "epoch": 1.7876969242310579, "grad_norm": 0.11904071271419525, "learning_rate": 3.3944995442080185e-06, "loss": 0.0165, "step": 2383 }, { "epoch": 1.7884471117779444, "grad_norm": 0.14972615242004395, "learning_rate": 3.3708253155766033e-06, "loss": 0.0232, "step": 2384 }, { "epoch": 1.7891972993248313, "grad_norm": 0.0957341119647026, "learning_rate": 3.347231051388117e-06, "loss": 0.0117, "step": 2385 }, { "epoch": 1.7899474868717178, "grad_norm": 0.08932828903198242, "learning_rate": 3.323716792104403e-06, "loss": 0.0091, "step": 2386 }, { "epoch": 1.7906976744186047, "grad_norm": 0.11236236989498138, "learning_rate": 3.3002825780500957e-06, "loss": 0.018, "step": 2387 }, { "epoch": 1.7914478619654912, "grad_norm": 0.10578911006450653, "learning_rate": 3.276928449412564e-06, "loss": 0.0133, "step": 2388 }, { "epoch": 1.7921980495123782, "grad_norm": 0.11684674769639969, "learning_rate": 3.253654446241844e-06, "loss": 0.0164, "step": 2389 }, { "epoch": 1.7929482370592649, "grad_norm": 0.0889001339673996, "learning_rate": 3.2304606084505585e-06, "loss": 0.013, "step": 2390 }, { "epoch": 1.7936984246061516, "grad_norm": 0.1431235671043396, "learning_rate": 3.2073469758138577e-06, "loss": 0.0248, "step": 2391 }, { "epoch": 1.7944486121530383, "grad_norm": 0.10257580131292343, "learning_rate": 3.18431358796934e-06, "loss": 0.0136, "step": 2392 }, { "epoch": 1.795198799699925, "grad_norm": 0.08846593648195267, "learning_rate": 3.161360484416992e-06, "loss": 0.0119, "step": 2393 }, { "epoch": 1.7959489872468117, "grad_norm": 0.07945258170366287, "learning_rate": 3.1384877045191384e-06, "loss": 0.0102, "step": 2394 }, { "epoch": 1.7966991747936985, "grad_norm": 0.1506921648979187, "learning_rate": 3.1156952875003365e-06, "loss": 0.0197, "step": 2395 }, { "epoch": 1.7974493623405852, "grad_norm": 0.0959123820066452, "learning_rate": 3.0929832724473416e-06, "loss": 0.0111, "step": 2396 }, { "epoch": 1.7981995498874719, "grad_norm": 0.12383115291595459, "learning_rate": 3.0703516983090207e-06, "loss": 0.0123, "step": 2397 }, { "epoch": 1.7989497374343586, "grad_norm": 0.20673918724060059, "learning_rate": 3.0478006038962947e-06, "loss": 0.0221, "step": 2398 }, { "epoch": 1.7996999249812453, "grad_norm": 0.07872434705495834, "learning_rate": 3.0253300278820783e-06, "loss": 0.0088, "step": 2399 }, { "epoch": 1.800450112528132, "grad_norm": 0.10210002958774567, "learning_rate": 3.002940008801186e-06, "loss": 0.0133, "step": 2400 }, { "epoch": 1.800450112528132, "eval_loss": 0.031127866357564926, "eval_runtime": 5.1038, "eval_samples_per_second": 10.58, "eval_steps_per_second": 2.743, "step": 2400 }, { "epoch": 1.8012003000750187, "grad_norm": 0.13143689930438995, "learning_rate": 2.9806305850502923e-06, "loss": 0.0172, "step": 2401 }, { "epoch": 1.8019504876219055, "grad_norm": 0.1445961445569992, "learning_rate": 2.9584017948878717e-06, "loss": 0.0238, "step": 2402 }, { "epoch": 1.8027006751687922, "grad_norm": 0.15010763704776764, "learning_rate": 2.9362536764341085e-06, "loss": 0.0183, "step": 2403 }, { "epoch": 1.803450862715679, "grad_norm": 0.20283788442611694, "learning_rate": 2.9141862676708486e-06, "loss": 0.0397, "step": 2404 }, { "epoch": 1.8042010502625656, "grad_norm": 0.16290506720542908, "learning_rate": 2.8921996064415147e-06, "loss": 0.0161, "step": 2405 }, { "epoch": 1.8049512378094523, "grad_norm": 0.12406005710363388, "learning_rate": 2.870293730451068e-06, "loss": 0.0143, "step": 2406 }, { "epoch": 1.805701425356339, "grad_norm": 0.10236632823944092, "learning_rate": 2.8484686772659308e-06, "loss": 0.0186, "step": 2407 }, { "epoch": 1.8064516129032258, "grad_norm": 0.1057526171207428, "learning_rate": 2.826724484313925e-06, "loss": 0.0116, "step": 2408 }, { "epoch": 1.8072018004501125, "grad_norm": 0.1698257327079773, "learning_rate": 2.8050611888841947e-06, "loss": 0.0229, "step": 2409 }, { "epoch": 1.8079519879969994, "grad_norm": 0.08635108917951584, "learning_rate": 2.7834788281271616e-06, "loss": 0.0129, "step": 2410 }, { "epoch": 1.808702175543886, "grad_norm": 0.11237742751836777, "learning_rate": 2.7619774390544473e-06, "loss": 0.0132, "step": 2411 }, { "epoch": 1.8094523630907728, "grad_norm": 0.0990079864859581, "learning_rate": 2.740557058538823e-06, "loss": 0.0139, "step": 2412 }, { "epoch": 1.8102025506376593, "grad_norm": 0.11661037802696228, "learning_rate": 2.7192177233141215e-06, "loss": 0.015, "step": 2413 }, { "epoch": 1.8109527381845463, "grad_norm": 0.16443532705307007, "learning_rate": 2.697959469975203e-06, "loss": 0.0227, "step": 2414 }, { "epoch": 1.8117029257314328, "grad_norm": 0.12524110078811646, "learning_rate": 2.6767823349778843e-06, "loss": 0.0145, "step": 2415 }, { "epoch": 1.8124531132783197, "grad_norm": 0.09298525005578995, "learning_rate": 2.65568635463887e-06, "loss": 0.0163, "step": 2416 }, { "epoch": 1.8132033008252062, "grad_norm": 0.10674528032541275, "learning_rate": 2.634671565135677e-06, "loss": 0.0174, "step": 2417 }, { "epoch": 1.8139534883720931, "grad_norm": 0.15459661185741425, "learning_rate": 2.613738002506605e-06, "loss": 0.0147, "step": 2418 }, { "epoch": 1.8147036759189796, "grad_norm": 0.11485212296247482, "learning_rate": 2.592885702650655e-06, "loss": 0.0175, "step": 2419 }, { "epoch": 1.8154538634658666, "grad_norm": 0.13821269571781158, "learning_rate": 2.572114701327466e-06, "loss": 0.0172, "step": 2420 }, { "epoch": 1.816204051012753, "grad_norm": 0.10720129311084747, "learning_rate": 2.551425034157262e-06, "loss": 0.015, "step": 2421 }, { "epoch": 1.81695423855964, "grad_norm": 0.10853177309036255, "learning_rate": 2.5308167366207724e-06, "loss": 0.0195, "step": 2422 }, { "epoch": 1.8177044261065265, "grad_norm": 0.10341428220272064, "learning_rate": 2.510289844059216e-06, "loss": 0.0164, "step": 2423 }, { "epoch": 1.8184546136534134, "grad_norm": 0.07772510498762131, "learning_rate": 2.48984439167419e-06, "loss": 0.012, "step": 2424 }, { "epoch": 1.8192048012003, "grad_norm": 0.12696626782417297, "learning_rate": 2.4694804145276305e-06, "loss": 0.0159, "step": 2425 }, { "epoch": 1.8199549887471869, "grad_norm": 0.16353991627693176, "learning_rate": 2.449197947541737e-06, "loss": 0.019, "step": 2426 }, { "epoch": 1.8207051762940734, "grad_norm": 0.1997886449098587, "learning_rate": 2.4289970254989635e-06, "loss": 0.02, "step": 2427 }, { "epoch": 1.8214553638409603, "grad_norm": 0.42609405517578125, "learning_rate": 2.408877683041888e-06, "loss": 0.0319, "step": 2428 }, { "epoch": 1.822205551387847, "grad_norm": 0.1115378588438034, "learning_rate": 2.388839954673222e-06, "loss": 0.0118, "step": 2429 }, { "epoch": 1.8229557389347337, "grad_norm": 0.1740262359380722, "learning_rate": 2.3688838747556674e-06, "loss": 0.0192, "step": 2430 }, { "epoch": 1.8237059264816204, "grad_norm": 0.08586451411247253, "learning_rate": 2.3490094775119597e-06, "loss": 0.0098, "step": 2431 }, { "epoch": 1.8244561140285072, "grad_norm": 0.10886698216199875, "learning_rate": 2.3292167970247193e-06, "loss": 0.0137, "step": 2432 }, { "epoch": 1.8252063015753939, "grad_norm": 0.10428888350725174, "learning_rate": 2.30950586723645e-06, "loss": 0.0124, "step": 2433 }, { "epoch": 1.8259564891222806, "grad_norm": 0.10169228166341782, "learning_rate": 2.2898767219494634e-06, "loss": 0.0127, "step": 2434 }, { "epoch": 1.8267066766691673, "grad_norm": 0.25430697202682495, "learning_rate": 2.270329394825793e-06, "loss": 0.0274, "step": 2435 }, { "epoch": 1.827456864216054, "grad_norm": 0.110308937728405, "learning_rate": 2.2508639193871805e-06, "loss": 0.0184, "step": 2436 }, { "epoch": 1.8282070517629407, "grad_norm": 0.13315962255001068, "learning_rate": 2.2314803290150287e-06, "loss": 0.0144, "step": 2437 }, { "epoch": 1.8289572393098275, "grad_norm": 0.08486826717853546, "learning_rate": 2.2121786569502535e-06, "loss": 0.0085, "step": 2438 }, { "epoch": 1.8297074268567142, "grad_norm": 0.084154412150383, "learning_rate": 2.192958936293338e-06, "loss": 0.0139, "step": 2439 }, { "epoch": 1.8304576144036009, "grad_norm": 0.12315694987773895, "learning_rate": 2.1738212000042e-06, "loss": 0.0176, "step": 2440 }, { "epoch": 1.8312078019504876, "grad_norm": 0.10296216607093811, "learning_rate": 2.1547654809021877e-06, "loss": 0.0109, "step": 2441 }, { "epoch": 1.8319579894973743, "grad_norm": 0.09593435376882553, "learning_rate": 2.135791811665977e-06, "loss": 0.0142, "step": 2442 }, { "epoch": 1.832708177044261, "grad_norm": 0.08409593999385834, "learning_rate": 2.1169002248335346e-06, "loss": 0.0101, "step": 2443 }, { "epoch": 1.8334583645911477, "grad_norm": 0.11003497987985611, "learning_rate": 2.098090752802073e-06, "loss": 0.0149, "step": 2444 }, { "epoch": 1.8342085521380345, "grad_norm": 0.10805971175432205, "learning_rate": 2.0793634278279907e-06, "loss": 0.0163, "step": 2445 }, { "epoch": 1.8349587396849212, "grad_norm": 0.13255015015602112, "learning_rate": 2.0607182820268133e-06, "loss": 0.0162, "step": 2446 }, { "epoch": 1.835708927231808, "grad_norm": 0.13535521924495697, "learning_rate": 2.042155347373109e-06, "loss": 0.0206, "step": 2447 }, { "epoch": 1.8364591147786946, "grad_norm": 0.13720273971557617, "learning_rate": 2.023674655700497e-06, "loss": 0.0126, "step": 2448 }, { "epoch": 1.8372093023255816, "grad_norm": 0.20435172319412231, "learning_rate": 2.0052762387015424e-06, "loss": 0.0223, "step": 2449 }, { "epoch": 1.837959489872468, "grad_norm": 0.1612643599510193, "learning_rate": 1.986960127927717e-06, "loss": 0.0147, "step": 2450 }, { "epoch": 1.838709677419355, "grad_norm": 0.139284148812294, "learning_rate": 1.9687263547893407e-06, "loss": 0.0182, "step": 2451 }, { "epoch": 1.8394598649662415, "grad_norm": 0.16131514310836792, "learning_rate": 1.9505749505555503e-06, "loss": 0.0175, "step": 2452 }, { "epoch": 1.8402100525131284, "grad_norm": 0.11480918526649475, "learning_rate": 1.932505946354213e-06, "loss": 0.0136, "step": 2453 }, { "epoch": 1.840960240060015, "grad_norm": 0.1593775898218155, "learning_rate": 1.9145193731718858e-06, "loss": 0.019, "step": 2454 }, { "epoch": 1.8417104276069018, "grad_norm": 0.28252995014190674, "learning_rate": 1.8966152618537846e-06, "loss": 0.0292, "step": 2455 }, { "epoch": 1.8424606151537883, "grad_norm": 0.20613250136375427, "learning_rate": 1.8787936431036824e-06, "loss": 0.0222, "step": 2456 }, { "epoch": 1.8432108027006753, "grad_norm": 0.1379401534795761, "learning_rate": 1.8610545474839036e-06, "loss": 0.0298, "step": 2457 }, { "epoch": 1.8439609902475618, "grad_norm": 0.09081874042749405, "learning_rate": 1.8433980054152533e-06, "loss": 0.0155, "step": 2458 }, { "epoch": 1.8447111777944487, "grad_norm": 0.05574246495962143, "learning_rate": 1.8258240471769662e-06, "loss": 0.0088, "step": 2459 }, { "epoch": 1.8454613653413352, "grad_norm": 0.11207207292318344, "learning_rate": 1.8083327029066399e-06, "loss": 0.0132, "step": 2460 }, { "epoch": 1.8462115528882221, "grad_norm": 0.14374390244483948, "learning_rate": 1.7909240026002138e-06, "loss": 0.0226, "step": 2461 }, { "epoch": 1.8469617404351086, "grad_norm": 0.08422519266605377, "learning_rate": 1.773597976111896e-06, "loss": 0.0087, "step": 2462 }, { "epoch": 1.8477119279819956, "grad_norm": 0.1311851292848587, "learning_rate": 1.7563546531541132e-06, "loss": 0.0167, "step": 2463 }, { "epoch": 1.848462115528882, "grad_norm": 0.11781198531389236, "learning_rate": 1.7391940632974667e-06, "loss": 0.0134, "step": 2464 }, { "epoch": 1.849212303075769, "grad_norm": 0.13201270997524261, "learning_rate": 1.7221162359706776e-06, "loss": 0.0158, "step": 2465 }, { "epoch": 1.8499624906226555, "grad_norm": 0.24497579038143158, "learning_rate": 1.705121200460541e-06, "loss": 0.0396, "step": 2466 }, { "epoch": 1.8507126781695424, "grad_norm": 0.16112586855888367, "learning_rate": 1.6882089859118766e-06, "loss": 0.0258, "step": 2467 }, { "epoch": 1.8514628657164292, "grad_norm": 0.12652385234832764, "learning_rate": 1.6713796213274457e-06, "loss": 0.0127, "step": 2468 }, { "epoch": 1.8522130532633159, "grad_norm": 0.12414011359214783, "learning_rate": 1.6546331355679623e-06, "loss": 0.0148, "step": 2469 }, { "epoch": 1.8529632408102026, "grad_norm": 0.1247783824801445, "learning_rate": 1.6379695573520093e-06, "loss": 0.016, "step": 2470 }, { "epoch": 1.8537134283570893, "grad_norm": 0.1910862773656845, "learning_rate": 1.621388915255967e-06, "loss": 0.0302, "step": 2471 }, { "epoch": 1.854463615903976, "grad_norm": 0.09329649060964584, "learning_rate": 1.604891237714018e-06, "loss": 0.0132, "step": 2472 }, { "epoch": 1.8552138034508627, "grad_norm": 0.12266834080219269, "learning_rate": 1.5884765530180478e-06, "loss": 0.0175, "step": 2473 }, { "epoch": 1.8559639909977494, "grad_norm": 0.10107134282588959, "learning_rate": 1.5721448893176228e-06, "loss": 0.0113, "step": 2474 }, { "epoch": 1.8567141785446362, "grad_norm": 0.1326693296432495, "learning_rate": 1.5558962746199335e-06, "loss": 0.0198, "step": 2475 }, { "epoch": 1.8574643660915229, "grad_norm": 0.0919971838593483, "learning_rate": 1.5397307367897684e-06, "loss": 0.013, "step": 2476 }, { "epoch": 1.8582145536384096, "grad_norm": 0.09108588844537735, "learning_rate": 1.5236483035494297e-06, "loss": 0.014, "step": 2477 }, { "epoch": 1.8589647411852963, "grad_norm": 0.1272948980331421, "learning_rate": 1.5076490024786893e-06, "loss": 0.0175, "step": 2478 }, { "epoch": 1.859714928732183, "grad_norm": 0.1336841732263565, "learning_rate": 1.4917328610147885e-06, "loss": 0.0179, "step": 2479 }, { "epoch": 1.8604651162790697, "grad_norm": 0.19899259507656097, "learning_rate": 1.4758999064523493e-06, "loss": 0.0218, "step": 2480 }, { "epoch": 1.8612153038259565, "grad_norm": 0.12127574533224106, "learning_rate": 1.4601501659433137e-06, "loss": 0.0147, "step": 2481 }, { "epoch": 1.8619654913728432, "grad_norm": 0.11681947857141495, "learning_rate": 1.444483666496943e-06, "loss": 0.0144, "step": 2482 }, { "epoch": 1.86271567891973, "grad_norm": 0.0795118510723114, "learning_rate": 1.4289004349797409e-06, "loss": 0.0128, "step": 2483 }, { "epoch": 1.8634658664666166, "grad_norm": 0.11302927881479263, "learning_rate": 1.4134004981154137e-06, "loss": 0.019, "step": 2484 }, { "epoch": 1.8642160540135033, "grad_norm": 0.12013009190559387, "learning_rate": 1.3979838824848378e-06, "loss": 0.0174, "step": 2485 }, { "epoch": 1.8649662415603903, "grad_norm": 0.15739822387695312, "learning_rate": 1.382650614525971e-06, "loss": 0.0179, "step": 2486 }, { "epoch": 1.8657164291072768, "grad_norm": 0.15660080313682556, "learning_rate": 1.3674007205338678e-06, "loss": 0.0199, "step": 2487 }, { "epoch": 1.8664666166541637, "grad_norm": 0.10661745071411133, "learning_rate": 1.3522342266605925e-06, "loss": 0.0144, "step": 2488 }, { "epoch": 1.8672168042010502, "grad_norm": 0.1589604765176773, "learning_rate": 1.3371511589152008e-06, "loss": 0.0189, "step": 2489 }, { "epoch": 1.8679669917479371, "grad_norm": 0.11324826627969742, "learning_rate": 1.3221515431636522e-06, "loss": 0.0135, "step": 2490 }, { "epoch": 1.8687171792948236, "grad_norm": 0.19825062155723572, "learning_rate": 1.307235405128815e-06, "loss": 0.0279, "step": 2491 }, { "epoch": 1.8694673668417106, "grad_norm": 0.18215356767177582, "learning_rate": 1.292402770390394e-06, "loss": 0.0225, "step": 2492 }, { "epoch": 1.870217554388597, "grad_norm": 0.09638606756925583, "learning_rate": 1.2776536643849145e-06, "loss": 0.0132, "step": 2493 }, { "epoch": 1.870967741935484, "grad_norm": 0.0821240171790123, "learning_rate": 1.2629881124056274e-06, "loss": 0.0109, "step": 2494 }, { "epoch": 1.8717179294823705, "grad_norm": 0.14057280123233795, "learning_rate": 1.2484061396025038e-06, "loss": 0.0144, "step": 2495 }, { "epoch": 1.8724681170292574, "grad_norm": 0.11429706960916519, "learning_rate": 1.2339077709822067e-06, "loss": 0.0161, "step": 2496 }, { "epoch": 1.873218304576144, "grad_norm": 0.11428690701723099, "learning_rate": 1.2194930314080032e-06, "loss": 0.0176, "step": 2497 }, { "epoch": 1.8739684921230308, "grad_norm": 0.1507968008518219, "learning_rate": 1.2051619455997476e-06, "loss": 0.0199, "step": 2498 }, { "epoch": 1.8747186796699173, "grad_norm": 0.20986028015613556, "learning_rate": 1.1909145381338472e-06, "loss": 0.0168, "step": 2499 }, { "epoch": 1.8754688672168043, "grad_norm": 0.23694069683551788, "learning_rate": 1.1767508334431964e-06, "loss": 0.0267, "step": 2500 }, { "epoch": 1.8762190547636908, "grad_norm": 0.09894689917564392, "learning_rate": 1.1626708558171606e-06, "loss": 0.0147, "step": 2501 }, { "epoch": 1.8769692423105777, "grad_norm": 0.1138412207365036, "learning_rate": 1.1486746294015193e-06, "loss": 0.0181, "step": 2502 }, { "epoch": 1.8777194298574642, "grad_norm": 0.12932085990905762, "learning_rate": 1.134762178198412e-06, "loss": 0.0169, "step": 2503 }, { "epoch": 1.8784696174043511, "grad_norm": 0.09746240079402924, "learning_rate": 1.1209335260663256e-06, "loss": 0.0121, "step": 2504 }, { "epoch": 1.8792198049512379, "grad_norm": 0.11326896399259567, "learning_rate": 1.1071886967200352e-06, "loss": 0.0208, "step": 2505 }, { "epoch": 1.8799699924981246, "grad_norm": 0.09456074237823486, "learning_rate": 1.0935277137305744e-06, "loss": 0.0124, "step": 2506 }, { "epoch": 1.8807201800450113, "grad_norm": 0.1309920698404312, "learning_rate": 1.0799506005251814e-06, "loss": 0.0181, "step": 2507 }, { "epoch": 1.881470367591898, "grad_norm": 0.12602433562278748, "learning_rate": 1.06645738038727e-06, "loss": 0.0184, "step": 2508 }, { "epoch": 1.8822205551387847, "grad_norm": 0.10263347625732422, "learning_rate": 1.053048076456381e-06, "loss": 0.0161, "step": 2509 }, { "epoch": 1.8829707426856714, "grad_norm": 0.09971325099468231, "learning_rate": 1.0397227117281528e-06, "loss": 0.0115, "step": 2510 }, { "epoch": 1.8837209302325582, "grad_norm": 0.11184331774711609, "learning_rate": 1.0264813090542725e-06, "loss": 0.0146, "step": 2511 }, { "epoch": 1.8844711177794449, "grad_norm": 0.16481465101242065, "learning_rate": 1.0133238911424426e-06, "loss": 0.0182, "step": 2512 }, { "epoch": 1.8852213053263316, "grad_norm": 0.16809651255607605, "learning_rate": 1.0002504805563362e-06, "loss": 0.0144, "step": 2513 }, { "epoch": 1.8859714928732183, "grad_norm": 0.0960497260093689, "learning_rate": 9.872610997155695e-07, "loss": 0.0135, "step": 2514 }, { "epoch": 1.886721680420105, "grad_norm": 0.07542908936738968, "learning_rate": 9.743557708956575e-07, "loss": 0.0089, "step": 2515 }, { "epoch": 1.8874718679669917, "grad_norm": 0.07793007045984268, "learning_rate": 9.615345162279521e-07, "loss": 0.0078, "step": 2516 }, { "epoch": 1.8882220555138785, "grad_norm": 0.21286483108997345, "learning_rate": 9.48797357699649e-07, "loss": 0.0262, "step": 2517 }, { "epoch": 1.8889722430607652, "grad_norm": 0.09491562098264694, "learning_rate": 9.361443171537254e-07, "loss": 0.0074, "step": 2518 }, { "epoch": 1.8897224306076519, "grad_norm": 0.14178186655044556, "learning_rate": 9.235754162889021e-07, "loss": 0.0205, "step": 2519 }, { "epoch": 1.8904726181545386, "grad_norm": 0.16438992321491241, "learning_rate": 9.110906766595872e-07, "loss": 0.0185, "step": 2520 }, { "epoch": 1.8912228057014253, "grad_norm": 0.17157548666000366, "learning_rate": 8.986901196759046e-07, "loss": 0.0291, "step": 2521 }, { "epoch": 1.891972993248312, "grad_norm": 0.06102016195654869, "learning_rate": 8.863737666035765e-07, "loss": 0.009, "step": 2522 }, { "epoch": 1.8927231807951987, "grad_norm": 0.18232974410057068, "learning_rate": 8.741416385639412e-07, "loss": 0.0306, "step": 2523 }, { "epoch": 1.8934733683420855, "grad_norm": 0.14115692675113678, "learning_rate": 8.619937565338854e-07, "loss": 0.0187, "step": 2524 }, { "epoch": 1.8942235558889724, "grad_norm": 0.10181062668561935, "learning_rate": 8.499301413458338e-07, "loss": 0.0114, "step": 2525 }, { "epoch": 1.894973743435859, "grad_norm": 0.12386108934879303, "learning_rate": 8.37950813687699e-07, "loss": 0.0154, "step": 2526 }, { "epoch": 1.8957239309827458, "grad_norm": 0.12360105663537979, "learning_rate": 8.26055794102848e-07, "loss": 0.0184, "step": 2527 }, { "epoch": 1.8964741185296323, "grad_norm": 0.12389027327299118, "learning_rate": 8.142451029900744e-07, "loss": 0.0143, "step": 2528 }, { "epoch": 1.8972243060765193, "grad_norm": 0.12951231002807617, "learning_rate": 8.025187606035434e-07, "loss": 0.0186, "step": 2529 }, { "epoch": 1.8979744936234058, "grad_norm": 0.10430709272623062, "learning_rate": 7.908767870527745e-07, "loss": 0.0135, "step": 2530 }, { "epoch": 1.8987246811702927, "grad_norm": 0.09554420411586761, "learning_rate": 7.793192023026142e-07, "loss": 0.0139, "step": 2531 }, { "epoch": 1.8994748687171792, "grad_norm": 0.13989853858947754, "learning_rate": 7.678460261731801e-07, "loss": 0.0205, "step": 2532 }, { "epoch": 1.9002250562640661, "grad_norm": 0.13728971779346466, "learning_rate": 7.564572783398339e-07, "loss": 0.0131, "step": 2533 }, { "epoch": 1.9009752438109526, "grad_norm": 0.09057745337486267, "learning_rate": 7.451529783331523e-07, "loss": 0.0093, "step": 2534 }, { "epoch": 1.9017254313578396, "grad_norm": 0.1505969762802124, "learning_rate": 7.339331455389175e-07, "loss": 0.0145, "step": 2535 }, { "epoch": 1.902475618904726, "grad_norm": 0.1213538721203804, "learning_rate": 7.227977991980217e-07, "loss": 0.0179, "step": 2536 }, { "epoch": 1.903225806451613, "grad_norm": 0.09671944379806519, "learning_rate": 7.117469584064895e-07, "loss": 0.0119, "step": 2537 }, { "epoch": 1.9039759939984995, "grad_norm": 0.08646194636821747, "learning_rate": 7.007806421154284e-07, "loss": 0.0074, "step": 2538 }, { "epoch": 1.9047261815453864, "grad_norm": 0.11140266805887222, "learning_rate": 6.898988691309893e-07, "loss": 0.0151, "step": 2539 }, { "epoch": 1.905476369092273, "grad_norm": 0.09805849939584732, "learning_rate": 6.791016581143395e-07, "loss": 0.0136, "step": 2540 }, { "epoch": 1.9062265566391599, "grad_norm": 0.1474841833114624, "learning_rate": 6.683890275816341e-07, "loss": 0.0185, "step": 2541 }, { "epoch": 1.9069767441860463, "grad_norm": 0.11673633009195328, "learning_rate": 6.577609959039776e-07, "loss": 0.0155, "step": 2542 }, { "epoch": 1.9077269317329333, "grad_norm": 0.13770844042301178, "learning_rate": 6.472175813074022e-07, "loss": 0.018, "step": 2543 }, { "epoch": 1.90847711927982, "grad_norm": 0.10890696942806244, "learning_rate": 6.367588018728166e-07, "loss": 0.0166, "step": 2544 }, { "epoch": 1.9092273068267067, "grad_norm": 0.08831574767827988, "learning_rate": 6.263846755360126e-07, "loss": 0.0115, "step": 2545 }, { "epoch": 1.9099774943735934, "grad_norm": 0.08889841288328171, "learning_rate": 6.16095220087587e-07, "loss": 0.011, "step": 2546 }, { "epoch": 1.9107276819204801, "grad_norm": 0.11765546351671219, "learning_rate": 6.05890453172936e-07, "loss": 0.0153, "step": 2547 }, { "epoch": 1.9114778694673669, "grad_norm": 0.13256019353866577, "learning_rate": 5.957703922922386e-07, "loss": 0.0199, "step": 2548 }, { "epoch": 1.9122280570142536, "grad_norm": 0.0688183531165123, "learning_rate": 5.857350548004015e-07, "loss": 0.0084, "step": 2549 }, { "epoch": 1.9129782445611403, "grad_norm": 0.09796126186847687, "learning_rate": 5.757844579070359e-07, "loss": 0.0095, "step": 2550 }, { "epoch": 1.913728432108027, "grad_norm": 0.09508487582206726, "learning_rate": 5.65918618676442e-07, "loss": 0.0117, "step": 2551 }, { "epoch": 1.9144786196549137, "grad_norm": 0.14749017357826233, "learning_rate": 5.561375540275581e-07, "loss": 0.0202, "step": 2552 }, { "epoch": 1.9152288072018004, "grad_norm": 0.12853582203388214, "learning_rate": 5.464412807339558e-07, "loss": 0.0171, "step": 2553 }, { "epoch": 1.9159789947486872, "grad_norm": 0.13968953490257263, "learning_rate": 5.368298154237727e-07, "loss": 0.0178, "step": 2554 }, { "epoch": 1.9167291822955739, "grad_norm": 0.14459316432476044, "learning_rate": 5.273031745797352e-07, "loss": 0.0146, "step": 2555 }, { "epoch": 1.9174793698424606, "grad_norm": 0.12243661284446716, "learning_rate": 5.17861374539097e-07, "loss": 0.0106, "step": 2556 }, { "epoch": 1.9182295573893473, "grad_norm": 0.14610101282596588, "learning_rate": 5.085044314936116e-07, "loss": 0.0136, "step": 2557 }, { "epoch": 1.918979744936234, "grad_norm": 0.10917095094919205, "learning_rate": 4.992323614895156e-07, "loss": 0.0184, "step": 2558 }, { "epoch": 1.9197299324831207, "grad_norm": 0.16345548629760742, "learning_rate": 4.900451804274898e-07, "loss": 0.0193, "step": 2559 }, { "epoch": 1.9204801200300075, "grad_norm": 0.16431359946727753, "learning_rate": 4.809429040626535e-07, "loss": 0.0191, "step": 2560 }, { "epoch": 1.9212303075768942, "grad_norm": 0.06987474858760834, "learning_rate": 4.719255480045148e-07, "loss": 0.0109, "step": 2561 }, { "epoch": 1.921980495123781, "grad_norm": 0.07272496074438095, "learning_rate": 4.6299312771694304e-07, "loss": 0.011, "step": 2562 }, { "epoch": 1.9227306826706676, "grad_norm": 0.1672581136226654, "learning_rate": 4.5414565851816806e-07, "loss": 0.018, "step": 2563 }, { "epoch": 1.9234808702175545, "grad_norm": 0.18490219116210938, "learning_rate": 4.453831555807253e-07, "loss": 0.0189, "step": 2564 }, { "epoch": 1.924231057764441, "grad_norm": 0.08568117022514343, "learning_rate": 4.36705633931439e-07, "loss": 0.0088, "step": 2565 }, { "epoch": 1.924981245311328, "grad_norm": 0.12976892292499542, "learning_rate": 4.281131084514167e-07, "loss": 0.0182, "step": 2566 }, { "epoch": 1.9257314328582145, "grad_norm": 0.21210747957229614, "learning_rate": 4.196055938759824e-07, "loss": 0.0219, "step": 2567 }, { "epoch": 1.9264816204051014, "grad_norm": 0.09944725036621094, "learning_rate": 4.111831047946879e-07, "loss": 0.0101, "step": 2568 }, { "epoch": 1.927231807951988, "grad_norm": 0.1581333726644516, "learning_rate": 4.0284565565127384e-07, "loss": 0.0131, "step": 2569 }, { "epoch": 1.9279819954988748, "grad_norm": 0.10216685384511948, "learning_rate": 3.9459326074364756e-07, "loss": 0.015, "step": 2570 }, { "epoch": 1.9287321830457613, "grad_norm": 0.0924372598528862, "learning_rate": 3.8642593422384965e-07, "loss": 0.012, "step": 2571 }, { "epoch": 1.9294823705926483, "grad_norm": 0.10501393675804138, "learning_rate": 3.7834369009804303e-07, "loss": 0.0155, "step": 2572 }, { "epoch": 1.9302325581395348, "grad_norm": 0.1474066525697708, "learning_rate": 3.703465422264796e-07, "loss": 0.0142, "step": 2573 }, { "epoch": 1.9309827456864217, "grad_norm": 0.07856673747301102, "learning_rate": 3.624345043234778e-07, "loss": 0.0118, "step": 2574 }, { "epoch": 1.9317329332333082, "grad_norm": 0.10008254647254944, "learning_rate": 3.5460758995741194e-07, "loss": 0.0105, "step": 2575 }, { "epoch": 1.9324831207801951, "grad_norm": 0.13343030214309692, "learning_rate": 3.468658125506563e-07, "loss": 0.0181, "step": 2576 }, { "epoch": 1.9332333083270816, "grad_norm": 0.11500576883554459, "learning_rate": 3.3920918537960754e-07, "loss": 0.0144, "step": 2577 }, { "epoch": 1.9339834958739686, "grad_norm": 0.14943444728851318, "learning_rate": 3.3163772157462357e-07, "loss": 0.0242, "step": 2578 }, { "epoch": 1.934733683420855, "grad_norm": 0.15446636080741882, "learning_rate": 3.241514341200236e-07, "loss": 0.021, "step": 2579 }, { "epoch": 1.935483870967742, "grad_norm": 0.06783124804496765, "learning_rate": 3.1675033585404355e-07, "loss": 0.0073, "step": 2580 }, { "epoch": 1.9362340585146287, "grad_norm": 0.14099209010601044, "learning_rate": 3.0943443946884755e-07, "loss": 0.0192, "step": 2581 }, { "epoch": 1.9369842460615154, "grad_norm": 0.11827205866575241, "learning_rate": 3.0220375751047194e-07, "loss": 0.0126, "step": 2582 }, { "epoch": 1.9377344336084021, "grad_norm": 0.16399821639060974, "learning_rate": 2.950583023788256e-07, "loss": 0.0216, "step": 2583 }, { "epoch": 1.9384846211552889, "grad_norm": 0.10808265209197998, "learning_rate": 2.879980863276621e-07, "loss": 0.012, "step": 2584 }, { "epoch": 1.9392348087021756, "grad_norm": 0.11654945462942123, "learning_rate": 2.8102312146455755e-07, "loss": 0.017, "step": 2585 }, { "epoch": 1.9399849962490623, "grad_norm": 0.12237963080406189, "learning_rate": 2.7413341975088824e-07, "loss": 0.0263, "step": 2586 }, { "epoch": 1.940735183795949, "grad_norm": 0.11718087643384933, "learning_rate": 2.6732899300180857e-07, "loss": 0.0124, "step": 2587 }, { "epoch": 1.9414853713428357, "grad_norm": 0.08054438978433609, "learning_rate": 2.606098528862566e-07, "loss": 0.012, "step": 2588 }, { "epoch": 1.9422355588897224, "grad_norm": 0.10906367003917694, "learning_rate": 2.5397601092687627e-07, "loss": 0.0188, "step": 2589 }, { "epoch": 1.9429857464366092, "grad_norm": 0.10259516537189484, "learning_rate": 2.474274785000619e-07, "loss": 0.0145, "step": 2590 }, { "epoch": 1.9437359339834959, "grad_norm": 0.11857086420059204, "learning_rate": 2.40964266835908e-07, "loss": 0.0162, "step": 2591 }, { "epoch": 1.9444861215303826, "grad_norm": 0.07964332401752472, "learning_rate": 2.3458638701817636e-07, "loss": 0.0126, "step": 2592 }, { "epoch": 1.9452363090772693, "grad_norm": 0.08068211376667023, "learning_rate": 2.2829384998430681e-07, "loss": 0.0105, "step": 2593 }, { "epoch": 1.945986496624156, "grad_norm": 0.11038036644458771, "learning_rate": 2.2208666652537846e-07, "loss": 0.0163, "step": 2594 }, { "epoch": 1.9467366841710427, "grad_norm": 0.12264223396778107, "learning_rate": 2.1596484728610421e-07, "loss": 0.0222, "step": 2595 }, { "epoch": 1.9474868717179294, "grad_norm": 0.1597457379102707, "learning_rate": 2.099284027647974e-07, "loss": 0.0232, "step": 2596 }, { "epoch": 1.9482370592648162, "grad_norm": 0.13513779640197754, "learning_rate": 2.039773433133718e-07, "loss": 0.0156, "step": 2597 }, { "epoch": 1.9489872468117029, "grad_norm": 0.1858983039855957, "learning_rate": 1.9811167913729723e-07, "loss": 0.0143, "step": 2598 }, { "epoch": 1.9497374343585896, "grad_norm": 0.0995759516954422, "learning_rate": 1.923314202956217e-07, "loss": 0.0099, "step": 2599 }, { "epoch": 1.9504876219054763, "grad_norm": 0.11884553730487823, "learning_rate": 1.8663657670091595e-07, "loss": 0.019, "step": 2600 }, { "epoch": 1.9504876219054763, "eval_loss": 0.03117949888110161, "eval_runtime": 5.1258, "eval_samples_per_second": 10.535, "eval_steps_per_second": 2.731, "step": 2600 }, { "epoch": 1.9512378094523632, "grad_norm": 0.09198283404111862, "learning_rate": 1.810271581192735e-07, "loss": 0.0081, "step": 2601 }, { "epoch": 1.9519879969992497, "grad_norm": 0.13007594645023346, "learning_rate": 1.755031741702995e-07, "loss": 0.0131, "step": 2602 }, { "epoch": 1.9527381845461367, "grad_norm": 0.17154718935489655, "learning_rate": 1.7006463432707177e-07, "loss": 0.0161, "step": 2603 }, { "epoch": 1.9534883720930232, "grad_norm": 0.08539299666881561, "learning_rate": 1.6471154791616317e-07, "loss": 0.013, "step": 2604 }, { "epoch": 1.9542385596399101, "grad_norm": 0.11189521849155426, "learning_rate": 1.59443924117586e-07, "loss": 0.0119, "step": 2605 }, { "epoch": 1.9549887471867966, "grad_norm": 0.06556031852960587, "learning_rate": 1.5426177196479207e-07, "loss": 0.0079, "step": 2606 }, { "epoch": 1.9557389347336835, "grad_norm": 0.135658398270607, "learning_rate": 1.4916510034466702e-07, "loss": 0.0231, "step": 2607 }, { "epoch": 1.95648912228057, "grad_norm": 0.07447344064712524, "learning_rate": 1.441539179974971e-07, "loss": 0.0113, "step": 2608 }, { "epoch": 1.957239309827457, "grad_norm": 0.09345728904008865, "learning_rate": 1.3922823351697479e-07, "loss": 0.0114, "step": 2609 }, { "epoch": 1.9579894973743435, "grad_norm": 0.11973581463098526, "learning_rate": 1.343880553501542e-07, "loss": 0.0176, "step": 2610 }, { "epoch": 1.9587396849212304, "grad_norm": 0.1319183111190796, "learning_rate": 1.2963339179746238e-07, "loss": 0.0125, "step": 2611 }, { "epoch": 1.959489872468117, "grad_norm": 0.11517900228500366, "learning_rate": 1.2496425101268804e-07, "loss": 0.0199, "step": 2612 }, { "epoch": 1.9602400600150038, "grad_norm": 0.053299639374017715, "learning_rate": 1.2038064100294843e-07, "loss": 0.0067, "step": 2613 }, { "epoch": 1.9609902475618903, "grad_norm": 0.11605609208345413, "learning_rate": 1.158825696286725e-07, "loss": 0.0127, "step": 2614 }, { "epoch": 1.9617404351087773, "grad_norm": 0.12783505022525787, "learning_rate": 1.114700446036232e-07, "loss": 0.0189, "step": 2615 }, { "epoch": 1.9624906226556638, "grad_norm": 0.0946228876709938, "learning_rate": 1.0714307349483089e-07, "loss": 0.0114, "step": 2616 }, { "epoch": 1.9632408102025507, "grad_norm": 0.13363100588321686, "learning_rate": 1.029016637226432e-07, "loss": 0.0173, "step": 2617 }, { "epoch": 1.9639909977494372, "grad_norm": 0.10163934528827667, "learning_rate": 9.874582256064192e-08, "loss": 0.0127, "step": 2618 }, { "epoch": 1.9647411852963241, "grad_norm": 0.14493410289287567, "learning_rate": 9.46755571356983e-08, "loss": 0.0171, "step": 2619 }, { "epoch": 1.9654913728432108, "grad_norm": 0.12048706412315369, "learning_rate": 9.069087442791224e-08, "loss": 0.0184, "step": 2620 }, { "epoch": 1.9662415603900976, "grad_norm": 0.11740259826183319, "learning_rate": 8.679178127062871e-08, "loss": 0.018, "step": 2621 }, { "epoch": 1.9669917479369843, "grad_norm": 0.1652223914861679, "learning_rate": 8.297828435039346e-08, "loss": 0.0202, "step": 2622 }, { "epoch": 1.967741935483871, "grad_norm": 0.09575902670621872, "learning_rate": 7.925039020699187e-08, "loss": 0.0145, "step": 2623 }, { "epoch": 1.9684921230307577, "grad_norm": 0.2031949758529663, "learning_rate": 7.56081052333879e-08, "loss": 0.0316, "step": 2624 }, { "epoch": 1.9692423105776444, "grad_norm": 0.15664900839328766, "learning_rate": 7.205143567574624e-08, "loss": 0.0267, "step": 2625 }, { "epoch": 1.9699924981245311, "grad_norm": 0.07686346024274826, "learning_rate": 6.858038763340458e-08, "loss": 0.0098, "step": 2626 }, { "epoch": 1.9707426856714179, "grad_norm": 0.155964657664299, "learning_rate": 6.519496705886252e-08, "loss": 0.0257, "step": 2627 }, { "epoch": 1.9714928732183046, "grad_norm": 0.13349410891532898, "learning_rate": 6.189517975778713e-08, "loss": 0.0169, "step": 2628 }, { "epoch": 1.9722430607651913, "grad_norm": 0.11973363161087036, "learning_rate": 5.8681031388990724e-08, "loss": 0.0109, "step": 2629 }, { "epoch": 1.972993248312078, "grad_norm": 0.08597532659769058, "learning_rate": 5.555252746441975e-08, "loss": 0.0108, "step": 2630 }, { "epoch": 1.9737434358589647, "grad_norm": 0.10258978605270386, "learning_rate": 5.25096733491548e-08, "loss": 0.0121, "step": 2631 }, { "epoch": 1.9744936234058514, "grad_norm": 0.11236192286014557, "learning_rate": 4.9552474261377326e-08, "loss": 0.0157, "step": 2632 }, { "epoch": 1.9752438109527382, "grad_norm": 0.08419892191886902, "learning_rate": 4.6680935272408465e-08, "loss": 0.0108, "step": 2633 }, { "epoch": 1.9759939984996249, "grad_norm": 0.13061557710170746, "learning_rate": 4.3895061306648e-08, "loss": 0.0178, "step": 2634 }, { "epoch": 1.9767441860465116, "grad_norm": 0.08569876849651337, "learning_rate": 4.119485714159099e-08, "loss": 0.0088, "step": 2635 }, { "epoch": 1.9774943735933983, "grad_norm": 0.10098525881767273, "learning_rate": 3.8580327407827796e-08, "loss": 0.0171, "step": 2636 }, { "epoch": 1.978244561140285, "grad_norm": 0.16379106044769287, "learning_rate": 3.605147658901631e-08, "loss": 0.0161, "step": 2637 }, { "epoch": 1.978994748687172, "grad_norm": 0.13738934695720673, "learning_rate": 3.360830902189305e-08, "loss": 0.0217, "step": 2638 }, { "epoch": 1.9797449362340584, "grad_norm": 0.12114493548870087, "learning_rate": 3.125082889623987e-08, "loss": 0.0123, "step": 2639 }, { "epoch": 1.9804951237809454, "grad_norm": 0.22203579545021057, "learning_rate": 2.8979040254911717e-08, "loss": 0.0316, "step": 2640 }, { "epoch": 1.9812453113278319, "grad_norm": 0.13473734259605408, "learning_rate": 2.67929469937922e-08, "loss": 0.0215, "step": 2641 }, { "epoch": 1.9819954988747188, "grad_norm": 0.15361478924751282, "learning_rate": 2.4692552861826925e-08, "loss": 0.0195, "step": 2642 }, { "epoch": 1.9827456864216053, "grad_norm": 0.16221070289611816, "learning_rate": 2.2677861460984607e-08, "loss": 0.0156, "step": 2643 }, { "epoch": 1.9834958739684923, "grad_norm": 0.1510624885559082, "learning_rate": 2.074887624625155e-08, "loss": 0.0226, "step": 2644 }, { "epoch": 1.9842460615153787, "grad_norm": 0.12968359887599945, "learning_rate": 1.890560052565937e-08, "loss": 0.0211, "step": 2645 }, { "epoch": 1.9849962490622657, "grad_norm": 0.09835748374462128, "learning_rate": 1.7148037460235078e-08, "loss": 0.0116, "step": 2646 }, { "epoch": 1.9857464366091522, "grad_norm": 0.2587953209877014, "learning_rate": 1.5476190064034334e-08, "loss": 0.0358, "step": 2647 }, { "epoch": 1.9864966241560391, "grad_norm": 0.14192144572734833, "learning_rate": 1.3890061204108185e-08, "loss": 0.0185, "step": 2648 }, { "epoch": 1.9872468117029256, "grad_norm": 0.16238971054553986, "learning_rate": 1.2389653600508588e-08, "loss": 0.0211, "step": 2649 }, { "epoch": 1.9879969992498125, "grad_norm": 0.14360111951828003, "learning_rate": 1.0974969826288428e-08, "loss": 0.0172, "step": 2650 }, { "epoch": 1.988747186796699, "grad_norm": 0.10696499794721603, "learning_rate": 9.646012307490405e-09, "loss": 0.0112, "step": 2651 }, { "epoch": 1.989497374343586, "grad_norm": 0.19679325819015503, "learning_rate": 8.402783323147034e-09, "loss": 0.0238, "step": 2652 }, { "epoch": 1.9902475618904725, "grad_norm": 0.10451437532901764, "learning_rate": 7.245285005275104e-09, "loss": 0.0134, "step": 2653 }, { "epoch": 1.9909977494373594, "grad_norm": 0.06075247749686241, "learning_rate": 6.1735193388701155e-09, "loss": 0.0077, "step": 2654 }, { "epoch": 1.991747936984246, "grad_norm": 0.07771611213684082, "learning_rate": 5.187488161895182e-09, "loss": 0.0071, "step": 2655 }, { "epoch": 1.9924981245311328, "grad_norm": 0.1421760767698288, "learning_rate": 4.28719316531434e-09, "loss": 0.0187, "step": 2656 }, { "epoch": 1.9932483120780196, "grad_norm": 0.11837299168109894, "learning_rate": 3.4726358930259328e-09, "loss": 0.0166, "step": 2657 }, { "epoch": 1.9939984996249063, "grad_norm": 0.1275044083595276, "learning_rate": 2.743817741929222e-09, "loss": 0.0234, "step": 2658 }, { "epoch": 1.994748687171793, "grad_norm": 0.11450985819101334, "learning_rate": 2.1007399618688807e-09, "loss": 0.0121, "step": 2659 }, { "epoch": 1.9954988747186797, "grad_norm": 0.13041210174560547, "learning_rate": 1.543403655662745e-09, "loss": 0.0182, "step": 2660 }, { "epoch": 1.9962490622655664, "grad_norm": 0.1500764936208725, "learning_rate": 1.0718097790907156e-09, "loss": 0.0188, "step": 2661 }, { "epoch": 1.9969992498124531, "grad_norm": 0.07319959998130798, "learning_rate": 6.859591408836519e-10, "loss": 0.0079, "step": 2662 }, { "epoch": 1.9977494373593399, "grad_norm": 0.1447007954120636, "learning_rate": 3.8585240273447677e-10, "loss": 0.0202, "step": 2663 }, { "epoch": 1.9984996249062266, "grad_norm": 0.09215506911277771, "learning_rate": 1.7149007930927773e-10, "loss": 0.0138, "step": 2664 }, { "epoch": 1.9992498124531133, "grad_norm": 0.08019605278968811, "learning_rate": 4.2872538208449386e-11, "loss": 0.0086, "step": 2665 }, { "epoch": 2.0, "grad_norm": 0.1368834525346756, "learning_rate": 0.0, "loss": 0.02, "step": 2666 }, { "epoch": 2.0, "step": 2666, "total_flos": 1.0511156991219466e+18, "train_loss": 0.02966136350740091, "train_runtime": 3474.0812, "train_samples_per_second": 3.068, "train_steps_per_second": 0.767 } ], "logging_steps": 1, "max_steps": 2666, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0511156991219466e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }