{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.008840116849063313, "eval_steps": 500, "global_step": 1330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.646704397791965e-06, "grad_norm": 0.2631438672542572, "learning_rate": 2e-05, "loss": 1.1562, "step": 1 }, { "epoch": 1.329340879558393e-05, "grad_norm": 0.5983430743217468, "learning_rate": 4e-05, "loss": 1.2192, "step": 2 }, { "epoch": 1.9940113193375895e-05, "grad_norm": 0.5146092176437378, "learning_rate": 6e-05, "loss": 1.2959, "step": 3 }, { "epoch": 2.658681759116786e-05, "grad_norm": 0.495796799659729, "learning_rate": 8e-05, "loss": 1.1495, "step": 4 }, { "epoch": 3.3233521988959826e-05, "grad_norm": 0.6830540895462036, "learning_rate": 0.0001, "loss": 1.2563, "step": 5 }, { "epoch": 3.988022638675179e-05, "grad_norm": 0.6696339249610901, "learning_rate": 9.999985945746134e-05, "loss": 1.2423, "step": 6 }, { "epoch": 4.6526930784543754e-05, "grad_norm": 0.6810370683670044, "learning_rate": 9.999943783063545e-05, "loss": 1.4062, "step": 7 }, { "epoch": 5.317363518233572e-05, "grad_norm": 0.49856042861938477, "learning_rate": 9.999873512189259e-05, "loss": 1.0917, "step": 8 }, { "epoch": 5.982033958012768e-05, "grad_norm": 0.5586492419242859, "learning_rate": 9.999775133518317e-05, "loss": 1.2181, "step": 9 }, { "epoch": 6.646704397791965e-05, "grad_norm": 1.1063677072525024, "learning_rate": 9.999648647603774e-05, "loss": 1.4241, "step": 10 }, { "epoch": 7.311374837571161e-05, "grad_norm": 1.0093430280685425, "learning_rate": 9.9994940551567e-05, "loss": 1.3167, "step": 11 }, { "epoch": 7.976045277350358e-05, "grad_norm": 0.8735777735710144, "learning_rate": 9.999311357046163e-05, "loss": 1.2228, "step": 12 }, { "epoch": 8.640715717129554e-05, "grad_norm": 0.7596926093101501, "learning_rate": 9.999100554299239e-05, "loss": 1.2321, "step": 13 }, { "epoch": 9.305386156908751e-05, "grad_norm": 0.9295351505279541, "learning_rate": 9.998861648100999e-05, "loss": 1.2999, "step": 14 }, { "epoch": 9.970056596687947e-05, "grad_norm": 1.039036512374878, "learning_rate": 9.998594639794501e-05, "loss": 1.3474, "step": 15 }, { "epoch": 0.00010634727036467144, "grad_norm": 1.303148627281189, "learning_rate": 9.998299530880787e-05, "loss": 1.3453, "step": 16 }, { "epoch": 0.00011299397476246341, "grad_norm": 0.9711621403694153, "learning_rate": 9.997976323018871e-05, "loss": 1.19, "step": 17 }, { "epoch": 0.00011964067916025536, "grad_norm": 1.0144423246383667, "learning_rate": 9.997625018025731e-05, "loss": 1.2248, "step": 18 }, { "epoch": 0.00012628738355804732, "grad_norm": 0.8451800346374512, "learning_rate": 9.997245617876297e-05, "loss": 1.07, "step": 19 }, { "epoch": 0.0001329340879558393, "grad_norm": 1.1662631034851074, "learning_rate": 9.996838124703447e-05, "loss": 1.0376, "step": 20 }, { "epoch": 0.00013958079235363126, "grad_norm": 1.2466472387313843, "learning_rate": 9.996402540797985e-05, "loss": 1.0342, "step": 21 }, { "epoch": 0.00014622749675142322, "grad_norm": 1.228630542755127, "learning_rate": 9.99593886860863e-05, "loss": 1.0391, "step": 22 }, { "epoch": 0.00015287420114921518, "grad_norm": 1.3177781105041504, "learning_rate": 9.995447110742014e-05, "loss": 0.8845, "step": 23 }, { "epoch": 0.00015952090554700716, "grad_norm": 1.3232067823410034, "learning_rate": 9.994927269962649e-05, "loss": 1.2801, "step": 24 }, { "epoch": 0.00016616760994479912, "grad_norm": 1.350176215171814, "learning_rate": 9.994379349192926e-05, "loss": 0.9231, "step": 25 }, { "epoch": 0.00017281431434259108, "grad_norm": 1.6647334098815918, "learning_rate": 9.993803351513094e-05, "loss": 1.059, "step": 26 }, { "epoch": 0.00017946101874038306, "grad_norm": 1.6989617347717285, "learning_rate": 9.993199280161237e-05, "loss": 1.288, "step": 27 }, { "epoch": 0.00018610772313817502, "grad_norm": 1.9089605808258057, "learning_rate": 9.992567138533267e-05, "loss": 0.8938, "step": 28 }, { "epoch": 0.00019275442753596697, "grad_norm": 1.659945011138916, "learning_rate": 9.99190693018289e-05, "loss": 0.8532, "step": 29 }, { "epoch": 0.00019940113193375893, "grad_norm": 1.974351167678833, "learning_rate": 9.991218658821608e-05, "loss": 0.9827, "step": 30 }, { "epoch": 0.00020604783633155092, "grad_norm": 2.1737191677093506, "learning_rate": 9.990502328318671e-05, "loss": 1.1886, "step": 31 }, { "epoch": 0.00021269454072934287, "grad_norm": 2.226149320602417, "learning_rate": 9.989757942701078e-05, "loss": 1.2019, "step": 32 }, { "epoch": 0.00021934124512713483, "grad_norm": 2.3712666034698486, "learning_rate": 9.988985506153543e-05, "loss": 1.146, "step": 33 }, { "epoch": 0.00022598794952492681, "grad_norm": 1.9496679306030273, "learning_rate": 9.988185023018472e-05, "loss": 0.8585, "step": 34 }, { "epoch": 0.00023263465392271877, "grad_norm": 1.7973573207855225, "learning_rate": 9.987356497795943e-05, "loss": 1.2078, "step": 35 }, { "epoch": 0.00023928135832051073, "grad_norm": 1.8953720331192017, "learning_rate": 9.986499935143679e-05, "loss": 1.0214, "step": 36 }, { "epoch": 0.0002459280627183027, "grad_norm": 2.2118096351623535, "learning_rate": 9.985615339877018e-05, "loss": 0.9515, "step": 37 }, { "epoch": 0.00025257476711609464, "grad_norm": 2.447432279586792, "learning_rate": 9.98470271696889e-05, "loss": 1.0976, "step": 38 }, { "epoch": 0.0002592214715138866, "grad_norm": 2.1565799713134766, "learning_rate": 9.983762071549792e-05, "loss": 0.866, "step": 39 }, { "epoch": 0.0002658681759116786, "grad_norm": 1.9369972944259644, "learning_rate": 9.982793408907747e-05, "loss": 0.8828, "step": 40 }, { "epoch": 0.00027251488030947057, "grad_norm": 3.0585219860076904, "learning_rate": 9.981796734488292e-05, "loss": 1.0994, "step": 41 }, { "epoch": 0.0002791615847072625, "grad_norm": 1.9319347143173218, "learning_rate": 9.980772053894427e-05, "loss": 0.8895, "step": 42 }, { "epoch": 0.0002858082891050545, "grad_norm": 2.4317328929901123, "learning_rate": 9.979719372886607e-05, "loss": 1.1237, "step": 43 }, { "epoch": 0.00029245499350284644, "grad_norm": 2.063292980194092, "learning_rate": 9.978638697382687e-05, "loss": 1.0101, "step": 44 }, { "epoch": 0.0002991016979006384, "grad_norm": 1.9541330337524414, "learning_rate": 9.977530033457905e-05, "loss": 1.1236, "step": 45 }, { "epoch": 0.00030574840229843035, "grad_norm": 3.488677501678467, "learning_rate": 9.976393387344834e-05, "loss": 1.2321, "step": 46 }, { "epoch": 0.00031239510669622237, "grad_norm": 2.380718469619751, "learning_rate": 9.975228765433362e-05, "loss": 0.913, "step": 47 }, { "epoch": 0.0003190418110940143, "grad_norm": 2.557265281677246, "learning_rate": 9.974036174270647e-05, "loss": 0.9253, "step": 48 }, { "epoch": 0.0003256885154918063, "grad_norm": 3.2321486473083496, "learning_rate": 9.972815620561077e-05, "loss": 0.9996, "step": 49 }, { "epoch": 0.00033233521988959824, "grad_norm": 4.961748123168945, "learning_rate": 9.971567111166246e-05, "loss": 1.2718, "step": 50 }, { "epoch": 0.0003389819242873902, "grad_norm": 1.1540372371673584, "learning_rate": 9.970290653104896e-05, "loss": 1.2866, "step": 51 }, { "epoch": 0.00034562862868518215, "grad_norm": 1.4799156188964844, "learning_rate": 9.968986253552895e-05, "loss": 1.0999, "step": 52 }, { "epoch": 0.0003522753330829741, "grad_norm": 1.2287836074829102, "learning_rate": 9.96765391984319e-05, "loss": 1.0499, "step": 53 }, { "epoch": 0.0003589220374807661, "grad_norm": 1.2238783836364746, "learning_rate": 9.966293659465759e-05, "loss": 1.114, "step": 54 }, { "epoch": 0.0003655687418785581, "grad_norm": 0.9259488582611084, "learning_rate": 9.964905480067586e-05, "loss": 0.8557, "step": 55 }, { "epoch": 0.00037221544627635004, "grad_norm": 0.8354536890983582, "learning_rate": 9.963489389452596e-05, "loss": 0.9913, "step": 56 }, { "epoch": 0.000378862150674142, "grad_norm": 1.1435717344284058, "learning_rate": 9.96204539558163e-05, "loss": 0.782, "step": 57 }, { "epoch": 0.00038550885507193395, "grad_norm": 0.8060230016708374, "learning_rate": 9.96057350657239e-05, "loss": 1.1333, "step": 58 }, { "epoch": 0.0003921555594697259, "grad_norm": 0.6990379095077515, "learning_rate": 9.959073730699397e-05, "loss": 0.9485, "step": 59 }, { "epoch": 0.00039880226386751786, "grad_norm": 0.6797767281532288, "learning_rate": 9.957546076393943e-05, "loss": 0.8461, "step": 60 }, { "epoch": 0.0004054489682653099, "grad_norm": 1.1346392631530762, "learning_rate": 9.955990552244045e-05, "loss": 1.0013, "step": 61 }, { "epoch": 0.00041209567266310183, "grad_norm": 1.0199185609817505, "learning_rate": 9.954407166994397e-05, "loss": 1.0349, "step": 62 }, { "epoch": 0.0004187423770608938, "grad_norm": 0.6908994913101196, "learning_rate": 9.952795929546314e-05, "loss": 0.8031, "step": 63 }, { "epoch": 0.00042538908145868575, "grad_norm": 0.9055588245391846, "learning_rate": 9.951156848957698e-05, "loss": 0.8597, "step": 64 }, { "epoch": 0.0004320357858564777, "grad_norm": 0.833838939666748, "learning_rate": 9.949489934442966e-05, "loss": 0.802, "step": 65 }, { "epoch": 0.00043868249025426966, "grad_norm": 0.8726608157157898, "learning_rate": 9.947795195373016e-05, "loss": 0.884, "step": 66 }, { "epoch": 0.0004453291946520616, "grad_norm": 0.8528004288673401, "learning_rate": 9.946072641275163e-05, "loss": 0.8926, "step": 67 }, { "epoch": 0.00045197589904985363, "grad_norm": 0.772897481918335, "learning_rate": 9.944322281833096e-05, "loss": 0.8125, "step": 68 }, { "epoch": 0.0004586226034476456, "grad_norm": 0.840191662311554, "learning_rate": 9.942544126886812e-05, "loss": 1.0848, "step": 69 }, { "epoch": 0.00046526930784543754, "grad_norm": 1.2214893102645874, "learning_rate": 9.940738186432565e-05, "loss": 0.8576, "step": 70 }, { "epoch": 0.0004719160122432295, "grad_norm": 0.9319333434104919, "learning_rate": 9.938904470622815e-05, "loss": 0.7976, "step": 71 }, { "epoch": 0.00047856271664102146, "grad_norm": 1.0511255264282227, "learning_rate": 9.937042989766165e-05, "loss": 0.6438, "step": 72 }, { "epoch": 0.0004852094210388134, "grad_norm": 1.0021990537643433, "learning_rate": 9.935153754327304e-05, "loss": 0.8932, "step": 73 }, { "epoch": 0.0004918561254366054, "grad_norm": 1.6561142206192017, "learning_rate": 9.933236774926952e-05, "loss": 0.6266, "step": 74 }, { "epoch": 0.0004985028298343973, "grad_norm": 1.1879206895828247, "learning_rate": 9.931292062341793e-05, "loss": 1.0754, "step": 75 }, { "epoch": 0.0005051495342321893, "grad_norm": 1.2196751832962036, "learning_rate": 9.92931962750442e-05, "loss": 0.5184, "step": 76 }, { "epoch": 0.0005117962386299812, "grad_norm": 1.3561830520629883, "learning_rate": 9.927319481503277e-05, "loss": 1.0951, "step": 77 }, { "epoch": 0.0005184429430277732, "grad_norm": 1.2481271028518677, "learning_rate": 9.925291635582583e-05, "loss": 0.8269, "step": 78 }, { "epoch": 0.0005250896474255653, "grad_norm": 1.957845687866211, "learning_rate": 9.923236101142286e-05, "loss": 1.1767, "step": 79 }, { "epoch": 0.0005317363518233572, "grad_norm": 1.5084407329559326, "learning_rate": 9.921152889737984e-05, "loss": 0.695, "step": 80 }, { "epoch": 0.0005383830562211492, "grad_norm": 1.272848129272461, "learning_rate": 9.919042013080873e-05, "loss": 0.9717, "step": 81 }, { "epoch": 0.0005450297606189411, "grad_norm": 1.281510353088379, "learning_rate": 9.91690348303767e-05, "loss": 0.7004, "step": 82 }, { "epoch": 0.0005516764650167331, "grad_norm": 1.6305228471755981, "learning_rate": 9.914737311630553e-05, "loss": 0.8992, "step": 83 }, { "epoch": 0.000558323169414525, "grad_norm": 1.8635321855545044, "learning_rate": 9.912543511037091e-05, "loss": 0.7849, "step": 84 }, { "epoch": 0.000564969873812317, "grad_norm": 1.5316699743270874, "learning_rate": 9.910322093590177e-05, "loss": 1.2306, "step": 85 }, { "epoch": 0.000571616578210109, "grad_norm": 1.468440294265747, "learning_rate": 9.908073071777954e-05, "loss": 1.0128, "step": 86 }, { "epoch": 0.0005782632826079009, "grad_norm": 1.618682622909546, "learning_rate": 9.905796458243755e-05, "loss": 0.7572, "step": 87 }, { "epoch": 0.0005849099870056929, "grad_norm": 1.650620698928833, "learning_rate": 9.903492265786019e-05, "loss": 0.6693, "step": 88 }, { "epoch": 0.0005915566914034848, "grad_norm": 1.364016056060791, "learning_rate": 9.901160507358232e-05, "loss": 0.8004, "step": 89 }, { "epoch": 0.0005982033958012768, "grad_norm": 1.856819987297058, "learning_rate": 9.898801196068839e-05, "loss": 0.9729, "step": 90 }, { "epoch": 0.0006048501001990688, "grad_norm": 1.8669190406799316, "learning_rate": 9.896414345181188e-05, "loss": 0.764, "step": 91 }, { "epoch": 0.0006114968045968607, "grad_norm": 2.1475913524627686, "learning_rate": 9.893999968113439e-05, "loss": 0.9721, "step": 92 }, { "epoch": 0.0006181435089946528, "grad_norm": 2.5458381175994873, "learning_rate": 9.891558078438503e-05, "loss": 0.8875, "step": 93 }, { "epoch": 0.0006247902133924447, "grad_norm": 2.4093551635742188, "learning_rate": 9.889088689883952e-05, "loss": 0.928, "step": 94 }, { "epoch": 0.0006314369177902367, "grad_norm": 2.540717601776123, "learning_rate": 9.886591816331954e-05, "loss": 0.8347, "step": 95 }, { "epoch": 0.0006380836221880286, "grad_norm": 2.4616358280181885, "learning_rate": 9.884067471819184e-05, "loss": 0.7776, "step": 96 }, { "epoch": 0.0006447303265858206, "grad_norm": 2.8362159729003906, "learning_rate": 9.881515670536755e-05, "loss": 1.2073, "step": 97 }, { "epoch": 0.0006513770309836126, "grad_norm": 2.2793056964874268, "learning_rate": 9.878936426830132e-05, "loss": 1.0833, "step": 98 }, { "epoch": 0.0006580237353814045, "grad_norm": 2.6168718338012695, "learning_rate": 9.876329755199054e-05, "loss": 1.0175, "step": 99 }, { "epoch": 0.0006646704397791965, "grad_norm": 5.154669284820557, "learning_rate": 9.87369567029745e-05, "loss": 1.0954, "step": 100 }, { "epoch": 0.0006713171441769884, "grad_norm": 0.648977518081665, "learning_rate": 9.87103418693336e-05, "loss": 0.861, "step": 101 }, { "epoch": 0.0006779638485747804, "grad_norm": 0.6909152269363403, "learning_rate": 9.868345320068847e-05, "loss": 0.9495, "step": 102 }, { "epoch": 0.0006846105529725723, "grad_norm": 1.014430046081543, "learning_rate": 9.865629084819923e-05, "loss": 1.0735, "step": 103 }, { "epoch": 0.0006912572573703643, "grad_norm": 1.1070421934127808, "learning_rate": 9.862885496456447e-05, "loss": 0.7958, "step": 104 }, { "epoch": 0.0006979039617681563, "grad_norm": 0.7281860709190369, "learning_rate": 9.860114570402054e-05, "loss": 1.052, "step": 105 }, { "epoch": 0.0007045506661659482, "grad_norm": 0.8016036748886108, "learning_rate": 9.857316322234067e-05, "loss": 0.8589, "step": 106 }, { "epoch": 0.0007111973705637403, "grad_norm": 0.7948988080024719, "learning_rate": 9.854490767683397e-05, "loss": 0.8733, "step": 107 }, { "epoch": 0.0007178440749615322, "grad_norm": 0.8897978067398071, "learning_rate": 9.851637922634475e-05, "loss": 1.1144, "step": 108 }, { "epoch": 0.0007244907793593242, "grad_norm": 0.7510639429092407, "learning_rate": 9.848757803125138e-05, "loss": 0.8166, "step": 109 }, { "epoch": 0.0007311374837571162, "grad_norm": 1.0977451801300049, "learning_rate": 9.845850425346563e-05, "loss": 0.796, "step": 110 }, { "epoch": 0.0007377841881549081, "grad_norm": 0.7391382455825806, "learning_rate": 9.842915805643155e-05, "loss": 1.0735, "step": 111 }, { "epoch": 0.0007444308925527001, "grad_norm": 0.8569241166114807, "learning_rate": 9.839953960512476e-05, "loss": 0.9633, "step": 112 }, { "epoch": 0.000751077596950492, "grad_norm": 0.8037195801734924, "learning_rate": 9.836964906605133e-05, "loss": 0.9107, "step": 113 }, { "epoch": 0.000757724301348284, "grad_norm": 0.7987781167030334, "learning_rate": 9.833948660724692e-05, "loss": 0.9367, "step": 114 }, { "epoch": 0.0007643710057460759, "grad_norm": 0.8678058385848999, "learning_rate": 9.830905239827593e-05, "loss": 1.1163, "step": 115 }, { "epoch": 0.0007710177101438679, "grad_norm": 0.8879592418670654, "learning_rate": 9.827834661023034e-05, "loss": 0.8784, "step": 116 }, { "epoch": 0.0007776644145416599, "grad_norm": 0.9234637022018433, "learning_rate": 9.824736941572896e-05, "loss": 0.9235, "step": 117 }, { "epoch": 0.0007843111189394518, "grad_norm": 0.8318954706192017, "learning_rate": 9.821612098891632e-05, "loss": 1.0028, "step": 118 }, { "epoch": 0.0007909578233372438, "grad_norm": 0.8476008176803589, "learning_rate": 9.818460150546177e-05, "loss": 0.705, "step": 119 }, { "epoch": 0.0007976045277350357, "grad_norm": 1.0558176040649414, "learning_rate": 9.815281114255841e-05, "loss": 0.9652, "step": 120 }, { "epoch": 0.0008042512321328278, "grad_norm": 0.8651800155639648, "learning_rate": 9.812075007892218e-05, "loss": 1.0003, "step": 121 }, { "epoch": 0.0008108979365306198, "grad_norm": 0.8710187673568726, "learning_rate": 9.808841849479084e-05, "loss": 1.0046, "step": 122 }, { "epoch": 0.0008175446409284117, "grad_norm": 1.2119121551513672, "learning_rate": 9.805581657192287e-05, "loss": 0.8402, "step": 123 }, { "epoch": 0.0008241913453262037, "grad_norm": 1.083730697631836, "learning_rate": 9.802294449359656e-05, "loss": 0.79, "step": 124 }, { "epoch": 0.0008308380497239956, "grad_norm": 1.0458542108535767, "learning_rate": 9.798980244460893e-05, "loss": 0.9391, "step": 125 }, { "epoch": 0.0008374847541217876, "grad_norm": 0.9619566202163696, "learning_rate": 9.795639061127468e-05, "loss": 0.9366, "step": 126 }, { "epoch": 0.0008441314585195795, "grad_norm": 1.7271543741226196, "learning_rate": 9.792270918142518e-05, "loss": 0.5437, "step": 127 }, { "epoch": 0.0008507781629173715, "grad_norm": 1.005113124847412, "learning_rate": 9.788875834440737e-05, "loss": 0.7217, "step": 128 }, { "epoch": 0.0008574248673151635, "grad_norm": 1.5306681394577026, "learning_rate": 9.78545382910827e-05, "loss": 0.7353, "step": 129 }, { "epoch": 0.0008640715717129554, "grad_norm": 1.151214838027954, "learning_rate": 9.782004921382612e-05, "loss": 0.8793, "step": 130 }, { "epoch": 0.0008707182761107474, "grad_norm": 1.0847221612930298, "learning_rate": 9.778529130652494e-05, "loss": 0.8695, "step": 131 }, { "epoch": 0.0008773649805085393, "grad_norm": 1.0828322172164917, "learning_rate": 9.775026476457771e-05, "loss": 0.8751, "step": 132 }, { "epoch": 0.0008840116849063313, "grad_norm": 1.0218029022216797, "learning_rate": 9.771496978489323e-05, "loss": 0.8676, "step": 133 }, { "epoch": 0.0008906583893041232, "grad_norm": 1.2020763158798218, "learning_rate": 9.767940656588931e-05, "loss": 0.973, "step": 134 }, { "epoch": 0.0008973050937019153, "grad_norm": 1.169690489768982, "learning_rate": 9.764357530749178e-05, "loss": 0.7805, "step": 135 }, { "epoch": 0.0009039517980997073, "grad_norm": 1.8148276805877686, "learning_rate": 9.760747621113325e-05, "loss": 0.9686, "step": 136 }, { "epoch": 0.0009105985024974992, "grad_norm": 1.2379844188690186, "learning_rate": 9.757110947975208e-05, "loss": 0.9013, "step": 137 }, { "epoch": 0.0009172452068952912, "grad_norm": 1.8107781410217285, "learning_rate": 9.75344753177912e-05, "loss": 0.7211, "step": 138 }, { "epoch": 0.0009238919112930831, "grad_norm": 1.312720775604248, "learning_rate": 9.749757393119691e-05, "loss": 0.8022, "step": 139 }, { "epoch": 0.0009305386156908751, "grad_norm": 1.9345266819000244, "learning_rate": 9.74604055274178e-05, "loss": 0.5008, "step": 140 }, { "epoch": 0.000937185320088667, "grad_norm": 1.7821546792984009, "learning_rate": 9.742297031540354e-05, "loss": 0.8974, "step": 141 }, { "epoch": 0.000943832024486459, "grad_norm": 1.2601072788238525, "learning_rate": 9.738526850560373e-05, "loss": 0.5588, "step": 142 }, { "epoch": 0.000950478728884251, "grad_norm": 1.8271839618682861, "learning_rate": 9.734730030996669e-05, "loss": 0.6667, "step": 143 }, { "epoch": 0.0009571254332820429, "grad_norm": 2.0186102390289307, "learning_rate": 9.730906594193824e-05, "loss": 0.8857, "step": 144 }, { "epoch": 0.0009637721376798349, "grad_norm": 1.6370420455932617, "learning_rate": 9.727056561646066e-05, "loss": 0.8406, "step": 145 }, { "epoch": 0.0009704188420776268, "grad_norm": 2.60221529006958, "learning_rate": 9.723179954997125e-05, "loss": 1.0972, "step": 146 }, { "epoch": 0.0009770655464754188, "grad_norm": 2.049570322036743, "learning_rate": 9.719276796040127e-05, "loss": 0.8639, "step": 147 }, { "epoch": 0.0009837122508732107, "grad_norm": 2.249347686767578, "learning_rate": 9.715347106717467e-05, "loss": 0.7253, "step": 148 }, { "epoch": 0.0009903589552710027, "grad_norm": 2.40408992767334, "learning_rate": 9.711390909120686e-05, "loss": 0.7013, "step": 149 }, { "epoch": 0.0009970056596687947, "grad_norm": 4.417992115020752, "learning_rate": 9.707408225490344e-05, "loss": 1.007, "step": 150 }, { "epoch": 0.0010036523640665866, "grad_norm": 0.5938814878463745, "learning_rate": 9.7033990782159e-05, "loss": 1.2118, "step": 151 }, { "epoch": 0.0010102990684643786, "grad_norm": 0.6780165433883667, "learning_rate": 9.699363489835586e-05, "loss": 1.0222, "step": 152 }, { "epoch": 0.0010169457728621705, "grad_norm": 0.7802003026008606, "learning_rate": 9.695301483036275e-05, "loss": 0.8664, "step": 153 }, { "epoch": 0.0010235924772599625, "grad_norm": 0.6593429446220398, "learning_rate": 9.691213080653356e-05, "loss": 0.793, "step": 154 }, { "epoch": 0.0010302391816577544, "grad_norm": 0.7949785590171814, "learning_rate": 9.687098305670605e-05, "loss": 0.8327, "step": 155 }, { "epoch": 0.0010368858860555464, "grad_norm": 0.608187198638916, "learning_rate": 9.682957181220062e-05, "loss": 0.8268, "step": 156 }, { "epoch": 0.0010435325904533386, "grad_norm": 0.6817964911460876, "learning_rate": 9.678789730581892e-05, "loss": 0.9741, "step": 157 }, { "epoch": 0.0010501792948511305, "grad_norm": 1.039652705192566, "learning_rate": 9.674595977184256e-05, "loss": 0.958, "step": 158 }, { "epoch": 0.0010568259992489225, "grad_norm": 0.8586385250091553, "learning_rate": 9.670375944603189e-05, "loss": 0.7179, "step": 159 }, { "epoch": 0.0010634727036467144, "grad_norm": 1.2006994485855103, "learning_rate": 9.66612965656245e-05, "loss": 0.9916, "step": 160 }, { "epoch": 0.0010701194080445064, "grad_norm": 0.6251561641693115, "learning_rate": 9.661857136933405e-05, "loss": 0.8783, "step": 161 }, { "epoch": 0.0010767661124422984, "grad_norm": 0.9916641712188721, "learning_rate": 9.657558409734886e-05, "loss": 1.0163, "step": 162 }, { "epoch": 0.0010834128168400903, "grad_norm": 0.8104830980300903, "learning_rate": 9.65323349913305e-05, "loss": 0.8032, "step": 163 }, { "epoch": 0.0010900595212378823, "grad_norm": 0.7668328881263733, "learning_rate": 9.648882429441257e-05, "loss": 0.8551, "step": 164 }, { "epoch": 0.0010967062256356742, "grad_norm": 0.7517951130867004, "learning_rate": 9.644505225119922e-05, "loss": 0.9447, "step": 165 }, { "epoch": 0.0011033529300334662, "grad_norm": 0.8532454371452332, "learning_rate": 9.640101910776381e-05, "loss": 0.8546, "step": 166 }, { "epoch": 0.0011099996344312581, "grad_norm": 0.7892952561378479, "learning_rate": 9.635672511164751e-05, "loss": 0.858, "step": 167 }, { "epoch": 0.00111664633882905, "grad_norm": 0.7072875499725342, "learning_rate": 9.631217051185797e-05, "loss": 0.9406, "step": 168 }, { "epoch": 0.001123293043226842, "grad_norm": 0.8287038803100586, "learning_rate": 9.626735555886785e-05, "loss": 0.8493, "step": 169 }, { "epoch": 0.001129939747624634, "grad_norm": 0.8774399161338806, "learning_rate": 9.622228050461343e-05, "loss": 0.8884, "step": 170 }, { "epoch": 0.001136586452022426, "grad_norm": 0.9374727010726929, "learning_rate": 9.617694560249322e-05, "loss": 0.7065, "step": 171 }, { "epoch": 0.001143233156420218, "grad_norm": 0.9686769843101501, "learning_rate": 9.61313511073665e-05, "loss": 1.1187, "step": 172 }, { "epoch": 0.0011498798608180099, "grad_norm": 1.72421395778656, "learning_rate": 9.608549727555195e-05, "loss": 0.8343, "step": 173 }, { "epoch": 0.0011565265652158018, "grad_norm": 0.9585894346237183, "learning_rate": 9.603938436482608e-05, "loss": 0.9726, "step": 174 }, { "epoch": 0.0011631732696135938, "grad_norm": 0.9020530581474304, "learning_rate": 9.599301263442192e-05, "loss": 1.0393, "step": 175 }, { "epoch": 0.0011698199740113858, "grad_norm": 1.0052881240844727, "learning_rate": 9.594638234502753e-05, "loss": 0.8441, "step": 176 }, { "epoch": 0.0011764666784091777, "grad_norm": 1.2736499309539795, "learning_rate": 9.589949375878444e-05, "loss": 1.0913, "step": 177 }, { "epoch": 0.0011831133828069697, "grad_norm": 1.2498910427093506, "learning_rate": 9.585234713928633e-05, "loss": 0.8139, "step": 178 }, { "epoch": 0.0011897600872047616, "grad_norm": 1.167474389076233, "learning_rate": 9.580494275157737e-05, "loss": 0.8243, "step": 179 }, { "epoch": 0.0011964067916025536, "grad_norm": 1.0626884698867798, "learning_rate": 9.575728086215092e-05, "loss": 0.8134, "step": 180 }, { "epoch": 0.0012030534960003455, "grad_norm": 1.2147183418273926, "learning_rate": 9.57093617389479e-05, "loss": 0.848, "step": 181 }, { "epoch": 0.0012097002003981375, "grad_norm": 1.1623990535736084, "learning_rate": 9.56611856513553e-05, "loss": 0.7649, "step": 182 }, { "epoch": 0.0012163469047959295, "grad_norm": 0.9728236794471741, "learning_rate": 9.561275287020472e-05, "loss": 0.8265, "step": 183 }, { "epoch": 0.0012229936091937214, "grad_norm": 1.405484676361084, "learning_rate": 9.556406366777077e-05, "loss": 0.9288, "step": 184 }, { "epoch": 0.0012296403135915136, "grad_norm": 0.9838158488273621, "learning_rate": 9.551511831776965e-05, "loss": 0.6958, "step": 185 }, { "epoch": 0.0012362870179893056, "grad_norm": 1.1341612339019775, "learning_rate": 9.546591709535751e-05, "loss": 0.4824, "step": 186 }, { "epoch": 0.0012429337223870975, "grad_norm": 1.1700488328933716, "learning_rate": 9.541646027712892e-05, "loss": 0.6101, "step": 187 }, { "epoch": 0.0012495804267848895, "grad_norm": 1.3073359727859497, "learning_rate": 9.536674814111535e-05, "loss": 0.8391, "step": 188 }, { "epoch": 0.0012562271311826814, "grad_norm": 1.2790961265563965, "learning_rate": 9.53167809667836e-05, "loss": 0.6953, "step": 189 }, { "epoch": 0.0012628738355804734, "grad_norm": 1.3291611671447754, "learning_rate": 9.526655903503423e-05, "loss": 0.8073, "step": 190 }, { "epoch": 0.0012695205399782653, "grad_norm": 1.3117409944534302, "learning_rate": 9.52160826281999e-05, "loss": 0.7799, "step": 191 }, { "epoch": 0.0012761672443760573, "grad_norm": 1.531928300857544, "learning_rate": 9.516535203004395e-05, "loss": 0.7419, "step": 192 }, { "epoch": 0.0012828139487738493, "grad_norm": 1.9491792917251587, "learning_rate": 9.511436752575866e-05, "loss": 0.776, "step": 193 }, { "epoch": 0.0012894606531716412, "grad_norm": 2.464545249938965, "learning_rate": 9.506312940196367e-05, "loss": 0.7564, "step": 194 }, { "epoch": 0.0012961073575694332, "grad_norm": 1.9504872560501099, "learning_rate": 9.501163794670444e-05, "loss": 0.9005, "step": 195 }, { "epoch": 0.0013027540619672251, "grad_norm": 2.1457152366638184, "learning_rate": 9.495989344945056e-05, "loss": 0.9662, "step": 196 }, { "epoch": 0.001309400766365017, "grad_norm": 2.0072641372680664, "learning_rate": 9.490789620109415e-05, "loss": 0.7599, "step": 197 }, { "epoch": 0.001316047470762809, "grad_norm": 3.2454893589019775, "learning_rate": 9.485564649394822e-05, "loss": 1.0149, "step": 198 }, { "epoch": 0.001322694175160601, "grad_norm": 2.1446099281311035, "learning_rate": 9.480314462174502e-05, "loss": 0.9446, "step": 199 }, { "epoch": 0.001329340879558393, "grad_norm": 6.221963405609131, "learning_rate": 9.475039087963442e-05, "loss": 1.1899, "step": 200 }, { "epoch": 0.001335987583956185, "grad_norm": 0.3930818736553192, "learning_rate": 9.469738556418222e-05, "loss": 0.7966, "step": 201 }, { "epoch": 0.0013426342883539769, "grad_norm": 0.817714512348175, "learning_rate": 9.464412897336845e-05, "loss": 0.8251, "step": 202 }, { "epoch": 0.0013492809927517688, "grad_norm": 0.7453000545501709, "learning_rate": 9.459062140658582e-05, "loss": 0.9693, "step": 203 }, { "epoch": 0.0013559276971495608, "grad_norm": 0.8357697129249573, "learning_rate": 9.453686316463786e-05, "loss": 0.935, "step": 204 }, { "epoch": 0.0013625744015473527, "grad_norm": 0.8527042269706726, "learning_rate": 9.448285454973738e-05, "loss": 0.9294, "step": 205 }, { "epoch": 0.0013692211059451447, "grad_norm": 0.6852911114692688, "learning_rate": 9.442859586550468e-05, "loss": 0.8999, "step": 206 }, { "epoch": 0.0013758678103429367, "grad_norm": 0.7064734697341919, "learning_rate": 9.437408741696589e-05, "loss": 0.9051, "step": 207 }, { "epoch": 0.0013825145147407286, "grad_norm": 0.8755961060523987, "learning_rate": 9.431932951055127e-05, "loss": 1.073, "step": 208 }, { "epoch": 0.0013891612191385206, "grad_norm": 0.6645509600639343, "learning_rate": 9.426432245409338e-05, "loss": 0.8813, "step": 209 }, { "epoch": 0.0013958079235363125, "grad_norm": 0.9016600847244263, "learning_rate": 9.420906655682553e-05, "loss": 1.026, "step": 210 }, { "epoch": 0.0014024546279341045, "grad_norm": 0.7616488337516785, "learning_rate": 9.415356212937982e-05, "loss": 0.9374, "step": 211 }, { "epoch": 0.0014091013323318964, "grad_norm": 0.9184173345565796, "learning_rate": 9.409780948378562e-05, "loss": 0.7705, "step": 212 }, { "epoch": 0.0014157480367296884, "grad_norm": 0.778388261795044, "learning_rate": 9.404180893346766e-05, "loss": 0.8679, "step": 213 }, { "epoch": 0.0014223947411274806, "grad_norm": 0.9642260670661926, "learning_rate": 9.39855607932443e-05, "loss": 0.8988, "step": 214 }, { "epoch": 0.0014290414455252725, "grad_norm": 0.8209648132324219, "learning_rate": 9.392906537932582e-05, "loss": 0.9862, "step": 215 }, { "epoch": 0.0014356881499230645, "grad_norm": 0.9169635772705078, "learning_rate": 9.387232300931255e-05, "loss": 0.7994, "step": 216 }, { "epoch": 0.0014423348543208564, "grad_norm": 1.3506489992141724, "learning_rate": 9.381533400219318e-05, "loss": 0.9395, "step": 217 }, { "epoch": 0.0014489815587186484, "grad_norm": 1.245642900466919, "learning_rate": 9.37580986783429e-05, "loss": 0.9458, "step": 218 }, { "epoch": 0.0014556282631164404, "grad_norm": 0.8106166124343872, "learning_rate": 9.37006173595216e-05, "loss": 0.6822, "step": 219 }, { "epoch": 0.0014622749675142323, "grad_norm": 1.0076394081115723, "learning_rate": 9.364289036887213e-05, "loss": 0.9702, "step": 220 }, { "epoch": 0.0014689216719120243, "grad_norm": 1.3260512351989746, "learning_rate": 9.358491803091836e-05, "loss": 0.7443, "step": 221 }, { "epoch": 0.0014755683763098162, "grad_norm": 1.3077126741409302, "learning_rate": 9.35267006715635e-05, "loss": 0.9377, "step": 222 }, { "epoch": 0.0014822150807076082, "grad_norm": 1.0499240159988403, "learning_rate": 9.346823861808817e-05, "loss": 0.9677, "step": 223 }, { "epoch": 0.0014888617851054001, "grad_norm": 1.010362982749939, "learning_rate": 9.340953219914859e-05, "loss": 0.9896, "step": 224 }, { "epoch": 0.001495508489503192, "grad_norm": 1.0022648572921753, "learning_rate": 9.335058174477471e-05, "loss": 0.8454, "step": 225 }, { "epoch": 0.001502155193900984, "grad_norm": 1.1706610918045044, "learning_rate": 9.32913875863684e-05, "loss": 0.7352, "step": 226 }, { "epoch": 0.001508801898298776, "grad_norm": 1.2543138265609741, "learning_rate": 9.323195005670154e-05, "loss": 0.9636, "step": 227 }, { "epoch": 0.001515448602696568, "grad_norm": 0.852353572845459, "learning_rate": 9.31722694899142e-05, "loss": 0.7083, "step": 228 }, { "epoch": 0.00152209530709436, "grad_norm": 1.174993872642517, "learning_rate": 9.311234622151271e-05, "loss": 0.883, "step": 229 }, { "epoch": 0.0015287420114921519, "grad_norm": 1.3355340957641602, "learning_rate": 9.305218058836778e-05, "loss": 0.8925, "step": 230 }, { "epoch": 0.0015353887158899438, "grad_norm": 1.2492835521697998, "learning_rate": 9.299177292871267e-05, "loss": 0.6849, "step": 231 }, { "epoch": 0.0015420354202877358, "grad_norm": 1.6142560243606567, "learning_rate": 9.29311235821412e-05, "loss": 0.9287, "step": 232 }, { "epoch": 0.0015486821246855278, "grad_norm": 1.150844693183899, "learning_rate": 9.28702328896059e-05, "loss": 1.0011, "step": 233 }, { "epoch": 0.0015553288290833197, "grad_norm": 1.2073992490768433, "learning_rate": 9.280910119341609e-05, "loss": 0.8405, "step": 234 }, { "epoch": 0.0015619755334811117, "grad_norm": 1.1892738342285156, "learning_rate": 9.274772883723587e-05, "loss": 1.0335, "step": 235 }, { "epoch": 0.0015686222378789036, "grad_norm": 1.4219963550567627, "learning_rate": 9.268611616608236e-05, "loss": 0.7833, "step": 236 }, { "epoch": 0.0015752689422766956, "grad_norm": 1.2685273885726929, "learning_rate": 9.262426352632358e-05, "loss": 0.8043, "step": 237 }, { "epoch": 0.0015819156466744875, "grad_norm": 1.3635826110839844, "learning_rate": 9.256217126567661e-05, "loss": 0.6737, "step": 238 }, { "epoch": 0.0015885623510722795, "grad_norm": 1.39205002784729, "learning_rate": 9.249983973320563e-05, "loss": 0.8379, "step": 239 }, { "epoch": 0.0015952090554700715, "grad_norm": 1.2609238624572754, "learning_rate": 9.243726927931991e-05, "loss": 0.77, "step": 240 }, { "epoch": 0.0016018557598678634, "grad_norm": 2.1187779903411865, "learning_rate": 9.237446025577183e-05, "loss": 0.8866, "step": 241 }, { "epoch": 0.0016085024642656556, "grad_norm": 1.4301778078079224, "learning_rate": 9.231141301565502e-05, "loss": 0.7434, "step": 242 }, { "epoch": 0.0016151491686634475, "grad_norm": 1.489614725112915, "learning_rate": 9.224812791340222e-05, "loss": 0.6912, "step": 243 }, { "epoch": 0.0016217958730612395, "grad_norm": 1.560828685760498, "learning_rate": 9.218460530478338e-05, "loss": 0.9571, "step": 244 }, { "epoch": 0.0016284425774590315, "grad_norm": 1.5755577087402344, "learning_rate": 9.21208455469037e-05, "loss": 0.9114, "step": 245 }, { "epoch": 0.0016350892818568234, "grad_norm": 2.00559663772583, "learning_rate": 9.205684899820147e-05, "loss": 0.7652, "step": 246 }, { "epoch": 0.0016417359862546154, "grad_norm": 3.0371479988098145, "learning_rate": 9.199261601844617e-05, "loss": 0.9046, "step": 247 }, { "epoch": 0.0016483826906524073, "grad_norm": 2.720205307006836, "learning_rate": 9.192814696873646e-05, "loss": 0.9995, "step": 248 }, { "epoch": 0.0016550293950501993, "grad_norm": 1.7048563957214355, "learning_rate": 9.18634422114981e-05, "loss": 0.62, "step": 249 }, { "epoch": 0.0016616760994479912, "grad_norm": 3.4157466888427734, "learning_rate": 9.179850211048193e-05, "loss": 0.5239, "step": 250 }, { "epoch": 0.0016683228038457832, "grad_norm": 0.5009745359420776, "learning_rate": 9.17333270307618e-05, "loss": 1.1691, "step": 251 }, { "epoch": 0.0016749695082435752, "grad_norm": 0.5671596527099609, "learning_rate": 9.166791733873257e-05, "loss": 1.0, "step": 252 }, { "epoch": 0.0016816162126413671, "grad_norm": 0.9397789835929871, "learning_rate": 9.1602273402108e-05, "loss": 0.8239, "step": 253 }, { "epoch": 0.001688262917039159, "grad_norm": 0.6633840799331665, "learning_rate": 9.15363955899187e-05, "loss": 1.1259, "step": 254 }, { "epoch": 0.001694909621436951, "grad_norm": 0.6279160976409912, "learning_rate": 9.14702842725101e-05, "loss": 0.8066, "step": 255 }, { "epoch": 0.001701556325834743, "grad_norm": 0.7420310974121094, "learning_rate": 9.140393982154026e-05, "loss": 0.7877, "step": 256 }, { "epoch": 0.001708203030232535, "grad_norm": 0.7930522561073303, "learning_rate": 9.133736260997789e-05, "loss": 1.0041, "step": 257 }, { "epoch": 0.001714849734630327, "grad_norm": 0.6999163031578064, "learning_rate": 9.127055301210024e-05, "loss": 1.058, "step": 258 }, { "epoch": 0.0017214964390281189, "grad_norm": 0.9659724831581116, "learning_rate": 9.120351140349089e-05, "loss": 0.9217, "step": 259 }, { "epoch": 0.0017281431434259108, "grad_norm": 0.8148461580276489, "learning_rate": 9.113623816103773e-05, "loss": 0.6892, "step": 260 }, { "epoch": 0.0017347898478237028, "grad_norm": 0.7771252989768982, "learning_rate": 9.106873366293093e-05, "loss": 0.654, "step": 261 }, { "epoch": 0.0017414365522214947, "grad_norm": 0.7547912001609802, "learning_rate": 9.100099828866057e-05, "loss": 0.819, "step": 262 }, { "epoch": 0.0017480832566192867, "grad_norm": 1.1141983270645142, "learning_rate": 9.093303241901473e-05, "loss": 0.5581, "step": 263 }, { "epoch": 0.0017547299610170786, "grad_norm": 0.7499803304672241, "learning_rate": 9.086483643607724e-05, "loss": 0.8813, "step": 264 }, { "epoch": 0.0017613766654148706, "grad_norm": 0.7075815796852112, "learning_rate": 9.079641072322556e-05, "loss": 1.1199, "step": 265 }, { "epoch": 0.0017680233698126626, "grad_norm": 0.8884283900260925, "learning_rate": 9.072775566512863e-05, "loss": 0.7737, "step": 266 }, { "epoch": 0.0017746700742104545, "grad_norm": 0.9830721020698547, "learning_rate": 9.065887164774469e-05, "loss": 0.9366, "step": 267 }, { "epoch": 0.0017813167786082465, "grad_norm": 0.7811346054077148, "learning_rate": 9.058975905831914e-05, "loss": 0.8962, "step": 268 }, { "epoch": 0.0017879634830060384, "grad_norm": 0.7821527719497681, "learning_rate": 9.052041828538233e-05, "loss": 0.8307, "step": 269 }, { "epoch": 0.0017946101874038306, "grad_norm": 1.204016089439392, "learning_rate": 9.045084971874738e-05, "loss": 0.8154, "step": 270 }, { "epoch": 0.0018012568918016226, "grad_norm": 0.9158059358596802, "learning_rate": 9.0381053749508e-05, "loss": 0.8118, "step": 271 }, { "epoch": 0.0018079035961994145, "grad_norm": 1.037025809288025, "learning_rate": 9.031103077003634e-05, "loss": 0.6338, "step": 272 }, { "epoch": 0.0018145503005972065, "grad_norm": 0.9192420840263367, "learning_rate": 9.024078117398062e-05, "loss": 0.8015, "step": 273 }, { "epoch": 0.0018211970049949984, "grad_norm": 0.8506931662559509, "learning_rate": 9.017030535626317e-05, "loss": 0.8368, "step": 274 }, { "epoch": 0.0018278437093927904, "grad_norm": 1.1869866847991943, "learning_rate": 9.009960371307798e-05, "loss": 0.8363, "step": 275 }, { "epoch": 0.0018344904137905823, "grad_norm": 1.514348030090332, "learning_rate": 9.002867664188858e-05, "loss": 0.8468, "step": 276 }, { "epoch": 0.0018411371181883743, "grad_norm": 1.3688427209854126, "learning_rate": 8.99575245414258e-05, "loss": 0.727, "step": 277 }, { "epoch": 0.0018477838225861663, "grad_norm": 1.0658832788467407, "learning_rate": 8.988614781168551e-05, "loss": 1.0289, "step": 278 }, { "epoch": 0.0018544305269839582, "grad_norm": 1.4379167556762695, "learning_rate": 8.98145468539264e-05, "loss": 0.9468, "step": 279 }, { "epoch": 0.0018610772313817502, "grad_norm": 1.25478994846344, "learning_rate": 8.974272207066767e-05, "loss": 1.0358, "step": 280 }, { "epoch": 0.0018677239357795421, "grad_norm": 0.9043642282485962, "learning_rate": 8.967067386568682e-05, "loss": 0.8951, "step": 281 }, { "epoch": 0.001874370640177334, "grad_norm": 0.9954394102096558, "learning_rate": 8.959840264401734e-05, "loss": 0.8863, "step": 282 }, { "epoch": 0.001881017344575126, "grad_norm": 1.3022011518478394, "learning_rate": 8.952590881194648e-05, "loss": 0.9069, "step": 283 }, { "epoch": 0.001887664048972918, "grad_norm": 1.2117016315460205, "learning_rate": 8.945319277701293e-05, "loss": 0.7916, "step": 284 }, { "epoch": 0.00189431075337071, "grad_norm": 1.1791529655456543, "learning_rate": 8.938025494800454e-05, "loss": 0.6957, "step": 285 }, { "epoch": 0.001900957457768502, "grad_norm": 1.416491150856018, "learning_rate": 8.930709573495602e-05, "loss": 0.9042, "step": 286 }, { "epoch": 0.0019076041621662939, "grad_norm": 1.0970172882080078, "learning_rate": 8.923371554914659e-05, "loss": 0.732, "step": 287 }, { "epoch": 0.0019142508665640858, "grad_norm": 1.5363656282424927, "learning_rate": 8.916011480309781e-05, "loss": 1.0781, "step": 288 }, { "epoch": 0.0019208975709618778, "grad_norm": 2.711404323577881, "learning_rate": 8.908629391057109e-05, "loss": 0.7206, "step": 289 }, { "epoch": 0.0019275442753596697, "grad_norm": 1.4729957580566406, "learning_rate": 8.901225328656542e-05, "loss": 0.876, "step": 290 }, { "epoch": 0.0019341909797574617, "grad_norm": 1.724304437637329, "learning_rate": 8.893799334731514e-05, "loss": 0.9673, "step": 291 }, { "epoch": 0.0019408376841552537, "grad_norm": 1.5302904844284058, "learning_rate": 8.886351451028745e-05, "loss": 0.8152, "step": 292 }, { "epoch": 0.0019474843885530456, "grad_norm": 1.5126723051071167, "learning_rate": 8.878881719418013e-05, "loss": 0.7496, "step": 293 }, { "epoch": 0.0019541310929508376, "grad_norm": 1.540969967842102, "learning_rate": 8.871390181891921e-05, "loss": 0.804, "step": 294 }, { "epoch": 0.0019607777973486297, "grad_norm": 2.910700798034668, "learning_rate": 8.863876880565656e-05, "loss": 0.8163, "step": 295 }, { "epoch": 0.0019674245017464215, "grad_norm": 1.713547945022583, "learning_rate": 8.856341857676758e-05, "loss": 0.8926, "step": 296 }, { "epoch": 0.0019740712061442137, "grad_norm": 2.224714994430542, "learning_rate": 8.848785155584874e-05, "loss": 0.7339, "step": 297 }, { "epoch": 0.0019807179105420054, "grad_norm": 2.0042951107025146, "learning_rate": 8.841206816771528e-05, "loss": 0.8993, "step": 298 }, { "epoch": 0.0019873646149397976, "grad_norm": 3.4032013416290283, "learning_rate": 8.833606883839881e-05, "loss": 1.0795, "step": 299 }, { "epoch": 0.0019940113193375893, "grad_norm": 3.5310873985290527, "learning_rate": 8.825985399514487e-05, "loss": 0.9497, "step": 300 }, { "epoch": 0.0020006580237353815, "grad_norm": 0.52362060546875, "learning_rate": 8.818342406641056e-05, "loss": 1.0639, "step": 301 }, { "epoch": 0.0020073047281331732, "grad_norm": 0.7458966374397278, "learning_rate": 8.810677948186213e-05, "loss": 0.9623, "step": 302 }, { "epoch": 0.0020139514325309654, "grad_norm": 0.5583270788192749, "learning_rate": 8.802992067237255e-05, "loss": 0.9494, "step": 303 }, { "epoch": 0.002020598136928757, "grad_norm": 0.7933107018470764, "learning_rate": 8.795284807001912e-05, "loss": 0.873, "step": 304 }, { "epoch": 0.0020272448413265493, "grad_norm": 0.6558129191398621, "learning_rate": 8.787556210808101e-05, "loss": 0.7422, "step": 305 }, { "epoch": 0.002033891545724341, "grad_norm": 0.7539348602294922, "learning_rate": 8.779806322103682e-05, "loss": 0.9463, "step": 306 }, { "epoch": 0.0020405382501221332, "grad_norm": 0.7694026827812195, "learning_rate": 8.772035184456217e-05, "loss": 0.7461, "step": 307 }, { "epoch": 0.002047184954519925, "grad_norm": 0.7448897957801819, "learning_rate": 8.764242841552723e-05, "loss": 0.9075, "step": 308 }, { "epoch": 0.002053831658917717, "grad_norm": 0.9233285784721375, "learning_rate": 8.756429337199425e-05, "loss": 0.9949, "step": 309 }, { "epoch": 0.002060478363315509, "grad_norm": 0.6410421133041382, "learning_rate": 8.748594715321512e-05, "loss": 0.8883, "step": 310 }, { "epoch": 0.002067125067713301, "grad_norm": 0.9382370114326477, "learning_rate": 8.740739019962893e-05, "loss": 0.9967, "step": 311 }, { "epoch": 0.002073771772111093, "grad_norm": 1.0625368356704712, "learning_rate": 8.73286229528594e-05, "loss": 0.775, "step": 312 }, { "epoch": 0.002080418476508885, "grad_norm": 0.7555992603302002, "learning_rate": 8.724964585571248e-05, "loss": 0.7947, "step": 313 }, { "epoch": 0.002087065180906677, "grad_norm": 0.6975390315055847, "learning_rate": 8.717045935217385e-05, "loss": 0.8677, "step": 314 }, { "epoch": 0.002093711885304469, "grad_norm": 0.8929460644721985, "learning_rate": 8.709106388740642e-05, "loss": 0.6437, "step": 315 }, { "epoch": 0.002100358589702261, "grad_norm": 0.9236882925033569, "learning_rate": 8.701145990774775e-05, "loss": 0.9819, "step": 316 }, { "epoch": 0.002107005294100053, "grad_norm": 0.8028077483177185, "learning_rate": 8.693164786070769e-05, "loss": 0.8341, "step": 317 }, { "epoch": 0.002113651998497845, "grad_norm": 1.5441040992736816, "learning_rate": 8.685162819496574e-05, "loss": 0.9731, "step": 318 }, { "epoch": 0.0021202987028956367, "grad_norm": 0.9469773769378662, "learning_rate": 8.677140136036859e-05, "loss": 0.9634, "step": 319 }, { "epoch": 0.002126945407293429, "grad_norm": 0.8910236954689026, "learning_rate": 8.669096780792753e-05, "loss": 0.8829, "step": 320 }, { "epoch": 0.0021335921116912206, "grad_norm": 1.023500680923462, "learning_rate": 8.661032798981604e-05, "loss": 0.9229, "step": 321 }, { "epoch": 0.002140238816089013, "grad_norm": 0.7624151110649109, "learning_rate": 8.652948235936705e-05, "loss": 0.796, "step": 322 }, { "epoch": 0.0021468855204868046, "grad_norm": 0.8581528663635254, "learning_rate": 8.644843137107059e-05, "loss": 0.9611, "step": 323 }, { "epoch": 0.0021535322248845967, "grad_norm": 0.9073342084884644, "learning_rate": 8.636717548057112e-05, "loss": 0.7534, "step": 324 }, { "epoch": 0.0021601789292823885, "grad_norm": 0.9350643754005432, "learning_rate": 8.628571514466501e-05, "loss": 0.8424, "step": 325 }, { "epoch": 0.0021668256336801806, "grad_norm": 0.8973402380943298, "learning_rate": 8.620405082129798e-05, "loss": 0.8081, "step": 326 }, { "epoch": 0.0021734723380779724, "grad_norm": 1.0260740518569946, "learning_rate": 8.612218296956245e-05, "loss": 0.7839, "step": 327 }, { "epoch": 0.0021801190424757646, "grad_norm": 0.9600381255149841, "learning_rate": 8.604011204969507e-05, "loss": 0.7924, "step": 328 }, { "epoch": 0.0021867657468735563, "grad_norm": 1.0301355123519897, "learning_rate": 8.595783852307403e-05, "loss": 0.8323, "step": 329 }, { "epoch": 0.0021934124512713485, "grad_norm": 0.9938191771507263, "learning_rate": 8.587536285221656e-05, "loss": 0.6364, "step": 330 }, { "epoch": 0.00220005915566914, "grad_norm": 1.0596129894256592, "learning_rate": 8.579268550077628e-05, "loss": 0.9593, "step": 331 }, { "epoch": 0.0022067058600669324, "grad_norm": 1.0856947898864746, "learning_rate": 8.570980693354056e-05, "loss": 0.8279, "step": 332 }, { "epoch": 0.002213352564464724, "grad_norm": 1.3220967054367065, "learning_rate": 8.562672761642799e-05, "loss": 0.8541, "step": 333 }, { "epoch": 0.0022199992688625163, "grad_norm": 1.2198119163513184, "learning_rate": 8.554344801648566e-05, "loss": 0.7912, "step": 334 }, { "epoch": 0.002226645973260308, "grad_norm": 1.5559873580932617, "learning_rate": 8.545996860188668e-05, "loss": 0.717, "step": 335 }, { "epoch": 0.0022332926776581, "grad_norm": 1.55154550075531, "learning_rate": 8.537628984192736e-05, "loss": 0.9412, "step": 336 }, { "epoch": 0.002239939382055892, "grad_norm": 1.2657493352890015, "learning_rate": 8.529241220702474e-05, "loss": 0.6607, "step": 337 }, { "epoch": 0.002246586086453684, "grad_norm": 1.3172146081924438, "learning_rate": 8.520833616871383e-05, "loss": 0.7434, "step": 338 }, { "epoch": 0.002253232790851476, "grad_norm": 1.9649335145950317, "learning_rate": 8.512406219964502e-05, "loss": 0.9488, "step": 339 }, { "epoch": 0.002259879495249268, "grad_norm": 1.4579743146896362, "learning_rate": 8.503959077358143e-05, "loss": 0.6775, "step": 340 }, { "epoch": 0.00226652619964706, "grad_norm": 1.2633167505264282, "learning_rate": 8.495492236539623e-05, "loss": 0.8784, "step": 341 }, { "epoch": 0.002273172904044852, "grad_norm": 1.486229658126831, "learning_rate": 8.487005745106988e-05, "loss": 0.8369, "step": 342 }, { "epoch": 0.002279819608442644, "grad_norm": 1.4682197570800781, "learning_rate": 8.478499650768767e-05, "loss": 0.8108, "step": 343 }, { "epoch": 0.002286466312840436, "grad_norm": 1.827052116394043, "learning_rate": 8.469974001343677e-05, "loss": 0.6969, "step": 344 }, { "epoch": 0.002293113017238228, "grad_norm": 1.9171429872512817, "learning_rate": 8.46142884476038e-05, "loss": 0.9739, "step": 345 }, { "epoch": 0.0022997597216360198, "grad_norm": 1.7520502805709839, "learning_rate": 8.452864229057191e-05, "loss": 0.8916, "step": 346 }, { "epoch": 0.002306406426033812, "grad_norm": 1.9049073457717896, "learning_rate": 8.444280202381826e-05, "loss": 0.8202, "step": 347 }, { "epoch": 0.0023130531304316037, "grad_norm": 1.8681999444961548, "learning_rate": 8.435676812991124e-05, "loss": 0.8037, "step": 348 }, { "epoch": 0.002319699834829396, "grad_norm": 1.788421392440796, "learning_rate": 8.427054109250766e-05, "loss": 0.6927, "step": 349 }, { "epoch": 0.0023263465392271876, "grad_norm": 3.0367431640625, "learning_rate": 8.418412139635025e-05, "loss": 0.6724, "step": 350 }, { "epoch": 0.00233299324362498, "grad_norm": 0.38819342851638794, "learning_rate": 8.409750952726472e-05, "loss": 0.9724, "step": 351 }, { "epoch": 0.0023396399480227715, "grad_norm": 0.6049837470054626, "learning_rate": 8.401070597215715e-05, "loss": 0.8079, "step": 352 }, { "epoch": 0.0023462866524205637, "grad_norm": 0.6205918788909912, "learning_rate": 8.392371121901122e-05, "loss": 0.8803, "step": 353 }, { "epoch": 0.0023529333568183554, "grad_norm": 0.589414656162262, "learning_rate": 8.383652575688546e-05, "loss": 0.8397, "step": 354 }, { "epoch": 0.0023595800612161476, "grad_norm": 0.818585991859436, "learning_rate": 8.374915007591053e-05, "loss": 0.8256, "step": 355 }, { "epoch": 0.0023662267656139394, "grad_norm": 0.7183292508125305, "learning_rate": 8.366158466728644e-05, "loss": 0.9431, "step": 356 }, { "epoch": 0.0023728734700117315, "grad_norm": 0.8403149247169495, "learning_rate": 8.357383002327976e-05, "loss": 0.7022, "step": 357 }, { "epoch": 0.0023795201744095233, "grad_norm": 0.8886735439300537, "learning_rate": 8.348588663722092e-05, "loss": 0.7884, "step": 358 }, { "epoch": 0.0023861668788073154, "grad_norm": 0.7179636359214783, "learning_rate": 8.339775500350138e-05, "loss": 0.9792, "step": 359 }, { "epoch": 0.002392813583205107, "grad_norm": 0.5993808507919312, "learning_rate": 8.330943561757092e-05, "loss": 0.9915, "step": 360 }, { "epoch": 0.0023994602876028994, "grad_norm": 1.2541649341583252, "learning_rate": 8.322092897593472e-05, "loss": 0.9209, "step": 361 }, { "epoch": 0.002406106992000691, "grad_norm": 0.691476583480835, "learning_rate": 8.313223557615071e-05, "loss": 0.8014, "step": 362 }, { "epoch": 0.0024127536963984833, "grad_norm": 0.812544584274292, "learning_rate": 8.304335591682674e-05, "loss": 0.9471, "step": 363 }, { "epoch": 0.002419400400796275, "grad_norm": 0.7012845873832703, "learning_rate": 8.295429049761772e-05, "loss": 0.8684, "step": 364 }, { "epoch": 0.002426047105194067, "grad_norm": 0.9507953524589539, "learning_rate": 8.286503981922283e-05, "loss": 1.0666, "step": 365 }, { "epoch": 0.002432693809591859, "grad_norm": 0.7114799618721008, "learning_rate": 8.277560438338278e-05, "loss": 0.8514, "step": 366 }, { "epoch": 0.002439340513989651, "grad_norm": 0.8807584643363953, "learning_rate": 8.268598469287688e-05, "loss": 0.9437, "step": 367 }, { "epoch": 0.002445987218387443, "grad_norm": 0.791029155254364, "learning_rate": 8.259618125152028e-05, "loss": 1.1073, "step": 368 }, { "epoch": 0.002452633922785235, "grad_norm": 0.8901576399803162, "learning_rate": 8.250619456416113e-05, "loss": 0.9061, "step": 369 }, { "epoch": 0.002459280627183027, "grad_norm": 0.8314279317855835, "learning_rate": 8.241602513667774e-05, "loss": 0.8919, "step": 370 }, { "epoch": 0.002465927331580819, "grad_norm": 0.8159176111221313, "learning_rate": 8.232567347597571e-05, "loss": 0.9477, "step": 371 }, { "epoch": 0.002472574035978611, "grad_norm": 0.971042275428772, "learning_rate": 8.22351400899851e-05, "loss": 0.9276, "step": 372 }, { "epoch": 0.002479220740376403, "grad_norm": 0.9706284403800964, "learning_rate": 8.214442548765762e-05, "loss": 0.7869, "step": 373 }, { "epoch": 0.002485867444774195, "grad_norm": 0.7817651629447937, "learning_rate": 8.205353017896365e-05, "loss": 0.828, "step": 374 }, { "epoch": 0.0024925141491719868, "grad_norm": 0.9323488473892212, "learning_rate": 8.19624546748895e-05, "loss": 0.892, "step": 375 }, { "epoch": 0.002499160853569779, "grad_norm": 1.1284369230270386, "learning_rate": 8.18711994874345e-05, "loss": 0.7032, "step": 376 }, { "epoch": 0.0025058075579675707, "grad_norm": 1.2345757484436035, "learning_rate": 8.177976512960803e-05, "loss": 0.9402, "step": 377 }, { "epoch": 0.002512454262365363, "grad_norm": 0.7991147637367249, "learning_rate": 8.16881521154268e-05, "loss": 0.6444, "step": 378 }, { "epoch": 0.0025191009667631546, "grad_norm": 0.9211207032203674, "learning_rate": 8.15963609599118e-05, "loss": 0.8072, "step": 379 }, { "epoch": 0.0025257476711609468, "grad_norm": 1.0338717699050903, "learning_rate": 8.150439217908556e-05, "loss": 0.4949, "step": 380 }, { "epoch": 0.0025323943755587385, "grad_norm": 1.1727211475372314, "learning_rate": 8.141224628996906e-05, "loss": 1.0728, "step": 381 }, { "epoch": 0.0025390410799565307, "grad_norm": 1.5846583843231201, "learning_rate": 8.131992381057904e-05, "loss": 0.701, "step": 382 }, { "epoch": 0.0025456877843543224, "grad_norm": 1.1190587282180786, "learning_rate": 8.12274252599249e-05, "loss": 0.9782, "step": 383 }, { "epoch": 0.0025523344887521146, "grad_norm": 1.2454986572265625, "learning_rate": 8.113475115800586e-05, "loss": 0.6733, "step": 384 }, { "epoch": 0.0025589811931499063, "grad_norm": 1.0867294073104858, "learning_rate": 8.104190202580812e-05, "loss": 0.8579, "step": 385 }, { "epoch": 0.0025656278975476985, "grad_norm": 1.3968238830566406, "learning_rate": 8.094887838530174e-05, "loss": 0.7021, "step": 386 }, { "epoch": 0.0025722746019454902, "grad_norm": 0.9890034794807434, "learning_rate": 8.085568075943787e-05, "loss": 0.8916, "step": 387 }, { "epoch": 0.0025789213063432824, "grad_norm": 1.127411127090454, "learning_rate": 8.076230967214578e-05, "loss": 0.7101, "step": 388 }, { "epoch": 0.002585568010741074, "grad_norm": 1.1789129972457886, "learning_rate": 8.066876564832983e-05, "loss": 0.9108, "step": 389 }, { "epoch": 0.0025922147151388663, "grad_norm": 1.2895052433013916, "learning_rate": 8.05750492138666e-05, "loss": 0.8967, "step": 390 }, { "epoch": 0.002598861419536658, "grad_norm": 1.4819419384002686, "learning_rate": 8.048116089560194e-05, "loss": 0.811, "step": 391 }, { "epoch": 0.0026055081239344502, "grad_norm": 1.2878509759902954, "learning_rate": 8.038710122134791e-05, "loss": 0.6405, "step": 392 }, { "epoch": 0.002612154828332242, "grad_norm": 1.2777119874954224, "learning_rate": 8.029287071987997e-05, "loss": 0.7156, "step": 393 }, { "epoch": 0.002618801532730034, "grad_norm": 1.741945505142212, "learning_rate": 8.019846992093385e-05, "loss": 0.8724, "step": 394 }, { "epoch": 0.002625448237127826, "grad_norm": 1.6703791618347168, "learning_rate": 8.01038993552027e-05, "loss": 0.5237, "step": 395 }, { "epoch": 0.002632094941525618, "grad_norm": 1.928144931793213, "learning_rate": 8.000915955433396e-05, "loss": 0.7999, "step": 396 }, { "epoch": 0.00263874164592341, "grad_norm": 1.7603362798690796, "learning_rate": 7.991425105092656e-05, "loss": 0.6134, "step": 397 }, { "epoch": 0.002645388350321202, "grad_norm": 2.094404935836792, "learning_rate": 7.981917437852777e-05, "loss": 0.7261, "step": 398 }, { "epoch": 0.002652035054718994, "grad_norm": 1.8364744186401367, "learning_rate": 7.972393007163026e-05, "loss": 0.8551, "step": 399 }, { "epoch": 0.002658681759116786, "grad_norm": 4.20158576965332, "learning_rate": 7.962851866566912e-05, "loss": 1.0817, "step": 400 }, { "epoch": 0.002665328463514578, "grad_norm": 0.4498131275177002, "learning_rate": 7.953294069701876e-05, "loss": 1.0174, "step": 401 }, { "epoch": 0.00267197516791237, "grad_norm": 0.6717918515205383, "learning_rate": 7.943719670299002e-05, "loss": 0.8297, "step": 402 }, { "epoch": 0.002678621872310162, "grad_norm": 0.6278716325759888, "learning_rate": 7.934128722182704e-05, "loss": 1.0539, "step": 403 }, { "epoch": 0.0026852685767079537, "grad_norm": 0.5958848595619202, "learning_rate": 7.924521279270432e-05, "loss": 1.0157, "step": 404 }, { "epoch": 0.002691915281105746, "grad_norm": 0.7898643612861633, "learning_rate": 7.91489739557236e-05, "loss": 0.9274, "step": 405 }, { "epoch": 0.0026985619855035376, "grad_norm": 0.5591276288032532, "learning_rate": 7.905257125191094e-05, "loss": 0.7194, "step": 406 }, { "epoch": 0.00270520868990133, "grad_norm": 1.055373191833496, "learning_rate": 7.895600522321354e-05, "loss": 0.8483, "step": 407 }, { "epoch": 0.0027118553942991216, "grad_norm": 0.7240962982177734, "learning_rate": 7.88592764124968e-05, "loss": 0.7843, "step": 408 }, { "epoch": 0.0027185020986969137, "grad_norm": 0.7613973617553711, "learning_rate": 7.876238536354122e-05, "loss": 0.987, "step": 409 }, { "epoch": 0.0027251488030947055, "grad_norm": 0.6913038492202759, "learning_rate": 7.866533262103936e-05, "loss": 0.8608, "step": 410 }, { "epoch": 0.0027317955074924976, "grad_norm": 0.7116532325744629, "learning_rate": 7.856811873059279e-05, "loss": 0.8493, "step": 411 }, { "epoch": 0.0027384422118902894, "grad_norm": 0.777245044708252, "learning_rate": 7.847074423870896e-05, "loss": 1.0772, "step": 412 }, { "epoch": 0.0027450889162880816, "grad_norm": 0.712005078792572, "learning_rate": 7.837320969279822e-05, "loss": 0.8002, "step": 413 }, { "epoch": 0.0027517356206858733, "grad_norm": 1.0246773958206177, "learning_rate": 7.827551564117068e-05, "loss": 0.8742, "step": 414 }, { "epoch": 0.0027583823250836655, "grad_norm": 0.9874679446220398, "learning_rate": 7.817766263303313e-05, "loss": 1.0139, "step": 415 }, { "epoch": 0.0027650290294814572, "grad_norm": 0.7177373170852661, "learning_rate": 7.807965121848597e-05, "loss": 0.9788, "step": 416 }, { "epoch": 0.0027716757338792494, "grad_norm": 0.8287638425827026, "learning_rate": 7.798148194852015e-05, "loss": 0.9904, "step": 417 }, { "epoch": 0.002778322438277041, "grad_norm": 0.8577630519866943, "learning_rate": 7.788315537501399e-05, "loss": 0.6973, "step": 418 }, { "epoch": 0.0027849691426748333, "grad_norm": 0.7897922396659851, "learning_rate": 7.778467205073013e-05, "loss": 0.9903, "step": 419 }, { "epoch": 0.002791615847072625, "grad_norm": 0.8858191967010498, "learning_rate": 7.768603252931243e-05, "loss": 0.8548, "step": 420 }, { "epoch": 0.0027982625514704172, "grad_norm": 0.8166362643241882, "learning_rate": 7.758723736528284e-05, "loss": 0.9559, "step": 421 }, { "epoch": 0.002804909255868209, "grad_norm": 1.3511897325515747, "learning_rate": 7.74882871140383e-05, "loss": 0.9148, "step": 422 }, { "epoch": 0.002811555960266001, "grad_norm": 1.0749465227127075, "learning_rate": 7.738918233184759e-05, "loss": 0.8598, "step": 423 }, { "epoch": 0.002818202664663793, "grad_norm": 0.8789573311805725, "learning_rate": 7.728992357584822e-05, "loss": 0.8576, "step": 424 }, { "epoch": 0.002824849369061585, "grad_norm": 1.0636883974075317, "learning_rate": 7.719051140404327e-05, "loss": 0.8691, "step": 425 }, { "epoch": 0.002831496073459377, "grad_norm": 0.8622593283653259, "learning_rate": 7.70909463752983e-05, "loss": 0.8029, "step": 426 }, { "epoch": 0.002838142777857169, "grad_norm": 1.100450038909912, "learning_rate": 7.699122904933821e-05, "loss": 0.8127, "step": 427 }, { "epoch": 0.002844789482254961, "grad_norm": 0.9693968296051025, "learning_rate": 7.689135998674403e-05, "loss": 0.7787, "step": 428 }, { "epoch": 0.002851436186652753, "grad_norm": 0.960363507270813, "learning_rate": 7.679133974894983e-05, "loss": 0.8171, "step": 429 }, { "epoch": 0.002858082891050545, "grad_norm": 1.4263381958007812, "learning_rate": 7.669116889823955e-05, "loss": 0.8144, "step": 430 }, { "epoch": 0.002864729595448337, "grad_norm": 0.9941753149032593, "learning_rate": 7.659084799774376e-05, "loss": 0.8783, "step": 431 }, { "epoch": 0.002871376299846129, "grad_norm": 1.0290969610214233, "learning_rate": 7.649037761143669e-05, "loss": 0.9556, "step": 432 }, { "epoch": 0.0028780230042439207, "grad_norm": 1.1351956129074097, "learning_rate": 7.638975830413284e-05, "loss": 0.8794, "step": 433 }, { "epoch": 0.002884669708641713, "grad_norm": 1.2696144580841064, "learning_rate": 7.628899064148391e-05, "loss": 0.7921, "step": 434 }, { "epoch": 0.0028913164130395046, "grad_norm": 1.188214898109436, "learning_rate": 7.618807518997563e-05, "loss": 0.9029, "step": 435 }, { "epoch": 0.002897963117437297, "grad_norm": 1.385566234588623, "learning_rate": 7.608701251692457e-05, "loss": 0.8348, "step": 436 }, { "epoch": 0.0029046098218350885, "grad_norm": 1.2165278196334839, "learning_rate": 7.59858031904749e-05, "loss": 0.7446, "step": 437 }, { "epoch": 0.0029112565262328807, "grad_norm": 1.2804996967315674, "learning_rate": 7.588444777959524e-05, "loss": 0.7512, "step": 438 }, { "epoch": 0.0029179032306306724, "grad_norm": 1.6097999811172485, "learning_rate": 7.578294685407548e-05, "loss": 0.7528, "step": 439 }, { "epoch": 0.0029245499350284646, "grad_norm": 1.3481954336166382, "learning_rate": 7.568130098452351e-05, "loss": 0.9365, "step": 440 }, { "epoch": 0.0029311966394262564, "grad_norm": 1.5785322189331055, "learning_rate": 7.557951074236209e-05, "loss": 0.7424, "step": 441 }, { "epoch": 0.0029378433438240485, "grad_norm": 1.2576839923858643, "learning_rate": 7.547757669982559e-05, "loss": 0.6221, "step": 442 }, { "epoch": 0.0029444900482218403, "grad_norm": 1.1837393045425415, "learning_rate": 7.537549942995672e-05, "loss": 0.5253, "step": 443 }, { "epoch": 0.0029511367526196325, "grad_norm": 1.6240267753601074, "learning_rate": 7.527327950660347e-05, "loss": 0.7494, "step": 444 }, { "epoch": 0.002957783457017424, "grad_norm": 1.4825639724731445, "learning_rate": 7.517091750441576e-05, "loss": 0.6376, "step": 445 }, { "epoch": 0.0029644301614152164, "grad_norm": 1.430845022201538, "learning_rate": 7.506841399884217e-05, "loss": 0.5573, "step": 446 }, { "epoch": 0.002971076865813008, "grad_norm": 1.3693525791168213, "learning_rate": 7.496576956612686e-05, "loss": 0.4095, "step": 447 }, { "epoch": 0.0029777235702108003, "grad_norm": 1.5286657810211182, "learning_rate": 7.486298478330615e-05, "loss": 0.6809, "step": 448 }, { "epoch": 0.002984370274608592, "grad_norm": 2.272508382797241, "learning_rate": 7.476006022820545e-05, "loss": 0.7388, "step": 449 }, { "epoch": 0.002991016979006384, "grad_norm": 4.381163597106934, "learning_rate": 7.465699647943586e-05, "loss": 1.0686, "step": 450 }, { "epoch": 0.002997663683404176, "grad_norm": 0.37457677721977234, "learning_rate": 7.455379411639104e-05, "loss": 1.0442, "step": 451 }, { "epoch": 0.003004310387801968, "grad_norm": 0.87198805809021, "learning_rate": 7.445045371924386e-05, "loss": 0.6809, "step": 452 }, { "epoch": 0.00301095709219976, "grad_norm": 0.7326244115829468, "learning_rate": 7.43469758689432e-05, "loss": 0.9076, "step": 453 }, { "epoch": 0.003017603796597552, "grad_norm": 0.618824303150177, "learning_rate": 7.424336114721066e-05, "loss": 0.9333, "step": 454 }, { "epoch": 0.003024250500995344, "grad_norm": 0.7133090496063232, "learning_rate": 7.413961013653726e-05, "loss": 0.9412, "step": 455 }, { "epoch": 0.003030897205393136, "grad_norm": 0.5877029299736023, "learning_rate": 7.403572342018021e-05, "loss": 0.7415, "step": 456 }, { "epoch": 0.003037543909790928, "grad_norm": 0.7065613865852356, "learning_rate": 7.393170158215966e-05, "loss": 0.8234, "step": 457 }, { "epoch": 0.00304419061418872, "grad_norm": 0.7387683391571045, "learning_rate": 7.38275452072553e-05, "loss": 0.7834, "step": 458 }, { "epoch": 0.003050837318586512, "grad_norm": 0.7063631415367126, "learning_rate": 7.372325488100321e-05, "loss": 1.0935, "step": 459 }, { "epoch": 0.0030574840229843038, "grad_norm": 0.8622189164161682, "learning_rate": 7.361883118969247e-05, "loss": 1.1685, "step": 460 }, { "epoch": 0.003064130727382096, "grad_norm": 0.8481305241584778, "learning_rate": 7.351427472036191e-05, "loss": 0.9794, "step": 461 }, { "epoch": 0.0030707774317798877, "grad_norm": 0.8307840824127197, "learning_rate": 7.340958606079679e-05, "loss": 0.9836, "step": 462 }, { "epoch": 0.00307742413617768, "grad_norm": 0.6685025095939636, "learning_rate": 7.330476579952549e-05, "loss": 0.8816, "step": 463 }, { "epoch": 0.0030840708405754716, "grad_norm": 0.7029338479042053, "learning_rate": 7.319981452581629e-05, "loss": 0.9216, "step": 464 }, { "epoch": 0.0030907175449732638, "grad_norm": 0.8017571568489075, "learning_rate": 7.309473282967387e-05, "loss": 0.7618, "step": 465 }, { "epoch": 0.0030973642493710555, "grad_norm": 0.7500448822975159, "learning_rate": 7.29895213018362e-05, "loss": 0.9051, "step": 466 }, { "epoch": 0.0031040109537688477, "grad_norm": 0.8484657406806946, "learning_rate": 7.288418053377107e-05, "loss": 0.8906, "step": 467 }, { "epoch": 0.0031106576581666394, "grad_norm": 0.7417640089988708, "learning_rate": 7.277871111767284e-05, "loss": 0.7899, "step": 468 }, { "epoch": 0.0031173043625644316, "grad_norm": 0.9253821969032288, "learning_rate": 7.26731136464591e-05, "loss": 0.6727, "step": 469 }, { "epoch": 0.0031239510669622233, "grad_norm": 0.868393063545227, "learning_rate": 7.256738871376732e-05, "loss": 0.844, "step": 470 }, { "epoch": 0.0031305977713600155, "grad_norm": 0.8331130743026733, "learning_rate": 7.24615369139515e-05, "loss": 0.7033, "step": 471 }, { "epoch": 0.0031372444757578073, "grad_norm": 1.062641978263855, "learning_rate": 7.235555884207889e-05, "loss": 0.8176, "step": 472 }, { "epoch": 0.0031438911801555994, "grad_norm": 0.7356940507888794, "learning_rate": 7.224945509392654e-05, "loss": 0.8743, "step": 473 }, { "epoch": 0.003150537884553391, "grad_norm": 1.034468173980713, "learning_rate": 7.21432262659781e-05, "loss": 0.7185, "step": 474 }, { "epoch": 0.0031571845889511833, "grad_norm": 0.90153968334198, "learning_rate": 7.203687295542032e-05, "loss": 0.6971, "step": 475 }, { "epoch": 0.003163831293348975, "grad_norm": 1.179221749305725, "learning_rate": 7.193039576013976e-05, "loss": 0.6926, "step": 476 }, { "epoch": 0.0031704779977467673, "grad_norm": 1.0013591051101685, "learning_rate": 7.182379527871945e-05, "loss": 0.8971, "step": 477 }, { "epoch": 0.003177124702144559, "grad_norm": 1.1819143295288086, "learning_rate": 7.171707211043545e-05, "loss": 0.9409, "step": 478 }, { "epoch": 0.003183771406542351, "grad_norm": 1.1591392755508423, "learning_rate": 7.16102268552536e-05, "loss": 0.7557, "step": 479 }, { "epoch": 0.003190418110940143, "grad_norm": 1.0158194303512573, "learning_rate": 7.150326011382604e-05, "loss": 0.8032, "step": 480 }, { "epoch": 0.003197064815337935, "grad_norm": 1.0773541927337646, "learning_rate": 7.139617248748781e-05, "loss": 0.8156, "step": 481 }, { "epoch": 0.003203711519735727, "grad_norm": 1.401308298110962, "learning_rate": 7.128896457825364e-05, "loss": 0.7159, "step": 482 }, { "epoch": 0.003210358224133519, "grad_norm": 0.9914813041687012, "learning_rate": 7.118163698881436e-05, "loss": 0.8771, "step": 483 }, { "epoch": 0.003217004928531311, "grad_norm": 1.0048538446426392, "learning_rate": 7.107419032253368e-05, "loss": 0.762, "step": 484 }, { "epoch": 0.003223651632929103, "grad_norm": 1.7678227424621582, "learning_rate": 7.096662518344468e-05, "loss": 1.0256, "step": 485 }, { "epoch": 0.003230298337326895, "grad_norm": 1.5727065801620483, "learning_rate": 7.085894217624645e-05, "loss": 1.0233, "step": 486 }, { "epoch": 0.003236945041724687, "grad_norm": 0.9283615946769714, "learning_rate": 7.075114190630074e-05, "loss": 0.8238, "step": 487 }, { "epoch": 0.003243591746122479, "grad_norm": 1.20572829246521, "learning_rate": 7.064322497962848e-05, "loss": 0.9388, "step": 488 }, { "epoch": 0.0032502384505202707, "grad_norm": 1.1428848505020142, "learning_rate": 7.053519200290644e-05, "loss": 0.8561, "step": 489 }, { "epoch": 0.003256885154918063, "grad_norm": 1.2887569665908813, "learning_rate": 7.042704358346375e-05, "loss": 0.6956, "step": 490 }, { "epoch": 0.0032635318593158547, "grad_norm": 1.8964046239852905, "learning_rate": 7.031878032927857e-05, "loss": 1.017, "step": 491 }, { "epoch": 0.003270178563713647, "grad_norm": 1.646983027458191, "learning_rate": 7.021040284897458e-05, "loss": 0.5258, "step": 492 }, { "epoch": 0.0032768252681114386, "grad_norm": 1.2678184509277344, "learning_rate": 7.010191175181764e-05, "loss": 0.5785, "step": 493 }, { "epoch": 0.0032834719725092307, "grad_norm": 1.5537736415863037, "learning_rate": 6.999330764771231e-05, "loss": 0.7185, "step": 494 }, { "epoch": 0.0032901186769070225, "grad_norm": 1.9458109140396118, "learning_rate": 6.988459114719849e-05, "loss": 0.9896, "step": 495 }, { "epoch": 0.0032967653813048147, "grad_norm": 1.9748679399490356, "learning_rate": 6.977576286144784e-05, "loss": 0.9894, "step": 496 }, { "epoch": 0.0033034120857026064, "grad_norm": 1.5926192998886108, "learning_rate": 6.966682340226053e-05, "loss": 0.7679, "step": 497 }, { "epoch": 0.0033100587901003986, "grad_norm": 2.037371873855591, "learning_rate": 6.955777338206169e-05, "loss": 0.7648, "step": 498 }, { "epoch": 0.0033167054944981903, "grad_norm": 1.3600577116012573, "learning_rate": 6.9448613413898e-05, "loss": 0.4472, "step": 499 }, { "epoch": 0.0033233521988959825, "grad_norm": 2.713379383087158, "learning_rate": 6.93393441114342e-05, "loss": 0.6972, "step": 500 }, { "epoch": 0.0033299989032937742, "grad_norm": 0.4312986135482788, "learning_rate": 6.922996608894968e-05, "loss": 0.938, "step": 501 }, { "epoch": 0.0033366456076915664, "grad_norm": 0.4976477324962616, "learning_rate": 6.912047996133508e-05, "loss": 0.8754, "step": 502 }, { "epoch": 0.003343292312089358, "grad_norm": 0.8254091739654541, "learning_rate": 6.90108863440887e-05, "loss": 1.0763, "step": 503 }, { "epoch": 0.0033499390164871503, "grad_norm": 0.5634557604789734, "learning_rate": 6.890118585331319e-05, "loss": 0.7935, "step": 504 }, { "epoch": 0.003356585720884942, "grad_norm": 0.5753924250602722, "learning_rate": 6.879137910571191e-05, "loss": 1.0282, "step": 505 }, { "epoch": 0.0033632324252827342, "grad_norm": 0.7259241342544556, "learning_rate": 6.868146671858567e-05, "loss": 0.7291, "step": 506 }, { "epoch": 0.003369879129680526, "grad_norm": 0.7611901760101318, "learning_rate": 6.857144930982908e-05, "loss": 0.8477, "step": 507 }, { "epoch": 0.003376525834078318, "grad_norm": 0.6324353814125061, "learning_rate": 6.846132749792718e-05, "loss": 0.7485, "step": 508 }, { "epoch": 0.00338317253847611, "grad_norm": 0.7248914837837219, "learning_rate": 6.835110190195195e-05, "loss": 1.1058, "step": 509 }, { "epoch": 0.003389819242873902, "grad_norm": 0.733087420463562, "learning_rate": 6.824077314155877e-05, "loss": 1.0394, "step": 510 }, { "epoch": 0.003396465947271694, "grad_norm": 0.7783194780349731, "learning_rate": 6.813034183698301e-05, "loss": 0.6953, "step": 511 }, { "epoch": 0.003403112651669486, "grad_norm": 0.78984135389328, "learning_rate": 6.801980860903651e-05, "loss": 0.99, "step": 512 }, { "epoch": 0.003409759356067278, "grad_norm": 0.6463935971260071, "learning_rate": 6.790917407910408e-05, "loss": 0.851, "step": 513 }, { "epoch": 0.00341640606046507, "grad_norm": 0.601154088973999, "learning_rate": 6.779843886914006e-05, "loss": 0.877, "step": 514 }, { "epoch": 0.003423052764862862, "grad_norm": 0.6204383969306946, "learning_rate": 6.768760360166471e-05, "loss": 0.7131, "step": 515 }, { "epoch": 0.003429699469260654, "grad_norm": 0.720998227596283, "learning_rate": 6.757666889976085e-05, "loss": 0.7918, "step": 516 }, { "epoch": 0.003436346173658446, "grad_norm": 0.7629978656768799, "learning_rate": 6.746563538707022e-05, "loss": 0.9534, "step": 517 }, { "epoch": 0.0034429928780562377, "grad_norm": 0.9741836786270142, "learning_rate": 6.735450368779016e-05, "loss": 0.7624, "step": 518 }, { "epoch": 0.00344963958245403, "grad_norm": 0.7425995469093323, "learning_rate": 6.724327442666987e-05, "loss": 0.7504, "step": 519 }, { "epoch": 0.0034562862868518216, "grad_norm": 0.8310898542404175, "learning_rate": 6.713194822900706e-05, "loss": 1.0872, "step": 520 }, { "epoch": 0.003462932991249614, "grad_norm": 0.8490201234817505, "learning_rate": 6.702052572064442e-05, "loss": 0.792, "step": 521 }, { "epoch": 0.0034695796956474055, "grad_norm": 0.8843750357627869, "learning_rate": 6.690900752796601e-05, "loss": 0.8166, "step": 522 }, { "epoch": 0.0034762264000451977, "grad_norm": 1.045980453491211, "learning_rate": 6.679739427789383e-05, "loss": 0.9737, "step": 523 }, { "epoch": 0.0034828731044429895, "grad_norm": 0.8091442584991455, "learning_rate": 6.668568659788425e-05, "loss": 0.9131, "step": 524 }, { "epoch": 0.0034895198088407816, "grad_norm": 0.7094196677207947, "learning_rate": 6.657388511592452e-05, "loss": 0.7542, "step": 525 }, { "epoch": 0.0034961665132385734, "grad_norm": 0.7974683046340942, "learning_rate": 6.646199046052921e-05, "loss": 0.7324, "step": 526 }, { "epoch": 0.0035028132176363655, "grad_norm": 0.9074555039405823, "learning_rate": 6.635000326073669e-05, "loss": 0.8125, "step": 527 }, { "epoch": 0.0035094599220341573, "grad_norm": 1.316511869430542, "learning_rate": 6.623792414610552e-05, "loss": 0.631, "step": 528 }, { "epoch": 0.0035161066264319495, "grad_norm": 1.061382532119751, "learning_rate": 6.61257537467111e-05, "loss": 0.9479, "step": 529 }, { "epoch": 0.003522753330829741, "grad_norm": 0.9964013695716858, "learning_rate": 6.601349269314188e-05, "loss": 0.7584, "step": 530 }, { "epoch": 0.0035294000352275334, "grad_norm": 0.9392070770263672, "learning_rate": 6.590114161649604e-05, "loss": 0.9367, "step": 531 }, { "epoch": 0.003536046739625325, "grad_norm": 1.226622223854065, "learning_rate": 6.578870114837779e-05, "loss": 0.7428, "step": 532 }, { "epoch": 0.0035426934440231173, "grad_norm": 0.9497167468070984, "learning_rate": 6.567617192089386e-05, "loss": 0.8032, "step": 533 }, { "epoch": 0.003549340148420909, "grad_norm": 1.268905520439148, "learning_rate": 6.556355456665e-05, "loss": 0.7311, "step": 534 }, { "epoch": 0.003555986852818701, "grad_norm": 0.793729305267334, "learning_rate": 6.545084971874738e-05, "loss": 0.683, "step": 535 }, { "epoch": 0.003562633557216493, "grad_norm": 1.6156094074249268, "learning_rate": 6.533805801077899e-05, "loss": 0.887, "step": 536 }, { "epoch": 0.003569280261614285, "grad_norm": 1.031252384185791, "learning_rate": 6.522518007682616e-05, "loss": 0.7859, "step": 537 }, { "epoch": 0.003575926966012077, "grad_norm": 1.3641934394836426, "learning_rate": 6.511221655145496e-05, "loss": 0.9322, "step": 538 }, { "epoch": 0.003582573670409869, "grad_norm": 1.3248331546783447, "learning_rate": 6.49991680697126e-05, "loss": 0.8903, "step": 539 }, { "epoch": 0.003589220374807661, "grad_norm": 1.5821675062179565, "learning_rate": 6.48860352671239e-05, "loss": 0.7581, "step": 540 }, { "epoch": 0.003595867079205453, "grad_norm": 1.2418559789657593, "learning_rate": 6.477281877968772e-05, "loss": 0.841, "step": 541 }, { "epoch": 0.003602513783603245, "grad_norm": 1.2012357711791992, "learning_rate": 6.465951924387338e-05, "loss": 0.8172, "step": 542 }, { "epoch": 0.003609160488001037, "grad_norm": 1.485724687576294, "learning_rate": 6.454613729661702e-05, "loss": 0.8938, "step": 543 }, { "epoch": 0.003615807192398829, "grad_norm": 1.1569510698318481, "learning_rate": 6.443267357531813e-05, "loss": 0.5439, "step": 544 }, { "epoch": 0.0036224538967966208, "grad_norm": 1.6549900770187378, "learning_rate": 6.431912871783586e-05, "loss": 0.8047, "step": 545 }, { "epoch": 0.003629100601194413, "grad_norm": 1.8657948970794678, "learning_rate": 6.420550336248558e-05, "loss": 0.6836, "step": 546 }, { "epoch": 0.0036357473055922047, "grad_norm": 1.8612549304962158, "learning_rate": 6.409179814803504e-05, "loss": 0.6303, "step": 547 }, { "epoch": 0.003642394009989997, "grad_norm": 1.7402323484420776, "learning_rate": 6.397801371370108e-05, "loss": 0.9578, "step": 548 }, { "epoch": 0.0036490407143877886, "grad_norm": 1.6253831386566162, "learning_rate": 6.38641506991458e-05, "loss": 0.5675, "step": 549 }, { "epoch": 0.0036556874187855808, "grad_norm": 2.8043971061706543, "learning_rate": 6.37502097444731e-05, "loss": 0.6618, "step": 550 }, { "epoch": 0.0036623341231833725, "grad_norm": 0.31518086791038513, "learning_rate": 6.3636191490225e-05, "loss": 0.7951, "step": 551 }, { "epoch": 0.0036689808275811647, "grad_norm": 0.6549334526062012, "learning_rate": 6.352209657737814e-05, "loss": 0.7503, "step": 552 }, { "epoch": 0.0036756275319789564, "grad_norm": 0.7057569026947021, "learning_rate": 6.340792564734001e-05, "loss": 0.6862, "step": 553 }, { "epoch": 0.0036822742363767486, "grad_norm": 0.7299575805664062, "learning_rate": 6.329367934194556e-05, "loss": 0.7484, "step": 554 }, { "epoch": 0.0036889209407745403, "grad_norm": 0.6713090538978577, "learning_rate": 6.317935830345338e-05, "loss": 0.6862, "step": 555 }, { "epoch": 0.0036955676451723325, "grad_norm": 0.592124342918396, "learning_rate": 6.306496317454227e-05, "loss": 0.8352, "step": 556 }, { "epoch": 0.0037022143495701243, "grad_norm": 0.7989798784255981, "learning_rate": 6.295049459830746e-05, "loss": 0.613, "step": 557 }, { "epoch": 0.0037088610539679164, "grad_norm": 0.6609835028648376, "learning_rate": 6.283595321825713e-05, "loss": 0.9622, "step": 558 }, { "epoch": 0.003715507758365708, "grad_norm": 0.7341376543045044, "learning_rate": 6.272133967830875e-05, "loss": 0.9564, "step": 559 }, { "epoch": 0.0037221544627635004, "grad_norm": 0.7162797451019287, "learning_rate": 6.260665462278544e-05, "loss": 0.9659, "step": 560 }, { "epoch": 0.003728801167161292, "grad_norm": 0.6263265013694763, "learning_rate": 6.249189869641233e-05, "loss": 0.8669, "step": 561 }, { "epoch": 0.0037354478715590843, "grad_norm": 0.9160195589065552, "learning_rate": 6.237707254431298e-05, "loss": 0.7752, "step": 562 }, { "epoch": 0.003742094575956876, "grad_norm": 0.9640375375747681, "learning_rate": 6.22621768120058e-05, "loss": 0.8584, "step": 563 }, { "epoch": 0.003748741280354668, "grad_norm": 0.9797345399856567, "learning_rate": 6.214721214540027e-05, "loss": 0.9697, "step": 564 }, { "epoch": 0.00375538798475246, "grad_norm": 0.847838282585144, "learning_rate": 6.203217919079342e-05, "loss": 0.8559, "step": 565 }, { "epoch": 0.003762034689150252, "grad_norm": 0.6711984276771545, "learning_rate": 6.191707859486622e-05, "loss": 1.0097, "step": 566 }, { "epoch": 0.003768681393548044, "grad_norm": 0.9938323497772217, "learning_rate": 6.180191100467986e-05, "loss": 0.7758, "step": 567 }, { "epoch": 0.003775328097945836, "grad_norm": 1.1993358135223389, "learning_rate": 6.168667706767214e-05, "loss": 0.9241, "step": 568 }, { "epoch": 0.003781974802343628, "grad_norm": 0.931126594543457, "learning_rate": 6.15713774316539e-05, "loss": 0.9616, "step": 569 }, { "epoch": 0.00378862150674142, "grad_norm": 1.0224496126174927, "learning_rate": 6.145601274480521e-05, "loss": 0.7781, "step": 570 }, { "epoch": 0.003795268211139212, "grad_norm": 0.7947784066200256, "learning_rate": 6.1340583655672e-05, "loss": 0.8695, "step": 571 }, { "epoch": 0.003801914915537004, "grad_norm": 0.7468635439872742, "learning_rate": 6.12250908131621e-05, "loss": 0.9676, "step": 572 }, { "epoch": 0.003808561619934796, "grad_norm": 0.7725012302398682, "learning_rate": 6.110953486654183e-05, "loss": 0.8771, "step": 573 }, { "epoch": 0.0038152083243325878, "grad_norm": 0.9998317956924438, "learning_rate": 6.0993916465432213e-05, "loss": 0.9842, "step": 574 }, { "epoch": 0.00382185502873038, "grad_norm": 1.0166869163513184, "learning_rate": 6.0878236259805396e-05, "loss": 0.6386, "step": 575 }, { "epoch": 0.0038285017331281717, "grad_norm": 0.974464476108551, "learning_rate": 6.076249489998097e-05, "loss": 0.8592, "step": 576 }, { "epoch": 0.003835148437525964, "grad_norm": 0.9121200442314148, "learning_rate": 6.0646693036622334e-05, "loss": 0.8436, "step": 577 }, { "epoch": 0.0038417951419237556, "grad_norm": 0.9333671927452087, "learning_rate": 6.053083132073298e-05, "loss": 0.67, "step": 578 }, { "epoch": 0.0038484418463215478, "grad_norm": 1.0727161169052124, "learning_rate": 6.041491040365291e-05, "loss": 0.5661, "step": 579 }, { "epoch": 0.0038550885507193395, "grad_norm": 1.0697327852249146, "learning_rate": 6.029893093705492e-05, "loss": 1.0136, "step": 580 }, { "epoch": 0.0038617352551171317, "grad_norm": 1.2489475011825562, "learning_rate": 6.0182893572940956e-05, "loss": 0.6802, "step": 581 }, { "epoch": 0.0038683819595149234, "grad_norm": 1.0722562074661255, "learning_rate": 6.006679896363844e-05, "loss": 0.856, "step": 582 }, { "epoch": 0.0038750286639127156, "grad_norm": 1.039819598197937, "learning_rate": 5.995064776179663e-05, "loss": 0.892, "step": 583 }, { "epoch": 0.0038816753683105073, "grad_norm": 1.00333833694458, "learning_rate": 5.9834440620382916e-05, "loss": 0.9249, "step": 584 }, { "epoch": 0.0038883220727082995, "grad_norm": 1.2520968914031982, "learning_rate": 5.971817819267913e-05, "loss": 0.8829, "step": 585 }, { "epoch": 0.0038949687771060912, "grad_norm": 1.2571916580200195, "learning_rate": 5.9601861132278e-05, "loss": 0.9248, "step": 586 }, { "epoch": 0.0039016154815038834, "grad_norm": 1.451646327972412, "learning_rate": 5.948549009307927e-05, "loss": 0.7844, "step": 587 }, { "epoch": 0.003908262185901675, "grad_norm": 1.0194920301437378, "learning_rate": 5.9369065729286245e-05, "loss": 0.5549, "step": 588 }, { "epoch": 0.003914908890299467, "grad_norm": 1.4929895401000977, "learning_rate": 5.92525886954019e-05, "loss": 0.6931, "step": 589 }, { "epoch": 0.0039215555946972595, "grad_norm": 1.4282971620559692, "learning_rate": 5.9136059646225375e-05, "loss": 1.0526, "step": 590 }, { "epoch": 0.003928202299095051, "grad_norm": 1.0959901809692383, "learning_rate": 5.9019479236848216e-05, "loss": 0.8075, "step": 591 }, { "epoch": 0.003934849003492843, "grad_norm": 1.4382838010787964, "learning_rate": 5.8902848122650675e-05, "loss": 0.8176, "step": 592 }, { "epoch": 0.003941495707890635, "grad_norm": 1.3714373111724854, "learning_rate": 5.878616695929808e-05, "loss": 0.5654, "step": 593 }, { "epoch": 0.003948142412288427, "grad_norm": 1.5652027130126953, "learning_rate": 5.866943640273712e-05, "loss": 0.7618, "step": 594 }, { "epoch": 0.003954789116686219, "grad_norm": 2.509178876876831, "learning_rate": 5.855265710919211e-05, "loss": 0.8079, "step": 595 }, { "epoch": 0.003961435821084011, "grad_norm": 1.923766851425171, "learning_rate": 5.8435829735161416e-05, "loss": 0.6197, "step": 596 }, { "epoch": 0.003968082525481803, "grad_norm": 1.357117772102356, "learning_rate": 5.831895493741364e-05, "loss": 0.545, "step": 597 }, { "epoch": 0.003974729229879595, "grad_norm": 1.836143136024475, "learning_rate": 5.820203337298403e-05, "loss": 0.661, "step": 598 }, { "epoch": 0.003981375934277387, "grad_norm": 2.429413318634033, "learning_rate": 5.808506569917074e-05, "loss": 1.1022, "step": 599 }, { "epoch": 0.003988022638675179, "grad_norm": 3.8483147621154785, "learning_rate": 5.7968052573531084e-05, "loss": 1.2283, "step": 600 }, { "epoch": 0.003994669343072971, "grad_norm": 0.3361023962497711, "learning_rate": 5.785099465387797e-05, "loss": 0.885, "step": 601 }, { "epoch": 0.004001316047470763, "grad_norm": 0.6551617383956909, "learning_rate": 5.773389259827604e-05, "loss": 0.7555, "step": 602 }, { "epoch": 0.004007962751868555, "grad_norm": 0.5150473117828369, "learning_rate": 5.7616747065038167e-05, "loss": 0.7154, "step": 603 }, { "epoch": 0.0040146094562663465, "grad_norm": 0.7329347729682922, "learning_rate": 5.749955871272153e-05, "loss": 0.8616, "step": 604 }, { "epoch": 0.004021256160664139, "grad_norm": 0.805991530418396, "learning_rate": 5.738232820012407e-05, "loss": 0.7985, "step": 605 }, { "epoch": 0.004027902865061931, "grad_norm": 0.5902569890022278, "learning_rate": 5.7265056186280754e-05, "loss": 0.8489, "step": 606 }, { "epoch": 0.0040345495694597226, "grad_norm": 0.7035917639732361, "learning_rate": 5.714774333045985e-05, "loss": 0.8908, "step": 607 }, { "epoch": 0.004041196273857514, "grad_norm": 0.9020839929580688, "learning_rate": 5.7030390292159195e-05, "loss": 1.0442, "step": 608 }, { "epoch": 0.004047842978255307, "grad_norm": 0.8471091985702515, "learning_rate": 5.691299773110258e-05, "loss": 0.7269, "step": 609 }, { "epoch": 0.004054489682653099, "grad_norm": 0.6457120776176453, "learning_rate": 5.6795566307235915e-05, "loss": 0.8743, "step": 610 }, { "epoch": 0.00406113638705089, "grad_norm": 0.7119438648223877, "learning_rate": 5.667809668072365e-05, "loss": 0.9006, "step": 611 }, { "epoch": 0.004067783091448682, "grad_norm": 0.722288191318512, "learning_rate": 5.656058951194493e-05, "loss": 0.859, "step": 612 }, { "epoch": 0.004074429795846475, "grad_norm": 0.7655767798423767, "learning_rate": 5.6443045461490016e-05, "loss": 0.9376, "step": 613 }, { "epoch": 0.0040810765002442665, "grad_norm": 0.8561120629310608, "learning_rate": 5.632546519015647e-05, "loss": 1.0914, "step": 614 }, { "epoch": 0.004087723204642058, "grad_norm": 0.6569271087646484, "learning_rate": 5.620784935894547e-05, "loss": 0.8716, "step": 615 }, { "epoch": 0.00409436990903985, "grad_norm": 0.7191852331161499, "learning_rate": 5.6090198629058134e-05, "loss": 0.8314, "step": 616 }, { "epoch": 0.0041010166134376426, "grad_norm": 0.8148145079612732, "learning_rate": 5.597251366189175e-05, "loss": 0.9475, "step": 617 }, { "epoch": 0.004107663317835434, "grad_norm": 0.9393811821937561, "learning_rate": 5.585479511903607e-05, "loss": 0.8579, "step": 618 }, { "epoch": 0.004114310022233226, "grad_norm": 0.9158850312232971, "learning_rate": 5.5737043662269626e-05, "loss": 0.7434, "step": 619 }, { "epoch": 0.004120956726631018, "grad_norm": 0.9305580258369446, "learning_rate": 5.5619259953555945e-05, "loss": 0.9014, "step": 620 }, { "epoch": 0.00412760343102881, "grad_norm": 0.697537899017334, "learning_rate": 5.55014446550399e-05, "loss": 0.9062, "step": 621 }, { "epoch": 0.004134250135426602, "grad_norm": 0.8496726155281067, "learning_rate": 5.538359842904392e-05, "loss": 0.823, "step": 622 }, { "epoch": 0.004140896839824394, "grad_norm": 0.9991664886474609, "learning_rate": 5.526572193806434e-05, "loss": 0.8625, "step": 623 }, { "epoch": 0.004147543544222186, "grad_norm": 0.7568731904029846, "learning_rate": 5.5147815844767604e-05, "loss": 0.5785, "step": 624 }, { "epoch": 0.004154190248619978, "grad_norm": 0.857988178730011, "learning_rate": 5.5029880811986544e-05, "loss": 0.8904, "step": 625 }, { "epoch": 0.00416083695301777, "grad_norm": 1.0831652879714966, "learning_rate": 5.491191750271677e-05, "loss": 0.9787, "step": 626 }, { "epoch": 0.004167483657415562, "grad_norm": 1.230855107307434, "learning_rate": 5.4793926580112755e-05, "loss": 0.6305, "step": 627 }, { "epoch": 0.004174130361813354, "grad_norm": 1.051511526107788, "learning_rate": 5.4675908707484294e-05, "loss": 0.7141, "step": 628 }, { "epoch": 0.004180777066211146, "grad_norm": 1.056456208229065, "learning_rate": 5.4557864548292616e-05, "loss": 0.7441, "step": 629 }, { "epoch": 0.004187423770608938, "grad_norm": 1.2184350490570068, "learning_rate": 5.4439794766146746e-05, "loss": 0.716, "step": 630 }, { "epoch": 0.0041940704750067295, "grad_norm": 1.0148402452468872, "learning_rate": 5.432170002479978e-05, "loss": 0.6101, "step": 631 }, { "epoch": 0.004200717179404522, "grad_norm": 1.0201961994171143, "learning_rate": 5.4203580988145095e-05, "loss": 0.9813, "step": 632 }, { "epoch": 0.004207363883802314, "grad_norm": 1.6791428327560425, "learning_rate": 5.408543832021268e-05, "loss": 0.8735, "step": 633 }, { "epoch": 0.004214010588200106, "grad_norm": 0.9905579686164856, "learning_rate": 5.3967272685165335e-05, "loss": 0.8519, "step": 634 }, { "epoch": 0.004220657292597897, "grad_norm": 0.928503155708313, "learning_rate": 5.384908474729501e-05, "loss": 0.9309, "step": 635 }, { "epoch": 0.00422730399699569, "grad_norm": 1.216955304145813, "learning_rate": 5.373087517101899e-05, "loss": 0.6677, "step": 636 }, { "epoch": 0.004233950701393482, "grad_norm": 1.135047435760498, "learning_rate": 5.361264462087627e-05, "loss": 0.6597, "step": 637 }, { "epoch": 0.0042405974057912734, "grad_norm": 1.2936071157455444, "learning_rate": 5.3494393761523685e-05, "loss": 0.7421, "step": 638 }, { "epoch": 0.004247244110189065, "grad_norm": 1.123592495918274, "learning_rate": 5.3376123257732293e-05, "loss": 0.6327, "step": 639 }, { "epoch": 0.004253890814586858, "grad_norm": 1.152856469154358, "learning_rate": 5.325783377438357e-05, "loss": 0.7773, "step": 640 }, { "epoch": 0.0042605375189846495, "grad_norm": 1.2354307174682617, "learning_rate": 5.313952597646568e-05, "loss": 0.6135, "step": 641 }, { "epoch": 0.004267184223382441, "grad_norm": 1.2968664169311523, "learning_rate": 5.3021200529069735e-05, "loss": 0.7562, "step": 642 }, { "epoch": 0.004273830927780233, "grad_norm": 1.130948543548584, "learning_rate": 5.290285809738612e-05, "loss": 0.7086, "step": 643 }, { "epoch": 0.004280477632178026, "grad_norm": 1.598926305770874, "learning_rate": 5.2784499346700634e-05, "loss": 0.8224, "step": 644 }, { "epoch": 0.004287124336575817, "grad_norm": 1.5448964834213257, "learning_rate": 5.266612494239088e-05, "loss": 0.8497, "step": 645 }, { "epoch": 0.004293771040973609, "grad_norm": 1.5256738662719727, "learning_rate": 5.254773554992242e-05, "loss": 0.6588, "step": 646 }, { "epoch": 0.004300417745371401, "grad_norm": 1.599063515663147, "learning_rate": 5.2429331834845066e-05, "loss": 0.9209, "step": 647 }, { "epoch": 0.0043070644497691934, "grad_norm": 1.5368719100952148, "learning_rate": 5.2310914462789176e-05, "loss": 0.557, "step": 648 }, { "epoch": 0.004313711154166985, "grad_norm": 2.103182554244995, "learning_rate": 5.219248409946188e-05, "loss": 0.7148, "step": 649 }, { "epoch": 0.004320357858564777, "grad_norm": 3.2987327575683594, "learning_rate": 5.207404141064334e-05, "loss": 1.0249, "step": 650 }, { "epoch": 0.004327004562962569, "grad_norm": 0.34882649779319763, "learning_rate": 5.1955587062183e-05, "loss": 0.9497, "step": 651 }, { "epoch": 0.004333651267360361, "grad_norm": 0.6303239464759827, "learning_rate": 5.1837121719995816e-05, "loss": 1.036, "step": 652 }, { "epoch": 0.004340297971758153, "grad_norm": 0.7038097977638245, "learning_rate": 5.1718646050058653e-05, "loss": 1.3745, "step": 653 }, { "epoch": 0.004346944676155945, "grad_norm": 0.5066866874694824, "learning_rate": 5.160016071840631e-05, "loss": 0.7574, "step": 654 }, { "epoch": 0.004353591380553737, "grad_norm": 0.6334048509597778, "learning_rate": 5.148166639112799e-05, "loss": 0.7552, "step": 655 }, { "epoch": 0.004360238084951529, "grad_norm": 0.5856367945671082, "learning_rate": 5.136316373436343e-05, "loss": 0.9462, "step": 656 }, { "epoch": 0.004366884789349321, "grad_norm": 0.6086112260818481, "learning_rate": 5.1244653414299196e-05, "loss": 1.0001, "step": 657 }, { "epoch": 0.004373531493747113, "grad_norm": 0.6570594906806946, "learning_rate": 5.1126136097164935e-05, "loss": 0.9825, "step": 658 }, { "epoch": 0.004380178198144905, "grad_norm": 0.7729535102844238, "learning_rate": 5.100761244922964e-05, "loss": 0.8703, "step": 659 }, { "epoch": 0.004386824902542697, "grad_norm": 0.6259683966636658, "learning_rate": 5.0889083136797875e-05, "loss": 0.749, "step": 660 }, { "epoch": 0.004393471606940489, "grad_norm": 0.6402033567428589, "learning_rate": 5.077054882620606e-05, "loss": 0.9971, "step": 661 }, { "epoch": 0.00440011831133828, "grad_norm": 0.6078352928161621, "learning_rate": 5.065201018381873e-05, "loss": 0.8893, "step": 662 }, { "epoch": 0.004406765015736073, "grad_norm": 0.8452315330505371, "learning_rate": 5.053346787602473e-05, "loss": 0.9117, "step": 663 }, { "epoch": 0.004413411720133865, "grad_norm": 0.7816033959388733, "learning_rate": 5.0414922569233555e-05, "loss": 0.7137, "step": 664 }, { "epoch": 0.0044200584245316565, "grad_norm": 0.765659749507904, "learning_rate": 5.029637492987153e-05, "loss": 0.9411, "step": 665 }, { "epoch": 0.004426705128929448, "grad_norm": 0.762501060962677, "learning_rate": 5.0177825624378114e-05, "loss": 0.9829, "step": 666 }, { "epoch": 0.004433351833327241, "grad_norm": 0.8198716044425964, "learning_rate": 5.0059275319202084e-05, "loss": 0.7453, "step": 667 }, { "epoch": 0.004439998537725033, "grad_norm": 0.6317304372787476, "learning_rate": 4.9940724680797914e-05, "loss": 0.8972, "step": 668 }, { "epoch": 0.004446645242122824, "grad_norm": 0.7842250466346741, "learning_rate": 4.98221743756219e-05, "loss": 0.7495, "step": 669 }, { "epoch": 0.004453291946520616, "grad_norm": 0.7934935092926025, "learning_rate": 4.970362507012848e-05, "loss": 0.8944, "step": 670 }, { "epoch": 0.004459938650918409, "grad_norm": 0.858690619468689, "learning_rate": 4.958507743076645e-05, "loss": 0.8814, "step": 671 }, { "epoch": 0.0044665853553162, "grad_norm": 0.7755974531173706, "learning_rate": 4.946653212397527e-05, "loss": 0.8873, "step": 672 }, { "epoch": 0.004473232059713992, "grad_norm": 0.995033860206604, "learning_rate": 4.934798981618127e-05, "loss": 0.7748, "step": 673 }, { "epoch": 0.004479878764111784, "grad_norm": 1.0964568853378296, "learning_rate": 4.922945117379394e-05, "loss": 0.629, "step": 674 }, { "epoch": 0.0044865254685095765, "grad_norm": 0.901854932308197, "learning_rate": 4.911091686320213e-05, "loss": 0.6407, "step": 675 }, { "epoch": 0.004493172172907368, "grad_norm": 0.9707088470458984, "learning_rate": 4.8992387550770373e-05, "loss": 0.8739, "step": 676 }, { "epoch": 0.00449981887730516, "grad_norm": 0.9366260766983032, "learning_rate": 4.887386390283508e-05, "loss": 0.7199, "step": 677 }, { "epoch": 0.004506465581702952, "grad_norm": 0.7978320717811584, "learning_rate": 4.875534658570081e-05, "loss": 0.735, "step": 678 }, { "epoch": 0.004513112286100744, "grad_norm": 0.917753279209137, "learning_rate": 4.863683626563657e-05, "loss": 0.6231, "step": 679 }, { "epoch": 0.004519758990498536, "grad_norm": 0.9080862402915955, "learning_rate": 4.851833360887201e-05, "loss": 0.9354, "step": 680 }, { "epoch": 0.004526405694896328, "grad_norm": 0.7667110562324524, "learning_rate": 4.839983928159371e-05, "loss": 0.7752, "step": 681 }, { "epoch": 0.00453305239929412, "grad_norm": 1.119476318359375, "learning_rate": 4.828135394994137e-05, "loss": 0.9401, "step": 682 }, { "epoch": 0.004539699103691912, "grad_norm": 1.1330437660217285, "learning_rate": 4.816287828000418e-05, "loss": 0.8267, "step": 683 }, { "epoch": 0.004546345808089704, "grad_norm": 0.904588520526886, "learning_rate": 4.804441293781702e-05, "loss": 0.8909, "step": 684 }, { "epoch": 0.004552992512487496, "grad_norm": 1.2153210639953613, "learning_rate": 4.7925958589356675e-05, "loss": 0.9012, "step": 685 }, { "epoch": 0.004559639216885288, "grad_norm": 1.0748893022537231, "learning_rate": 4.780751590053813e-05, "loss": 0.6757, "step": 686 }, { "epoch": 0.00456628592128308, "grad_norm": 1.0900492668151855, "learning_rate": 4.768908553721085e-05, "loss": 0.7261, "step": 687 }, { "epoch": 0.004572932625680872, "grad_norm": 1.0078903436660767, "learning_rate": 4.757066816515494e-05, "loss": 0.8643, "step": 688 }, { "epoch": 0.0045795793300786635, "grad_norm": 1.585972547531128, "learning_rate": 4.74522644500776e-05, "loss": 0.8907, "step": 689 }, { "epoch": 0.004586226034476456, "grad_norm": 1.3707166910171509, "learning_rate": 4.7333875057609126e-05, "loss": 0.8671, "step": 690 }, { "epoch": 0.004592872738874248, "grad_norm": 1.412148118019104, "learning_rate": 4.721550065329938e-05, "loss": 0.9209, "step": 691 }, { "epoch": 0.0045995194432720396, "grad_norm": 1.1696654558181763, "learning_rate": 4.70971419026139e-05, "loss": 0.6559, "step": 692 }, { "epoch": 0.004606166147669831, "grad_norm": 1.614428997039795, "learning_rate": 4.697879947093027e-05, "loss": 0.9432, "step": 693 }, { "epoch": 0.004612812852067624, "grad_norm": 1.4621384143829346, "learning_rate": 4.6860474023534335e-05, "loss": 0.6001, "step": 694 }, { "epoch": 0.004619459556465416, "grad_norm": 1.82318913936615, "learning_rate": 4.674216622561644e-05, "loss": 0.9496, "step": 695 }, { "epoch": 0.004626106260863207, "grad_norm": 1.641218662261963, "learning_rate": 4.662387674226771e-05, "loss": 0.8027, "step": 696 }, { "epoch": 0.004632752965260999, "grad_norm": 3.2597484588623047, "learning_rate": 4.6505606238476326e-05, "loss": 1.388, "step": 697 }, { "epoch": 0.004639399669658792, "grad_norm": 2.241243839263916, "learning_rate": 4.6387355379123734e-05, "loss": 1.2085, "step": 698 }, { "epoch": 0.0046460463740565835, "grad_norm": 2.4520256519317627, "learning_rate": 4.6269124828981014e-05, "loss": 0.5715, "step": 699 }, { "epoch": 0.004652693078454375, "grad_norm": 2.561048984527588, "learning_rate": 4.615091525270501e-05, "loss": 0.8268, "step": 700 }, { "epoch": 0.004659339782852167, "grad_norm": 0.3658062219619751, "learning_rate": 4.603272731483467e-05, "loss": 1.0607, "step": 701 }, { "epoch": 0.00466598648724996, "grad_norm": 0.5691250562667847, "learning_rate": 4.5914561679787345e-05, "loss": 1.1228, "step": 702 }, { "epoch": 0.004672633191647751, "grad_norm": 0.4382160007953644, "learning_rate": 4.579641901185491e-05, "loss": 0.8865, "step": 703 }, { "epoch": 0.004679279896045543, "grad_norm": 0.6933655142784119, "learning_rate": 4.567829997520023e-05, "loss": 0.7971, "step": 704 }, { "epoch": 0.004685926600443335, "grad_norm": 0.6793907880783081, "learning_rate": 4.5560205233853266e-05, "loss": 0.9222, "step": 705 }, { "epoch": 0.004692573304841127, "grad_norm": 0.842363178730011, "learning_rate": 4.544213545170741e-05, "loss": 0.8264, "step": 706 }, { "epoch": 0.004699220009238919, "grad_norm": 0.6511426568031311, "learning_rate": 4.5324091292515724e-05, "loss": 1.1472, "step": 707 }, { "epoch": 0.004705866713636711, "grad_norm": 0.5798910856246948, "learning_rate": 4.520607341988724e-05, "loss": 0.8783, "step": 708 }, { "epoch": 0.004712513418034503, "grad_norm": 0.5842803120613098, "learning_rate": 4.5088082497283235e-05, "loss": 1.0191, "step": 709 }, { "epoch": 0.004719160122432295, "grad_norm": 0.5944225788116455, "learning_rate": 4.497011918801347e-05, "loss": 0.8818, "step": 710 }, { "epoch": 0.004725806826830087, "grad_norm": 0.6699796319007874, "learning_rate": 4.485218415523242e-05, "loss": 0.8876, "step": 711 }, { "epoch": 0.004732453531227879, "grad_norm": 0.6302338242530823, "learning_rate": 4.473427806193567e-05, "loss": 0.8118, "step": 712 }, { "epoch": 0.004739100235625671, "grad_norm": 0.5817951560020447, "learning_rate": 4.4616401570956075e-05, "loss": 0.8042, "step": 713 }, { "epoch": 0.004745746940023463, "grad_norm": 0.8302817940711975, "learning_rate": 4.4498555344960115e-05, "loss": 0.5791, "step": 714 }, { "epoch": 0.004752393644421255, "grad_norm": 0.6934455633163452, "learning_rate": 4.4380740046444066e-05, "loss": 0.8314, "step": 715 }, { "epoch": 0.0047590403488190465, "grad_norm": 0.8211054801940918, "learning_rate": 4.4262956337730385e-05, "loss": 0.8583, "step": 716 }, { "epoch": 0.004765687053216839, "grad_norm": 0.8365226984024048, "learning_rate": 4.4145204880963945e-05, "loss": 0.7647, "step": 717 }, { "epoch": 0.004772333757614631, "grad_norm": 0.7704525589942932, "learning_rate": 4.402748633810826e-05, "loss": 0.8246, "step": 718 }, { "epoch": 0.004778980462012423, "grad_norm": 0.9756360054016113, "learning_rate": 4.390980137094187e-05, "loss": 0.7665, "step": 719 }, { "epoch": 0.004785627166410214, "grad_norm": 0.8577277660369873, "learning_rate": 4.379215064105454e-05, "loss": 0.7484, "step": 720 }, { "epoch": 0.004792273870808007, "grad_norm": 0.7188462615013123, "learning_rate": 4.367453480984356e-05, "loss": 0.8432, "step": 721 }, { "epoch": 0.004798920575205799, "grad_norm": 0.82792067527771, "learning_rate": 4.355695453851e-05, "loss": 0.9031, "step": 722 }, { "epoch": 0.0048055672796035905, "grad_norm": 0.7311772704124451, "learning_rate": 4.3439410488055075e-05, "loss": 0.7162, "step": 723 }, { "epoch": 0.004812213984001382, "grad_norm": 0.7862158417701721, "learning_rate": 4.332190331927636e-05, "loss": 0.9094, "step": 724 }, { "epoch": 0.004818860688399175, "grad_norm": 0.8980942964553833, "learning_rate": 4.3204433692764096e-05, "loss": 0.9385, "step": 725 }, { "epoch": 0.0048255073927969665, "grad_norm": 0.8081440925598145, "learning_rate": 4.3087002268897434e-05, "loss": 0.81, "step": 726 }, { "epoch": 0.004832154097194758, "grad_norm": 1.0013748407363892, "learning_rate": 4.296960970784082e-05, "loss": 0.7476, "step": 727 }, { "epoch": 0.00483880080159255, "grad_norm": 0.9266440272331238, "learning_rate": 4.285225666954016e-05, "loss": 0.8057, "step": 728 }, { "epoch": 0.004845447505990343, "grad_norm": 0.9051468968391418, "learning_rate": 4.273494381371926e-05, "loss": 0.939, "step": 729 }, { "epoch": 0.004852094210388134, "grad_norm": 0.8008094429969788, "learning_rate": 4.2617671799875944e-05, "loss": 0.8005, "step": 730 }, { "epoch": 0.004858740914785926, "grad_norm": 0.9573123455047607, "learning_rate": 4.2500441287278505e-05, "loss": 0.7531, "step": 731 }, { "epoch": 0.004865387619183718, "grad_norm": 1.2024757862091064, "learning_rate": 4.238325293496186e-05, "loss": 0.6332, "step": 732 }, { "epoch": 0.0048720343235815105, "grad_norm": 1.0710391998291016, "learning_rate": 4.226610740172396e-05, "loss": 0.9639, "step": 733 }, { "epoch": 0.004878681027979302, "grad_norm": 1.2133551836013794, "learning_rate": 4.214900534612205e-05, "loss": 0.5005, "step": 734 }, { "epoch": 0.004885327732377094, "grad_norm": 1.379006266593933, "learning_rate": 4.203194742646893e-05, "loss": 0.8146, "step": 735 }, { "epoch": 0.004891974436774886, "grad_norm": 1.496587872505188, "learning_rate": 4.191493430082929e-05, "loss": 0.7647, "step": 736 }, { "epoch": 0.004898621141172678, "grad_norm": 1.0434892177581787, "learning_rate": 4.179796662701597e-05, "loss": 0.8637, "step": 737 }, { "epoch": 0.00490526784557047, "grad_norm": 1.0825583934783936, "learning_rate": 4.168104506258636e-05, "loss": 0.6391, "step": 738 }, { "epoch": 0.004911914549968262, "grad_norm": 0.9926093220710754, "learning_rate": 4.1564170264838595e-05, "loss": 0.8085, "step": 739 }, { "epoch": 0.004918561254366054, "grad_norm": 1.1647216081619263, "learning_rate": 4.144734289080791e-05, "loss": 0.6383, "step": 740 }, { "epoch": 0.004925207958763846, "grad_norm": 1.0999730825424194, "learning_rate": 4.133056359726289e-05, "loss": 0.7778, "step": 741 }, { "epoch": 0.004931854663161638, "grad_norm": 1.5499298572540283, "learning_rate": 4.121383304070191e-05, "loss": 0.7831, "step": 742 }, { "epoch": 0.00493850136755943, "grad_norm": 1.425614595413208, "learning_rate": 4.1097151877349316e-05, "loss": 0.8449, "step": 743 }, { "epoch": 0.004945148071957222, "grad_norm": 1.5261919498443604, "learning_rate": 4.0980520763151796e-05, "loss": 0.7073, "step": 744 }, { "epoch": 0.004951794776355014, "grad_norm": 1.5911787748336792, "learning_rate": 4.086394035377463e-05, "loss": 0.6253, "step": 745 }, { "epoch": 0.004958441480752806, "grad_norm": 1.6266251802444458, "learning_rate": 4.074741130459813e-05, "loss": 0.8432, "step": 746 }, { "epoch": 0.004965088185150597, "grad_norm": 1.2186232805252075, "learning_rate": 4.063093427071376e-05, "loss": 0.6454, "step": 747 }, { "epoch": 0.00497173488954839, "grad_norm": 1.5160404443740845, "learning_rate": 4.051450990692073e-05, "loss": 0.805, "step": 748 }, { "epoch": 0.004978381593946182, "grad_norm": 2.0402166843414307, "learning_rate": 4.039813886772201e-05, "loss": 0.8304, "step": 749 }, { "epoch": 0.0049850282983439735, "grad_norm": 2.949624538421631, "learning_rate": 4.028182180732088e-05, "loss": 0.6917, "step": 750 }, { "epoch": 0.004991675002741765, "grad_norm": 0.35819950699806213, "learning_rate": 4.016555937961711e-05, "loss": 0.8131, "step": 751 }, { "epoch": 0.004998321707139558, "grad_norm": 0.43927833437919617, "learning_rate": 4.0049352238203376e-05, "loss": 0.857, "step": 752 }, { "epoch": 0.00500496841153735, "grad_norm": 0.4738449156284332, "learning_rate": 3.9933201036361554e-05, "loss": 0.8975, "step": 753 }, { "epoch": 0.005011615115935141, "grad_norm": 1.1976778507232666, "learning_rate": 3.981710642705906e-05, "loss": 0.7099, "step": 754 }, { "epoch": 0.005018261820332933, "grad_norm": 0.5854752063751221, "learning_rate": 3.970106906294509e-05, "loss": 0.9164, "step": 755 }, { "epoch": 0.005024908524730726, "grad_norm": 0.5547404289245605, "learning_rate": 3.9585089596347094e-05, "loss": 0.8185, "step": 756 }, { "epoch": 0.005031555229128517, "grad_norm": 0.7054506540298462, "learning_rate": 3.946916867926702e-05, "loss": 0.9904, "step": 757 }, { "epoch": 0.005038201933526309, "grad_norm": 0.5721423029899597, "learning_rate": 3.935330696337768e-05, "loss": 0.8113, "step": 758 }, { "epoch": 0.005044848637924101, "grad_norm": 0.7183482050895691, "learning_rate": 3.923750510001903e-05, "loss": 1.132, "step": 759 }, { "epoch": 0.0050514953423218935, "grad_norm": 0.6827608346939087, "learning_rate": 3.9121763740194616e-05, "loss": 0.8093, "step": 760 }, { "epoch": 0.005058142046719685, "grad_norm": 0.6405389308929443, "learning_rate": 3.900608353456782e-05, "loss": 0.7248, "step": 761 }, { "epoch": 0.005064788751117477, "grad_norm": 0.9901874661445618, "learning_rate": 3.889046513345818e-05, "loss": 0.6536, "step": 762 }, { "epoch": 0.005071435455515269, "grad_norm": 0.7273656725883484, "learning_rate": 3.87749091868379e-05, "loss": 0.7783, "step": 763 }, { "epoch": 0.005078082159913061, "grad_norm": 0.7552725672721863, "learning_rate": 3.8659416344328e-05, "loss": 0.8353, "step": 764 }, { "epoch": 0.005084728864310853, "grad_norm": 0.9276883602142334, "learning_rate": 3.854398725519479e-05, "loss": 0.804, "step": 765 }, { "epoch": 0.005091375568708645, "grad_norm": 0.6831343173980713, "learning_rate": 3.842862256834613e-05, "loss": 0.8294, "step": 766 }, { "epoch": 0.005098022273106437, "grad_norm": 0.8653516173362732, "learning_rate": 3.831332293232787e-05, "loss": 0.7905, "step": 767 }, { "epoch": 0.005104668977504229, "grad_norm": 0.788194477558136, "learning_rate": 3.819808899532015e-05, "loss": 0.896, "step": 768 }, { "epoch": 0.005111315681902021, "grad_norm": 0.8511560559272766, "learning_rate": 3.808292140513379e-05, "loss": 0.7863, "step": 769 }, { "epoch": 0.005117962386299813, "grad_norm": 0.868413507938385, "learning_rate": 3.796782080920659e-05, "loss": 0.8043, "step": 770 }, { "epoch": 0.005124609090697605, "grad_norm": 0.8420473337173462, "learning_rate": 3.7852787854599746e-05, "loss": 0.9915, "step": 771 }, { "epoch": 0.005131255795095397, "grad_norm": 0.8845599293708801, "learning_rate": 3.77378231879942e-05, "loss": 0.7352, "step": 772 }, { "epoch": 0.005137902499493189, "grad_norm": 0.820316731929779, "learning_rate": 3.762292745568702e-05, "loss": 0.7933, "step": 773 }, { "epoch": 0.0051445492038909805, "grad_norm": 1.0592056512832642, "learning_rate": 3.750810130358769e-05, "loss": 0.8795, "step": 774 }, { "epoch": 0.005151195908288773, "grad_norm": 0.7946357131004333, "learning_rate": 3.7393345377214586e-05, "loss": 0.6885, "step": 775 }, { "epoch": 0.005157842612686565, "grad_norm": 0.8453227877616882, "learning_rate": 3.727866032169127e-05, "loss": 0.7128, "step": 776 }, { "epoch": 0.005164489317084357, "grad_norm": 1.225663185119629, "learning_rate": 3.716404678174288e-05, "loss": 0.9135, "step": 777 }, { "epoch": 0.005171136021482148, "grad_norm": 0.8996406197547913, "learning_rate": 3.704950540169256e-05, "loss": 0.8883, "step": 778 }, { "epoch": 0.005177782725879941, "grad_norm": 0.9237585067749023, "learning_rate": 3.693503682545775e-05, "loss": 0.8482, "step": 779 }, { "epoch": 0.005184429430277733, "grad_norm": 1.2108267545700073, "learning_rate": 3.682064169654663e-05, "loss": 0.8083, "step": 780 }, { "epoch": 0.005191076134675524, "grad_norm": 0.9889872670173645, "learning_rate": 3.670632065805445e-05, "loss": 0.9583, "step": 781 }, { "epoch": 0.005197722839073316, "grad_norm": 0.9854968190193176, "learning_rate": 3.659207435265998e-05, "loss": 0.5858, "step": 782 }, { "epoch": 0.005204369543471109, "grad_norm": 0.9853740930557251, "learning_rate": 3.6477903422621865e-05, "loss": 0.7543, "step": 783 }, { "epoch": 0.0052110162478689005, "grad_norm": 1.0569560527801514, "learning_rate": 3.6363808509774996e-05, "loss": 0.9121, "step": 784 }, { "epoch": 0.005217662952266692, "grad_norm": 1.209022879600525, "learning_rate": 3.6249790255526915e-05, "loss": 0.9692, "step": 785 }, { "epoch": 0.005224309656664484, "grad_norm": 0.822544515132904, "learning_rate": 3.6135849300854225e-05, "loss": 0.5065, "step": 786 }, { "epoch": 0.005230956361062277, "grad_norm": 1.006661057472229, "learning_rate": 3.602198628629893e-05, "loss": 0.7771, "step": 787 }, { "epoch": 0.005237603065460068, "grad_norm": 1.003117322921753, "learning_rate": 3.590820185196497e-05, "loss": 0.7955, "step": 788 }, { "epoch": 0.00524424976985786, "grad_norm": 1.7347395420074463, "learning_rate": 3.579449663751444e-05, "loss": 0.7385, "step": 789 }, { "epoch": 0.005250896474255652, "grad_norm": 1.0848127603530884, "learning_rate": 3.5680871282164144e-05, "loss": 0.5635, "step": 790 }, { "epoch": 0.005257543178653444, "grad_norm": 1.8770695924758911, "learning_rate": 3.556732642468189e-05, "loss": 0.9407, "step": 791 }, { "epoch": 0.005264189883051236, "grad_norm": 1.2975075244903564, "learning_rate": 3.545386270338299e-05, "loss": 0.8492, "step": 792 }, { "epoch": 0.005270836587449028, "grad_norm": 1.0128837823867798, "learning_rate": 3.5340480756126624e-05, "loss": 0.6228, "step": 793 }, { "epoch": 0.00527748329184682, "grad_norm": 1.1561847925186157, "learning_rate": 3.522718122031229e-05, "loss": 0.6094, "step": 794 }, { "epoch": 0.005284129996244612, "grad_norm": 1.5360347032546997, "learning_rate": 3.5113964732876106e-05, "loss": 0.865, "step": 795 }, { "epoch": 0.005290776700642404, "grad_norm": 1.4779386520385742, "learning_rate": 3.500083193028741e-05, "loss": 0.6847, "step": 796 }, { "epoch": 0.005297423405040196, "grad_norm": 1.67745041847229, "learning_rate": 3.488778344854504e-05, "loss": 0.6983, "step": 797 }, { "epoch": 0.005304070109437988, "grad_norm": 1.8071433305740356, "learning_rate": 3.477481992317384e-05, "loss": 0.7865, "step": 798 }, { "epoch": 0.00531071681383578, "grad_norm": 2.242330312728882, "learning_rate": 3.466194198922102e-05, "loss": 0.9295, "step": 799 }, { "epoch": 0.005317363518233572, "grad_norm": 3.3643980026245117, "learning_rate": 3.4549150281252636e-05, "loss": 0.7402, "step": 800 }, { "epoch": 0.0053240102226313635, "grad_norm": 0.3782943785190582, "learning_rate": 3.4436445433350015e-05, "loss": 0.9188, "step": 801 }, { "epoch": 0.005330656927029156, "grad_norm": 0.539129376411438, "learning_rate": 3.432382807910615e-05, "loss": 0.8105, "step": 802 }, { "epoch": 0.005337303631426948, "grad_norm": 0.8282445669174194, "learning_rate": 3.421129885162223e-05, "loss": 0.7792, "step": 803 }, { "epoch": 0.00534395033582474, "grad_norm": 0.5992372035980225, "learning_rate": 3.409885838350397e-05, "loss": 0.6944, "step": 804 }, { "epoch": 0.005350597040222531, "grad_norm": 0.5927280187606812, "learning_rate": 3.3986507306858125e-05, "loss": 0.8305, "step": 805 }, { "epoch": 0.005357243744620324, "grad_norm": 0.6903488636016846, "learning_rate": 3.387424625328892e-05, "loss": 0.8415, "step": 806 }, { "epoch": 0.005363890449018116, "grad_norm": 0.5096772313117981, "learning_rate": 3.376207585389447e-05, "loss": 0.8794, "step": 807 }, { "epoch": 0.0053705371534159075, "grad_norm": 0.6946663856506348, "learning_rate": 3.3649996739263326e-05, "loss": 1.0822, "step": 808 }, { "epoch": 0.005377183857813699, "grad_norm": 0.6458315253257751, "learning_rate": 3.353800953947079e-05, "loss": 0.8732, "step": 809 }, { "epoch": 0.005383830562211492, "grad_norm": 0.5500379204750061, "learning_rate": 3.342611488407549e-05, "loss": 0.9068, "step": 810 }, { "epoch": 0.0053904772666092836, "grad_norm": 0.8139527440071106, "learning_rate": 3.331431340211576e-05, "loss": 0.9336, "step": 811 }, { "epoch": 0.005397123971007075, "grad_norm": 0.6902498602867126, "learning_rate": 3.3202605722106186e-05, "loss": 0.8824, "step": 812 }, { "epoch": 0.005403770675404867, "grad_norm": 0.6358487606048584, "learning_rate": 3.309099247203401e-05, "loss": 0.9743, "step": 813 }, { "epoch": 0.00541041737980266, "grad_norm": 0.8419442176818848, "learning_rate": 3.297947427935559e-05, "loss": 0.8796, "step": 814 }, { "epoch": 0.005417064084200451, "grad_norm": 0.8624058365821838, "learning_rate": 3.286805177099293e-05, "loss": 0.8735, "step": 815 }, { "epoch": 0.005423710788598243, "grad_norm": 1.113830327987671, "learning_rate": 3.275672557333015e-05, "loss": 0.8796, "step": 816 }, { "epoch": 0.005430357492996035, "grad_norm": 0.7120754718780518, "learning_rate": 3.264549631220985e-05, "loss": 0.8, "step": 817 }, { "epoch": 0.0054370041973938275, "grad_norm": 0.8254979252815247, "learning_rate": 3.253436461292978e-05, "loss": 0.702, "step": 818 }, { "epoch": 0.005443650901791619, "grad_norm": 0.7647942900657654, "learning_rate": 3.2423331100239177e-05, "loss": 0.8197, "step": 819 }, { "epoch": 0.005450297606189411, "grad_norm": 0.6956480145454407, "learning_rate": 3.231239639833531e-05, "loss": 0.8276, "step": 820 }, { "epoch": 0.005456944310587203, "grad_norm": 0.9934280514717102, "learning_rate": 3.2201561130859957e-05, "loss": 0.8602, "step": 821 }, { "epoch": 0.005463591014984995, "grad_norm": 0.9009010791778564, "learning_rate": 3.209082592089591e-05, "loss": 0.8127, "step": 822 }, { "epoch": 0.005470237719382787, "grad_norm": 1.0514200925827026, "learning_rate": 3.1980191390963484e-05, "loss": 0.9751, "step": 823 }, { "epoch": 0.005476884423780579, "grad_norm": 1.3160456418991089, "learning_rate": 3.1869658163017e-05, "loss": 0.8358, "step": 824 }, { "epoch": 0.005483531128178371, "grad_norm": 1.054132342338562, "learning_rate": 3.175922685844125e-05, "loss": 0.6201, "step": 825 }, { "epoch": 0.005490177832576163, "grad_norm": 0.984657883644104, "learning_rate": 3.164889809804808e-05, "loss": 1.0336, "step": 826 }, { "epoch": 0.005496824536973955, "grad_norm": 1.1126635074615479, "learning_rate": 3.1538672502072817e-05, "loss": 0.8523, "step": 827 }, { "epoch": 0.005503471241371747, "grad_norm": 0.7525351047515869, "learning_rate": 3.142855069017093e-05, "loss": 0.568, "step": 828 }, { "epoch": 0.005510117945769539, "grad_norm": 0.9859227538108826, "learning_rate": 3.131853328141434e-05, "loss": 0.7202, "step": 829 }, { "epoch": 0.005516764650167331, "grad_norm": 0.9517953991889954, "learning_rate": 3.12086208942881e-05, "loss": 0.7246, "step": 830 }, { "epoch": 0.005523411354565123, "grad_norm": 0.9761250019073486, "learning_rate": 3.1098814146686834e-05, "loss": 1.0411, "step": 831 }, { "epoch": 0.0055300580589629144, "grad_norm": 1.040299415588379, "learning_rate": 3.098911365591129e-05, "loss": 0.8767, "step": 832 }, { "epoch": 0.005536704763360707, "grad_norm": 0.9226474165916443, "learning_rate": 3.087952003866492e-05, "loss": 0.6856, "step": 833 }, { "epoch": 0.005543351467758499, "grad_norm": 0.9735934138298035, "learning_rate": 3.077003391105033e-05, "loss": 0.4014, "step": 834 }, { "epoch": 0.0055499981721562905, "grad_norm": 1.027713418006897, "learning_rate": 3.0660655888565825e-05, "loss": 0.8747, "step": 835 }, { "epoch": 0.005556644876554082, "grad_norm": 1.1405994892120361, "learning_rate": 3.055138658610202e-05, "loss": 0.6844, "step": 836 }, { "epoch": 0.005563291580951875, "grad_norm": 1.0301321744918823, "learning_rate": 3.0442226617938307e-05, "loss": 0.6939, "step": 837 }, { "epoch": 0.005569938285349667, "grad_norm": 1.1772515773773193, "learning_rate": 3.0333176597739476e-05, "loss": 0.4741, "step": 838 }, { "epoch": 0.005576584989747458, "grad_norm": 1.322670340538025, "learning_rate": 3.022423713855218e-05, "loss": 0.6345, "step": 839 }, { "epoch": 0.00558323169414525, "grad_norm": 1.449883222579956, "learning_rate": 3.0115408852801535e-05, "loss": 1.018, "step": 840 }, { "epoch": 0.005589878398543043, "grad_norm": 1.06947660446167, "learning_rate": 3.00066923522877e-05, "loss": 0.6099, "step": 841 }, { "epoch": 0.0055965251029408344, "grad_norm": 1.6659729480743408, "learning_rate": 2.9898088248182366e-05, "loss": 0.6934, "step": 842 }, { "epoch": 0.005603171807338626, "grad_norm": 1.4211032390594482, "learning_rate": 2.9789597151025438e-05, "loss": 0.596, "step": 843 }, { "epoch": 0.005609818511736418, "grad_norm": 1.7341675758361816, "learning_rate": 2.968121967072145e-05, "loss": 0.9223, "step": 844 }, { "epoch": 0.0056164652161342105, "grad_norm": 1.3527710437774658, "learning_rate": 2.9572956416536267e-05, "loss": 0.4806, "step": 845 }, { "epoch": 0.005623111920532002, "grad_norm": 1.7114077806472778, "learning_rate": 2.946480799709358e-05, "loss": 0.7749, "step": 846 }, { "epoch": 0.005629758624929794, "grad_norm": 2.36133074760437, "learning_rate": 2.935677502037152e-05, "loss": 0.9725, "step": 847 }, { "epoch": 0.005636405329327586, "grad_norm": 1.736711025238037, "learning_rate": 2.924885809369926e-05, "loss": 0.9024, "step": 848 }, { "epoch": 0.005643052033725378, "grad_norm": 2.3337650299072266, "learning_rate": 2.9141057823753548e-05, "loss": 0.6143, "step": 849 }, { "epoch": 0.00564969873812317, "grad_norm": 4.121450424194336, "learning_rate": 2.9033374816555338e-05, "loss": 1.1379, "step": 850 }, { "epoch": 0.005656345442520962, "grad_norm": 0.3683931827545166, "learning_rate": 2.892580967746633e-05, "loss": 0.9842, "step": 851 }, { "epoch": 0.005662992146918754, "grad_norm": 0.49938151240348816, "learning_rate": 2.8818363011185647e-05, "loss": 0.9464, "step": 852 }, { "epoch": 0.005669638851316546, "grad_norm": 0.6275522708892822, "learning_rate": 2.8711035421746367e-05, "loss": 0.7492, "step": 853 }, { "epoch": 0.005676285555714338, "grad_norm": 0.5167999267578125, "learning_rate": 2.86038275125122e-05, "loss": 0.6106, "step": 854 }, { "epoch": 0.00568293226011213, "grad_norm": 0.7422475814819336, "learning_rate": 2.8496739886173995e-05, "loss": 0.713, "step": 855 }, { "epoch": 0.005689578964509922, "grad_norm": 0.6441875100135803, "learning_rate": 2.83897731447464e-05, "loss": 0.6015, "step": 856 }, { "epoch": 0.005696225668907714, "grad_norm": 0.7126911878585815, "learning_rate": 2.8282927889564542e-05, "loss": 0.7766, "step": 857 }, { "epoch": 0.005702872373305506, "grad_norm": 0.7499037981033325, "learning_rate": 2.8176204721280563e-05, "loss": 0.6112, "step": 858 }, { "epoch": 0.0057095190777032975, "grad_norm": 0.5948365330696106, "learning_rate": 2.806960423986026e-05, "loss": 0.8759, "step": 859 }, { "epoch": 0.00571616578210109, "grad_norm": 0.6240098476409912, "learning_rate": 2.7963127044579697e-05, "loss": 0.9862, "step": 860 }, { "epoch": 0.005722812486498882, "grad_norm": 0.7282832264900208, "learning_rate": 2.7856773734021913e-05, "loss": 0.6017, "step": 861 }, { "epoch": 0.005729459190896674, "grad_norm": 0.6713678240776062, "learning_rate": 2.7750544906073463e-05, "loss": 0.8692, "step": 862 }, { "epoch": 0.005736105895294465, "grad_norm": 0.6113742589950562, "learning_rate": 2.764444115792113e-05, "loss": 0.9001, "step": 863 }, { "epoch": 0.005742752599692258, "grad_norm": 0.6918427348136902, "learning_rate": 2.7538463086048504e-05, "loss": 0.8363, "step": 864 }, { "epoch": 0.00574939930409005, "grad_norm": 0.9629685282707214, "learning_rate": 2.743261128623269e-05, "loss": 0.9178, "step": 865 }, { "epoch": 0.005756046008487841, "grad_norm": 0.7158761024475098, "learning_rate": 2.7326886353540916e-05, "loss": 0.7963, "step": 866 }, { "epoch": 0.005762692712885633, "grad_norm": 0.6633831858634949, "learning_rate": 2.7221288882327167e-05, "loss": 0.7058, "step": 867 }, { "epoch": 0.005769339417283426, "grad_norm": 0.7890012264251709, "learning_rate": 2.711581946622893e-05, "loss": 0.7596, "step": 868 }, { "epoch": 0.0057759861216812175, "grad_norm": 0.743654727935791, "learning_rate": 2.701047869816381e-05, "loss": 0.9293, "step": 869 }, { "epoch": 0.005782632826079009, "grad_norm": 0.9647628664970398, "learning_rate": 2.6905267170326143e-05, "loss": 0.7642, "step": 870 }, { "epoch": 0.005789279530476801, "grad_norm": 0.7934918403625488, "learning_rate": 2.6800185474183725e-05, "loss": 0.9243, "step": 871 }, { "epoch": 0.005795926234874594, "grad_norm": 0.8000086545944214, "learning_rate": 2.6695234200474495e-05, "loss": 0.9556, "step": 872 }, { "epoch": 0.005802572939272385, "grad_norm": 0.774133563041687, "learning_rate": 2.6590413939203228e-05, "loss": 0.7859, "step": 873 }, { "epoch": 0.005809219643670177, "grad_norm": 0.994034469127655, "learning_rate": 2.6485725279638113e-05, "loss": 0.8363, "step": 874 }, { "epoch": 0.005815866348067969, "grad_norm": 0.8750971555709839, "learning_rate": 2.6381168810307533e-05, "loss": 0.92, "step": 875 }, { "epoch": 0.005822513052465761, "grad_norm": 1.1995213031768799, "learning_rate": 2.627674511899678e-05, "loss": 0.7646, "step": 876 }, { "epoch": 0.005829159756863553, "grad_norm": 0.8142304420471191, "learning_rate": 2.6172454792744706e-05, "loss": 0.867, "step": 877 }, { "epoch": 0.005835806461261345, "grad_norm": 0.9330615401268005, "learning_rate": 2.6068298417840364e-05, "loss": 0.8755, "step": 878 }, { "epoch": 0.005842453165659137, "grad_norm": 1.0684067010879517, "learning_rate": 2.596427657981979e-05, "loss": 0.8391, "step": 879 }, { "epoch": 0.005849099870056929, "grad_norm": 0.7989206910133362, "learning_rate": 2.5860389863462765e-05, "loss": 0.7431, "step": 880 }, { "epoch": 0.005855746574454721, "grad_norm": 0.8344511389732361, "learning_rate": 2.575663885278935e-05, "loss": 0.725, "step": 881 }, { "epoch": 0.005862393278852513, "grad_norm": 1.2674167156219482, "learning_rate": 2.5653024131056802e-05, "loss": 0.9218, "step": 882 }, { "epoch": 0.005869039983250305, "grad_norm": 1.206439733505249, "learning_rate": 2.5549546280756137e-05, "loss": 0.7342, "step": 883 }, { "epoch": 0.005875686687648097, "grad_norm": 1.1278153657913208, "learning_rate": 2.544620588360897e-05, "loss": 0.8005, "step": 884 }, { "epoch": 0.005882333392045889, "grad_norm": 1.1391501426696777, "learning_rate": 2.5343003520564158e-05, "loss": 0.8529, "step": 885 }, { "epoch": 0.0058889800964436806, "grad_norm": 0.9958661794662476, "learning_rate": 2.5239939771794563e-05, "loss": 0.7329, "step": 886 }, { "epoch": 0.005895626800841473, "grad_norm": 1.1319859027862549, "learning_rate": 2.5137015216693848e-05, "loss": 0.7973, "step": 887 }, { "epoch": 0.005902273505239265, "grad_norm": 1.1556273698806763, "learning_rate": 2.5034230433873153e-05, "loss": 0.8337, "step": 888 }, { "epoch": 0.005908920209637057, "grad_norm": 1.0099126100540161, "learning_rate": 2.4931586001157835e-05, "loss": 0.7135, "step": 889 }, { "epoch": 0.005915566914034848, "grad_norm": 1.1238211393356323, "learning_rate": 2.4829082495584242e-05, "loss": 0.5285, "step": 890 }, { "epoch": 0.005922213618432641, "grad_norm": 1.2216733694076538, "learning_rate": 2.4726720493396512e-05, "loss": 0.7336, "step": 891 }, { "epoch": 0.005928860322830433, "grad_norm": 1.766696572303772, "learning_rate": 2.4624500570043278e-05, "loss": 0.8474, "step": 892 }, { "epoch": 0.0059355070272282245, "grad_norm": 1.1542131900787354, "learning_rate": 2.4522423300174437e-05, "loss": 0.7923, "step": 893 }, { "epoch": 0.005942153731626016, "grad_norm": 1.332521915435791, "learning_rate": 2.4420489257637906e-05, "loss": 0.7754, "step": 894 }, { "epoch": 0.005948800436023809, "grad_norm": 1.6716405153274536, "learning_rate": 2.4318699015476493e-05, "loss": 0.7301, "step": 895 }, { "epoch": 0.0059554471404216006, "grad_norm": 1.6850249767303467, "learning_rate": 2.421705314592452e-05, "loss": 0.7965, "step": 896 }, { "epoch": 0.005962093844819392, "grad_norm": 1.4353415966033936, "learning_rate": 2.4115552220404765e-05, "loss": 0.6851, "step": 897 }, { "epoch": 0.005968740549217184, "grad_norm": 2.0517685413360596, "learning_rate": 2.4014196809525102e-05, "loss": 1.0993, "step": 898 }, { "epoch": 0.005975387253614977, "grad_norm": 2.061101198196411, "learning_rate": 2.391298748307544e-05, "loss": 0.6542, "step": 899 }, { "epoch": 0.005982033958012768, "grad_norm": 2.91339373588562, "learning_rate": 2.3811924810024384e-05, "loss": 0.7534, "step": 900 }, { "epoch": 0.00598868066241056, "grad_norm": 0.3465811312198639, "learning_rate": 2.3711009358516102e-05, "loss": 0.9504, "step": 901 }, { "epoch": 0.005995327366808352, "grad_norm": 0.5000481605529785, "learning_rate": 2.3610241695867163e-05, "loss": 0.8834, "step": 902 }, { "epoch": 0.0060019740712061445, "grad_norm": 0.871720552444458, "learning_rate": 2.3509622388563313e-05, "loss": 0.6535, "step": 903 }, { "epoch": 0.006008620775603936, "grad_norm": 0.9901016354560852, "learning_rate": 2.3409152002256247e-05, "loss": 1.0592, "step": 904 }, { "epoch": 0.006015267480001728, "grad_norm": 0.6650891304016113, "learning_rate": 2.3308831101760486e-05, "loss": 0.9605, "step": 905 }, { "epoch": 0.00602191418439952, "grad_norm": 0.6247522830963135, "learning_rate": 2.3208660251050158e-05, "loss": 0.8668, "step": 906 }, { "epoch": 0.006028560888797312, "grad_norm": 0.8756898641586304, "learning_rate": 2.3108640013255968e-05, "loss": 0.9702, "step": 907 }, { "epoch": 0.006035207593195104, "grad_norm": 0.618049144744873, "learning_rate": 2.30087709506618e-05, "loss": 0.7552, "step": 908 }, { "epoch": 0.006041854297592896, "grad_norm": 0.8775025010108948, "learning_rate": 2.2909053624701715e-05, "loss": 0.7956, "step": 909 }, { "epoch": 0.006048501001990688, "grad_norm": 0.6528658866882324, "learning_rate": 2.2809488595956745e-05, "loss": 0.7783, "step": 910 }, { "epoch": 0.00605514770638848, "grad_norm": 0.5745322704315186, "learning_rate": 2.2710076424151788e-05, "loss": 0.9813, "step": 911 }, { "epoch": 0.006061794410786272, "grad_norm": 0.8731324076652527, "learning_rate": 2.2610817668152413e-05, "loss": 0.8946, "step": 912 }, { "epoch": 0.006068441115184064, "grad_norm": 0.7682494521141052, "learning_rate": 2.251171288596171e-05, "loss": 0.9534, "step": 913 }, { "epoch": 0.006075087819581856, "grad_norm": 0.7490772008895874, "learning_rate": 2.241276263471716e-05, "loss": 0.8044, "step": 914 }, { "epoch": 0.006081734523979648, "grad_norm": 0.7320804595947266, "learning_rate": 2.2313967470687593e-05, "loss": 0.7225, "step": 915 }, { "epoch": 0.00608838122837744, "grad_norm": 0.8209467530250549, "learning_rate": 2.2215327949269886e-05, "loss": 0.773, "step": 916 }, { "epoch": 0.0060950279327752314, "grad_norm": 0.7817612290382385, "learning_rate": 2.2116844624986037e-05, "loss": 0.8568, "step": 917 }, { "epoch": 0.006101674637173024, "grad_norm": 0.7729702591896057, "learning_rate": 2.201851805147986e-05, "loss": 0.9524, "step": 918 }, { "epoch": 0.006108321341570816, "grad_norm": 0.8721408247947693, "learning_rate": 2.192034878151404e-05, "loss": 0.9534, "step": 919 }, { "epoch": 0.0061149680459686075, "grad_norm": 0.7396866083145142, "learning_rate": 2.1822337366966898e-05, "loss": 0.8427, "step": 920 }, { "epoch": 0.006121614750366399, "grad_norm": 0.8488925695419312, "learning_rate": 2.1724484358829338e-05, "loss": 0.8558, "step": 921 }, { "epoch": 0.006128261454764192, "grad_norm": 0.8868795037269592, "learning_rate": 2.1626790307201778e-05, "loss": 0.6768, "step": 922 }, { "epoch": 0.006134908159161984, "grad_norm": 0.9627169966697693, "learning_rate": 2.1529255761291052e-05, "loss": 0.6102, "step": 923 }, { "epoch": 0.006141554863559775, "grad_norm": 0.9614560604095459, "learning_rate": 2.1431881269407233e-05, "loss": 0.7419, "step": 924 }, { "epoch": 0.006148201567957567, "grad_norm": 0.9442191123962402, "learning_rate": 2.1334667378960644e-05, "loss": 0.9619, "step": 925 }, { "epoch": 0.00615484827235536, "grad_norm": 0.8390161991119385, "learning_rate": 2.123761463645878e-05, "loss": 0.8088, "step": 926 }, { "epoch": 0.0061614949767531514, "grad_norm": 0.9244080781936646, "learning_rate": 2.114072358750321e-05, "loss": 0.875, "step": 927 }, { "epoch": 0.006168141681150943, "grad_norm": 0.9828105568885803, "learning_rate": 2.1043994776786473e-05, "loss": 0.7579, "step": 928 }, { "epoch": 0.006174788385548735, "grad_norm": 0.8843576908111572, "learning_rate": 2.0947428748089066e-05, "loss": 0.6549, "step": 929 }, { "epoch": 0.0061814350899465275, "grad_norm": 1.1693994998931885, "learning_rate": 2.0851026044276406e-05, "loss": 0.9206, "step": 930 }, { "epoch": 0.006188081794344319, "grad_norm": 1.1045140027999878, "learning_rate": 2.0754787207295696e-05, "loss": 0.8276, "step": 931 }, { "epoch": 0.006194728498742111, "grad_norm": 0.9659402966499329, "learning_rate": 2.0658712778172982e-05, "loss": 0.8977, "step": 932 }, { "epoch": 0.006201375203139903, "grad_norm": 0.9211580157279968, "learning_rate": 2.0562803297009996e-05, "loss": 0.7761, "step": 933 }, { "epoch": 0.006208021907537695, "grad_norm": 1.2015657424926758, "learning_rate": 2.0467059302981263e-05, "loss": 0.8824, "step": 934 }, { "epoch": 0.006214668611935487, "grad_norm": 1.3020975589752197, "learning_rate": 2.0371481334330912e-05, "loss": 0.9115, "step": 935 }, { "epoch": 0.006221315316333279, "grad_norm": 0.8667546510696411, "learning_rate": 2.0276069928369746e-05, "loss": 0.5838, "step": 936 }, { "epoch": 0.006227962020731071, "grad_norm": 1.300489068031311, "learning_rate": 2.0180825621472227e-05, "loss": 0.6866, "step": 937 }, { "epoch": 0.006234608725128863, "grad_norm": 1.0658702850341797, "learning_rate": 2.0085748949073446e-05, "loss": 0.795, "step": 938 }, { "epoch": 0.006241255429526655, "grad_norm": 1.0691542625427246, "learning_rate": 1.9990840445666054e-05, "loss": 0.88, "step": 939 }, { "epoch": 0.006247902133924447, "grad_norm": 1.6725701093673706, "learning_rate": 1.9896100644797317e-05, "loss": 1.0337, "step": 940 }, { "epoch": 0.006254548838322239, "grad_norm": 1.0296176671981812, "learning_rate": 1.9801530079066137e-05, "loss": 0.4251, "step": 941 }, { "epoch": 0.006261195542720031, "grad_norm": 1.954781174659729, "learning_rate": 1.9707129280120036e-05, "loss": 0.7169, "step": 942 }, { "epoch": 0.006267842247117823, "grad_norm": 1.4644161462783813, "learning_rate": 1.96128987786521e-05, "loss": 0.49, "step": 943 }, { "epoch": 0.0062744889515156145, "grad_norm": 1.8161664009094238, "learning_rate": 1.9518839104398075e-05, "loss": 0.9943, "step": 944 }, { "epoch": 0.006281135655913407, "grad_norm": 1.5679110288619995, "learning_rate": 1.942495078613341e-05, "loss": 0.8934, "step": 945 }, { "epoch": 0.006287782360311199, "grad_norm": 1.837007999420166, "learning_rate": 1.933123435167018e-05, "loss": 0.8163, "step": 946 }, { "epoch": 0.006294429064708991, "grad_norm": 1.9493184089660645, "learning_rate": 1.9237690327854236e-05, "loss": 0.9895, "step": 947 }, { "epoch": 0.006301075769106782, "grad_norm": 1.4512784481048584, "learning_rate": 1.9144319240562125e-05, "loss": 0.6742, "step": 948 }, { "epoch": 0.006307722473504575, "grad_norm": 1.9372994899749756, "learning_rate": 1.9051121614698276e-05, "loss": 0.6534, "step": 949 }, { "epoch": 0.006314369177902367, "grad_norm": 4.261740207672119, "learning_rate": 1.8958097974191907e-05, "loss": 0.7541, "step": 950 }, { "epoch": 0.006321015882300158, "grad_norm": 0.39013972878456116, "learning_rate": 1.8865248841994143e-05, "loss": 0.7921, "step": 951 }, { "epoch": 0.00632766258669795, "grad_norm": 0.5134124755859375, "learning_rate": 1.877257474007511e-05, "loss": 0.961, "step": 952 }, { "epoch": 0.006334309291095743, "grad_norm": 0.9366634488105774, "learning_rate": 1.8680076189420964e-05, "loss": 0.8541, "step": 953 }, { "epoch": 0.0063409559954935345, "grad_norm": 0.532407820224762, "learning_rate": 1.858775371003094e-05, "loss": 1.0254, "step": 954 }, { "epoch": 0.006347602699891326, "grad_norm": 0.7640518546104431, "learning_rate": 1.849560782091445e-05, "loss": 0.9868, "step": 955 }, { "epoch": 0.006354249404289118, "grad_norm": 0.49766799807548523, "learning_rate": 1.840363904008819e-05, "loss": 0.8374, "step": 956 }, { "epoch": 0.006360896108686911, "grad_norm": 0.7397210597991943, "learning_rate": 1.831184788457321e-05, "loss": 0.8034, "step": 957 }, { "epoch": 0.006367542813084702, "grad_norm": 0.5797079801559448, "learning_rate": 1.8220234870391983e-05, "loss": 1.059, "step": 958 }, { "epoch": 0.006374189517482494, "grad_norm": 0.634558379650116, "learning_rate": 1.8128800512565513e-05, "loss": 0.8126, "step": 959 }, { "epoch": 0.006380836221880286, "grad_norm": 0.604607343673706, "learning_rate": 1.8037545325110504e-05, "loss": 0.9754, "step": 960 }, { "epoch": 0.006387482926278078, "grad_norm": 0.6816580295562744, "learning_rate": 1.7946469821036354e-05, "loss": 0.7716, "step": 961 }, { "epoch": 0.00639412963067587, "grad_norm": 0.6408112049102783, "learning_rate": 1.7855574512342397e-05, "loss": 0.6089, "step": 962 }, { "epoch": 0.006400776335073662, "grad_norm": 0.609083890914917, "learning_rate": 1.7764859910014893e-05, "loss": 0.9005, "step": 963 }, { "epoch": 0.006407423039471454, "grad_norm": 1.0609853267669678, "learning_rate": 1.7674326524024298e-05, "loss": 0.764, "step": 964 }, { "epoch": 0.006414069743869246, "grad_norm": 0.5911771059036255, "learning_rate": 1.7583974863322274e-05, "loss": 1.0236, "step": 965 }, { "epoch": 0.006420716448267038, "grad_norm": 0.6928749084472656, "learning_rate": 1.7493805435838874e-05, "loss": 0.7597, "step": 966 }, { "epoch": 0.00642736315266483, "grad_norm": 0.8709490299224854, "learning_rate": 1.7403818748479715e-05, "loss": 0.6517, "step": 967 }, { "epoch": 0.006434009857062622, "grad_norm": 0.6218889355659485, "learning_rate": 1.7314015307123133e-05, "loss": 0.9283, "step": 968 }, { "epoch": 0.006440656561460414, "grad_norm": 0.7528246641159058, "learning_rate": 1.7224395616617235e-05, "loss": 0.7588, "step": 969 }, { "epoch": 0.006447303265858206, "grad_norm": 0.7978760004043579, "learning_rate": 1.713496018077717e-05, "loss": 0.6017, "step": 970 }, { "epoch": 0.0064539499702559976, "grad_norm": 0.8703947067260742, "learning_rate": 1.7045709502382285e-05, "loss": 0.8674, "step": 971 }, { "epoch": 0.00646059667465379, "grad_norm": 0.8785549402236938, "learning_rate": 1.6956644083173262e-05, "loss": 0.6892, "step": 972 }, { "epoch": 0.006467243379051582, "grad_norm": 0.9214485883712769, "learning_rate": 1.6867764423849296e-05, "loss": 0.6507, "step": 973 }, { "epoch": 0.006473890083449374, "grad_norm": 0.7563501000404358, "learning_rate": 1.677907102406529e-05, "loss": 0.7163, "step": 974 }, { "epoch": 0.006480536787847165, "grad_norm": 0.9271652102470398, "learning_rate": 1.66905643824291e-05, "loss": 0.8399, "step": 975 }, { "epoch": 0.006487183492244958, "grad_norm": 0.906379759311676, "learning_rate": 1.660224499649861e-05, "loss": 0.6255, "step": 976 }, { "epoch": 0.00649383019664275, "grad_norm": 0.7881162762641907, "learning_rate": 1.6514113362779092e-05, "loss": 0.7797, "step": 977 }, { "epoch": 0.0065004769010405415, "grad_norm": 0.9204055666923523, "learning_rate": 1.6426169976720245e-05, "loss": 0.8043, "step": 978 }, { "epoch": 0.006507123605438333, "grad_norm": 0.9636942148208618, "learning_rate": 1.6338415332713576e-05, "loss": 0.8895, "step": 979 }, { "epoch": 0.006513770309836126, "grad_norm": 0.8163102269172668, "learning_rate": 1.6250849924089484e-05, "loss": 0.9835, "step": 980 }, { "epoch": 0.006520417014233918, "grad_norm": 1.0948930978775024, "learning_rate": 1.616347424311455e-05, "loss": 0.58, "step": 981 }, { "epoch": 0.006527063718631709, "grad_norm": 1.122653603553772, "learning_rate": 1.607628878098879e-05, "loss": 0.8378, "step": 982 }, { "epoch": 0.006533710423029501, "grad_norm": 0.9559472799301147, "learning_rate": 1.5989294027842867e-05, "loss": 0.6431, "step": 983 }, { "epoch": 0.006540357127427294, "grad_norm": 1.3170665502548218, "learning_rate": 1.59024904727353e-05, "loss": 0.4975, "step": 984 }, { "epoch": 0.006547003831825085, "grad_norm": 1.3573777675628662, "learning_rate": 1.581587860364977e-05, "loss": 0.7293, "step": 985 }, { "epoch": 0.006553650536222877, "grad_norm": 0.9671066403388977, "learning_rate": 1.5729458907492327e-05, "loss": 0.9002, "step": 986 }, { "epoch": 0.006560297240620669, "grad_norm": 1.218948245048523, "learning_rate": 1.5643231870088772e-05, "loss": 0.8514, "step": 987 }, { "epoch": 0.0065669439450184615, "grad_norm": 1.1144795417785645, "learning_rate": 1.5557197976181744e-05, "loss": 0.8713, "step": 988 }, { "epoch": 0.006573590649416253, "grad_norm": 1.354432463645935, "learning_rate": 1.5471357709428093e-05, "loss": 0.9549, "step": 989 }, { "epoch": 0.006580237353814045, "grad_norm": 1.4001054763793945, "learning_rate": 1.5385711552396227e-05, "loss": 1.1086, "step": 990 }, { "epoch": 0.006586884058211837, "grad_norm": 1.5661379098892212, "learning_rate": 1.5300259986563232e-05, "loss": 0.8944, "step": 991 }, { "epoch": 0.006593530762609629, "grad_norm": 1.2172316312789917, "learning_rate": 1.5215003492312351e-05, "loss": 1.0221, "step": 992 }, { "epoch": 0.006600177467007421, "grad_norm": 1.4772396087646484, "learning_rate": 1.5129942548930115e-05, "loss": 0.4565, "step": 993 }, { "epoch": 0.006606824171405213, "grad_norm": 1.6136627197265625, "learning_rate": 1.5045077634603787e-05, "loss": 0.9532, "step": 994 }, { "epoch": 0.006613470875803005, "grad_norm": 1.4357903003692627, "learning_rate": 1.4960409226418576e-05, "loss": 0.5671, "step": 995 }, { "epoch": 0.006620117580200797, "grad_norm": 1.6323310136795044, "learning_rate": 1.4875937800354988e-05, "loss": 0.8964, "step": 996 }, { "epoch": 0.006626764284598589, "grad_norm": 1.6246354579925537, "learning_rate": 1.4791663831286184e-05, "loss": 0.9035, "step": 997 }, { "epoch": 0.006633410988996381, "grad_norm": 2.0185391902923584, "learning_rate": 1.470758779297528e-05, "loss": 0.9129, "step": 998 }, { "epoch": 0.006640057693394173, "grad_norm": 1.7859017848968506, "learning_rate": 1.4623710158072663e-05, "loss": 0.772, "step": 999 }, { "epoch": 0.006646704397791965, "grad_norm": 2.8134305477142334, "learning_rate": 1.4540031398113335e-05, "loss": 0.7747, "step": 1000 }, { "epoch": 0.006653351102189757, "grad_norm": 0.31304219365119934, "learning_rate": 1.4456551983514333e-05, "loss": 0.9119, "step": 1001 }, { "epoch": 0.0066599978065875485, "grad_norm": 0.6188201308250427, "learning_rate": 1.4373272383572028e-05, "loss": 0.7513, "step": 1002 }, { "epoch": 0.006666644510985341, "grad_norm": 0.6836460828781128, "learning_rate": 1.4290193066459457e-05, "loss": 0.944, "step": 1003 }, { "epoch": 0.006673291215383133, "grad_norm": 0.6879714131355286, "learning_rate": 1.4207314499223745e-05, "loss": 0.7964, "step": 1004 }, { "epoch": 0.0066799379197809245, "grad_norm": 0.5281482934951782, "learning_rate": 1.4124637147783432e-05, "loss": 0.8331, "step": 1005 }, { "epoch": 0.006686584624178716, "grad_norm": 0.5473004579544067, "learning_rate": 1.404216147692598e-05, "loss": 0.9916, "step": 1006 }, { "epoch": 0.006693231328576509, "grad_norm": 0.6183127760887146, "learning_rate": 1.395988795030495e-05, "loss": 0.6854, "step": 1007 }, { "epoch": 0.006699878032974301, "grad_norm": 0.5756447911262512, "learning_rate": 1.3877817030437568e-05, "loss": 0.7835, "step": 1008 }, { "epoch": 0.006706524737372092, "grad_norm": 0.5753706693649292, "learning_rate": 1.3795949178702033e-05, "loss": 1.0322, "step": 1009 }, { "epoch": 0.006713171441769884, "grad_norm": 0.6262528300285339, "learning_rate": 1.371428485533498e-05, "loss": 0.7982, "step": 1010 }, { "epoch": 0.006719818146167677, "grad_norm": 0.6056203246116638, "learning_rate": 1.3632824519428889e-05, "loss": 0.9525, "step": 1011 }, { "epoch": 0.0067264648505654685, "grad_norm": 0.6044523119926453, "learning_rate": 1.3551568628929434e-05, "loss": 0.8489, "step": 1012 }, { "epoch": 0.00673311155496326, "grad_norm": 1.2201472520828247, "learning_rate": 1.3470517640632969e-05, "loss": 1.0784, "step": 1013 }, { "epoch": 0.006739758259361052, "grad_norm": 0.5861047506332397, "learning_rate": 1.3389672010183984e-05, "loss": 0.8827, "step": 1014 }, { "epoch": 0.0067464049637588445, "grad_norm": 0.8151621222496033, "learning_rate": 1.3309032192072463e-05, "loss": 0.9368, "step": 1015 }, { "epoch": 0.006753051668156636, "grad_norm": 0.6700717806816101, "learning_rate": 1.3228598639631418e-05, "loss": 0.8357, "step": 1016 }, { "epoch": 0.006759698372554428, "grad_norm": 0.9705390334129333, "learning_rate": 1.3148371805034253e-05, "loss": 0.8453, "step": 1017 }, { "epoch": 0.00676634507695222, "grad_norm": 0.7800853252410889, "learning_rate": 1.3068352139292312e-05, "loss": 0.728, "step": 1018 }, { "epoch": 0.006772991781350012, "grad_norm": 0.7995185852050781, "learning_rate": 1.2988540092252266e-05, "loss": 0.9883, "step": 1019 }, { "epoch": 0.006779638485747804, "grad_norm": 0.7061491012573242, "learning_rate": 1.29089361125936e-05, "loss": 0.7797, "step": 1020 }, { "epoch": 0.006786285190145596, "grad_norm": 0.926814079284668, "learning_rate": 1.2829540647826149e-05, "loss": 0.7926, "step": 1021 }, { "epoch": 0.006792931894543388, "grad_norm": 0.8915033340454102, "learning_rate": 1.275035414428753e-05, "loss": 0.9411, "step": 1022 }, { "epoch": 0.00679957859894118, "grad_norm": 1.2160683870315552, "learning_rate": 1.2671377047140625e-05, "loss": 0.8545, "step": 1023 }, { "epoch": 0.006806225303338972, "grad_norm": 0.8952360153198242, "learning_rate": 1.2592609800371086e-05, "loss": 0.9481, "step": 1024 }, { "epoch": 0.006812872007736764, "grad_norm": 0.7336769700050354, "learning_rate": 1.251405284678488e-05, "loss": 0.7532, "step": 1025 }, { "epoch": 0.006819518712134556, "grad_norm": 0.9183105230331421, "learning_rate": 1.2435706628005767e-05, "loss": 0.8165, "step": 1026 }, { "epoch": 0.006826165416532348, "grad_norm": 0.7961068153381348, "learning_rate": 1.2357571584472794e-05, "loss": 0.8613, "step": 1027 }, { "epoch": 0.00683281212093014, "grad_norm": 0.920566737651825, "learning_rate": 1.227964815543784e-05, "loss": 0.9217, "step": 1028 }, { "epoch": 0.0068394588253279315, "grad_norm": 0.9314897656440735, "learning_rate": 1.2201936778963192e-05, "loss": 0.7957, "step": 1029 }, { "epoch": 0.006846105529725724, "grad_norm": 1.002401351928711, "learning_rate": 1.2124437891918993e-05, "loss": 0.8704, "step": 1030 }, { "epoch": 0.006852752234123516, "grad_norm": 1.2262808084487915, "learning_rate": 1.2047151929980888e-05, "loss": 0.8852, "step": 1031 }, { "epoch": 0.006859398938521308, "grad_norm": 1.0216968059539795, "learning_rate": 1.1970079327627453e-05, "loss": 0.7399, "step": 1032 }, { "epoch": 0.006866045642919099, "grad_norm": 1.1232874393463135, "learning_rate": 1.1893220518137882e-05, "loss": 0.7159, "step": 1033 }, { "epoch": 0.006872692347316892, "grad_norm": 1.0752450227737427, "learning_rate": 1.1816575933589458e-05, "loss": 0.6177, "step": 1034 }, { "epoch": 0.006879339051714684, "grad_norm": 0.911821722984314, "learning_rate": 1.174014600485514e-05, "loss": 0.7266, "step": 1035 }, { "epoch": 0.006885985756112475, "grad_norm": 1.2482151985168457, "learning_rate": 1.1663931161601188e-05, "loss": 0.8125, "step": 1036 }, { "epoch": 0.006892632460510267, "grad_norm": 1.0357885360717773, "learning_rate": 1.1587931832284726e-05, "loss": 0.7536, "step": 1037 }, { "epoch": 0.00689927916490806, "grad_norm": 1.0621405839920044, "learning_rate": 1.1512148444151283e-05, "loss": 0.6836, "step": 1038 }, { "epoch": 0.0069059258693058515, "grad_norm": 1.1723105907440186, "learning_rate": 1.1436581423232434e-05, "loss": 0.6806, "step": 1039 }, { "epoch": 0.006912572573703643, "grad_norm": 1.1845663785934448, "learning_rate": 1.1361231194343436e-05, "loss": 0.815, "step": 1040 }, { "epoch": 0.006919219278101435, "grad_norm": 1.6141482591629028, "learning_rate": 1.1286098181080795e-05, "loss": 0.9749, "step": 1041 }, { "epoch": 0.006925865982499228, "grad_norm": 1.34913969039917, "learning_rate": 1.1211182805819881e-05, "loss": 0.6698, "step": 1042 }, { "epoch": 0.006932512686897019, "grad_norm": 1.377968668937683, "learning_rate": 1.1136485489712556e-05, "loss": 0.5934, "step": 1043 }, { "epoch": 0.006939159391294811, "grad_norm": 1.7394827604293823, "learning_rate": 1.1062006652684864e-05, "loss": 0.7956, "step": 1044 }, { "epoch": 0.006945806095692603, "grad_norm": 1.4188005924224854, "learning_rate": 1.0987746713434576e-05, "loss": 0.7873, "step": 1045 }, { "epoch": 0.0069524528000903954, "grad_norm": 1.7469269037246704, "learning_rate": 1.0913706089428932e-05, "loss": 1.0878, "step": 1046 }, { "epoch": 0.006959099504488187, "grad_norm": 2.151268482208252, "learning_rate": 1.0839885196902194e-05, "loss": 1.1574, "step": 1047 }, { "epoch": 0.006965746208885979, "grad_norm": 2.0133984088897705, "learning_rate": 1.0766284450853415e-05, "loss": 0.716, "step": 1048 }, { "epoch": 0.006972392913283771, "grad_norm": 3.2061398029327393, "learning_rate": 1.0692904265044012e-05, "loss": 0.915, "step": 1049 }, { "epoch": 0.006979039617681563, "grad_norm": 2.7918288707733154, "learning_rate": 1.0619745051995472e-05, "loss": 1.219, "step": 1050 }, { "epoch": 0.006985686322079355, "grad_norm": 0.3655790686607361, "learning_rate": 1.0546807222987071e-05, "loss": 1.0736, "step": 1051 }, { "epoch": 0.006992333026477147, "grad_norm": 0.4837401509284973, "learning_rate": 1.047409118805353e-05, "loss": 0.9356, "step": 1052 }, { "epoch": 0.006998979730874939, "grad_norm": 0.45560112595558167, "learning_rate": 1.0401597355982678e-05, "loss": 0.8369, "step": 1053 }, { "epoch": 0.007005626435272731, "grad_norm": 0.5554627180099487, "learning_rate": 1.03293261343132e-05, "loss": 0.8466, "step": 1054 }, { "epoch": 0.007012273139670523, "grad_norm": 0.7247781753540039, "learning_rate": 1.0257277929332332e-05, "loss": 1.0188, "step": 1055 }, { "epoch": 0.007018919844068315, "grad_norm": 0.5876094102859497, "learning_rate": 1.0185453146073604e-05, "loss": 0.8758, "step": 1056 }, { "epoch": 0.007025566548466107, "grad_norm": 0.5293012857437134, "learning_rate": 1.0113852188314493e-05, "loss": 0.8288, "step": 1057 }, { "epoch": 0.007032213252863899, "grad_norm": 0.5751774311065674, "learning_rate": 1.0042475458574202e-05, "loss": 0.8582, "step": 1058 }, { "epoch": 0.007038859957261691, "grad_norm": 0.7522995471954346, "learning_rate": 9.971323358111434e-06, "loss": 0.7952, "step": 1059 }, { "epoch": 0.007045506661659482, "grad_norm": 0.6282753348350525, "learning_rate": 9.900396286922026e-06, "loss": 0.7768, "step": 1060 }, { "epoch": 0.007052153366057275, "grad_norm": 0.6144838333129883, "learning_rate": 9.829694643736836e-06, "loss": 0.7815, "step": 1061 }, { "epoch": 0.007058800070455067, "grad_norm": 0.6692332029342651, "learning_rate": 9.759218826019378e-06, "loss": 0.7859, "step": 1062 }, { "epoch": 0.0070654467748528585, "grad_norm": 0.8164696097373962, "learning_rate": 9.688969229963685e-06, "loss": 0.8053, "step": 1063 }, { "epoch": 0.00707209347925065, "grad_norm": 0.7802616953849792, "learning_rate": 9.618946250492011e-06, "loss": 0.93, "step": 1064 }, { "epoch": 0.007078740183648443, "grad_norm": 0.7973951697349548, "learning_rate": 9.549150281252633e-06, "loss": 0.931, "step": 1065 }, { "epoch": 0.007085386888046235, "grad_norm": 0.6746591925621033, "learning_rate": 9.479581714617668e-06, "loss": 0.9751, "step": 1066 }, { "epoch": 0.007092033592444026, "grad_norm": 0.8076299428939819, "learning_rate": 9.410240941680859e-06, "loss": 0.7754, "step": 1067 }, { "epoch": 0.007098680296841818, "grad_norm": 2.197793483734131, "learning_rate": 9.341128352255313e-06, "loss": 0.8414, "step": 1068 }, { "epoch": 0.007105327001239611, "grad_norm": 0.7308064699172974, "learning_rate": 9.272244334871377e-06, "loss": 0.7137, "step": 1069 }, { "epoch": 0.007111973705637402, "grad_norm": 0.8364386558532715, "learning_rate": 9.203589276774439e-06, "loss": 0.858, "step": 1070 }, { "epoch": 0.007118620410035194, "grad_norm": 0.8338163495063782, "learning_rate": 9.135163563922766e-06, "loss": 0.709, "step": 1071 }, { "epoch": 0.007125267114432986, "grad_norm": 0.7769277691841125, "learning_rate": 9.06696758098528e-06, "loss": 0.7744, "step": 1072 }, { "epoch": 0.0071319138188307785, "grad_norm": 0.8195592164993286, "learning_rate": 8.999001711339434e-06, "loss": 0.8514, "step": 1073 }, { "epoch": 0.00713856052322857, "grad_norm": 0.7981391549110413, "learning_rate": 8.931266337069083e-06, "loss": 0.7724, "step": 1074 }, { "epoch": 0.007145207227626362, "grad_norm": 1.2387161254882812, "learning_rate": 8.86376183896226e-06, "loss": 0.6626, "step": 1075 }, { "epoch": 0.007151853932024154, "grad_norm": 1.1157495975494385, "learning_rate": 8.796488596509133e-06, "loss": 0.643, "step": 1076 }, { "epoch": 0.007158500636421946, "grad_norm": 0.8519596457481384, "learning_rate": 8.72944698789977e-06, "loss": 0.6917, "step": 1077 }, { "epoch": 0.007165147340819738, "grad_norm": 1.1191961765289307, "learning_rate": 8.66263739002211e-06, "loss": 0.659, "step": 1078 }, { "epoch": 0.00717179404521753, "grad_norm": 0.961754322052002, "learning_rate": 8.596060178459758e-06, "loss": 0.9327, "step": 1079 }, { "epoch": 0.007178440749615322, "grad_norm": 1.1048482656478882, "learning_rate": 8.529715727489912e-06, "loss": 0.8056, "step": 1080 }, { "epoch": 0.007185087454013114, "grad_norm": 0.9219740033149719, "learning_rate": 8.463604410081293e-06, "loss": 0.6779, "step": 1081 }, { "epoch": 0.007191734158410906, "grad_norm": 1.114214539527893, "learning_rate": 8.397726597892008e-06, "loss": 0.9183, "step": 1082 }, { "epoch": 0.007198380862808698, "grad_norm": 0.9252001643180847, "learning_rate": 8.332082661267443e-06, "loss": 0.5787, "step": 1083 }, { "epoch": 0.00720502756720649, "grad_norm": 1.4290266036987305, "learning_rate": 8.266672969238216e-06, "loss": 0.7914, "step": 1084 }, { "epoch": 0.007211674271604282, "grad_norm": 1.0707231760025024, "learning_rate": 8.201497889518073e-06, "loss": 0.6836, "step": 1085 }, { "epoch": 0.007218320976002074, "grad_norm": 1.3955053091049194, "learning_rate": 8.136557788501903e-06, "loss": 0.5338, "step": 1086 }, { "epoch": 0.0072249676803998655, "grad_norm": 1.097495675086975, "learning_rate": 8.071853031263554e-06, "loss": 0.7974, "step": 1087 }, { "epoch": 0.007231614384797658, "grad_norm": 1.1199660301208496, "learning_rate": 8.007383981553857e-06, "loss": 0.8519, "step": 1088 }, { "epoch": 0.00723826108919545, "grad_norm": 0.9631024599075317, "learning_rate": 7.943151001798554e-06, "loss": 0.5342, "step": 1089 }, { "epoch": 0.0072449077935932416, "grad_norm": 1.3137375116348267, "learning_rate": 7.879154453096304e-06, "loss": 0.9322, "step": 1090 }, { "epoch": 0.007251554497991033, "grad_norm": 1.220885992050171, "learning_rate": 7.81539469521661e-06, "loss": 0.8128, "step": 1091 }, { "epoch": 0.007258201202388826, "grad_norm": 1.5835233926773071, "learning_rate": 7.751872086597783e-06, "loss": 0.7819, "step": 1092 }, { "epoch": 0.007264847906786618, "grad_norm": 1.3007620573043823, "learning_rate": 7.688586984344992e-06, "loss": 0.8538, "step": 1093 }, { "epoch": 0.007271494611184409, "grad_norm": 1.304919719696045, "learning_rate": 7.6255397442281825e-06, "loss": 0.5072, "step": 1094 }, { "epoch": 0.007278141315582201, "grad_norm": 1.584822654724121, "learning_rate": 7.562730720680112e-06, "loss": 0.983, "step": 1095 }, { "epoch": 0.007284788019979994, "grad_norm": 1.5479097366333008, "learning_rate": 7.500160266794371e-06, "loss": 0.9376, "step": 1096 }, { "epoch": 0.0072914347243777855, "grad_norm": 1.8360904455184937, "learning_rate": 7.437828734323393e-06, "loss": 1.0823, "step": 1097 }, { "epoch": 0.007298081428775577, "grad_norm": 1.8177344799041748, "learning_rate": 7.375736473676442e-06, "loss": 0.704, "step": 1098 }, { "epoch": 0.007304728133173369, "grad_norm": 2.0094518661499023, "learning_rate": 7.3138838339176675e-06, "loss": 0.8205, "step": 1099 }, { "epoch": 0.0073113748375711616, "grad_norm": 3.6242754459381104, "learning_rate": 7.252271162764129e-06, "loss": 1.2131, "step": 1100 }, { "epoch": 0.007318021541968953, "grad_norm": 0.379312127828598, "learning_rate": 7.190898806583929e-06, "loss": 1.0036, "step": 1101 }, { "epoch": 0.007324668246366745, "grad_norm": 0.6471967697143555, "learning_rate": 7.1297671103941035e-06, "loss": 1.1373, "step": 1102 }, { "epoch": 0.007331314950764537, "grad_norm": 0.5435198545455933, "learning_rate": 7.068876417858811e-06, "loss": 0.7672, "step": 1103 }, { "epoch": 0.007337961655162329, "grad_norm": 0.6249924898147583, "learning_rate": 7.008227071287338e-06, "loss": 0.9681, "step": 1104 }, { "epoch": 0.007344608359560121, "grad_norm": 0.8528724312782288, "learning_rate": 6.947819411632223e-06, "loss": 0.848, "step": 1105 }, { "epoch": 0.007351255063957913, "grad_norm": 0.6179990768432617, "learning_rate": 6.887653778487307e-06, "loss": 0.901, "step": 1106 }, { "epoch": 0.007357901768355705, "grad_norm": 0.5665982365608215, "learning_rate": 6.827730510085817e-06, "loss": 0.8572, "step": 1107 }, { "epoch": 0.007364548472753497, "grad_norm": 0.5486406087875366, "learning_rate": 6.7680499432984654e-06, "loss": 0.5723, "step": 1108 }, { "epoch": 0.007371195177151289, "grad_norm": 0.5960477590560913, "learning_rate": 6.708612413631615e-06, "loss": 0.8061, "step": 1109 }, { "epoch": 0.007377841881549081, "grad_norm": 0.7446190714836121, "learning_rate": 6.649418255225298e-06, "loss": 0.8753, "step": 1110 }, { "epoch": 0.007384488585946873, "grad_norm": 0.6754137873649597, "learning_rate": 6.590467800851419e-06, "loss": 0.7474, "step": 1111 }, { "epoch": 0.007391135290344665, "grad_norm": 0.7214033603668213, "learning_rate": 6.531761381911827e-06, "loss": 0.8932, "step": 1112 }, { "epoch": 0.007397781994742457, "grad_norm": 0.6419174075126648, "learning_rate": 6.473299328436499e-06, "loss": 0.7526, "step": 1113 }, { "epoch": 0.0074044286991402485, "grad_norm": 0.6379873156547546, "learning_rate": 6.415081969081649e-06, "loss": 0.7635, "step": 1114 }, { "epoch": 0.007411075403538041, "grad_norm": 0.7169990539550781, "learning_rate": 6.357109631127889e-06, "loss": 0.7471, "step": 1115 }, { "epoch": 0.007417722107935833, "grad_norm": 0.64534592628479, "learning_rate": 6.2993826404783965e-06, "loss": 1.0755, "step": 1116 }, { "epoch": 0.007424368812333625, "grad_norm": 0.6701549291610718, "learning_rate": 6.241901321657112e-06, "loss": 0.9706, "step": 1117 }, { "epoch": 0.007431015516731416, "grad_norm": 0.7817533612251282, "learning_rate": 6.184665997806832e-06, "loss": 1.0468, "step": 1118 }, { "epoch": 0.007437662221129209, "grad_norm": 0.6312013864517212, "learning_rate": 6.127676990687453e-06, "loss": 0.9234, "step": 1119 }, { "epoch": 0.007444308925527001, "grad_norm": 0.7704412937164307, "learning_rate": 6.07093462067419e-06, "loss": 0.8273, "step": 1120 }, { "epoch": 0.0074509556299247924, "grad_norm": 0.9202284216880798, "learning_rate": 6.014439206755706e-06, "loss": 0.6653, "step": 1121 }, { "epoch": 0.007457602334322584, "grad_norm": 0.922299861907959, "learning_rate": 5.958191066532354e-06, "loss": 0.9426, "step": 1122 }, { "epoch": 0.007464249038720377, "grad_norm": 0.8126717209815979, "learning_rate": 5.902190516214384e-06, "loss": 1.0114, "step": 1123 }, { "epoch": 0.0074708957431181685, "grad_norm": 1.1028474569320679, "learning_rate": 5.846437870620192e-06, "loss": 0.891, "step": 1124 }, { "epoch": 0.00747754244751596, "grad_norm": 0.8131560683250427, "learning_rate": 5.79093344317449e-06, "loss": 0.7927, "step": 1125 }, { "epoch": 0.007484189151913752, "grad_norm": 0.8165668845176697, "learning_rate": 5.735677545906626e-06, "loss": 0.7592, "step": 1126 }, { "epoch": 0.007490835856311545, "grad_norm": 0.9261493682861328, "learning_rate": 5.680670489448742e-06, "loss": 0.7585, "step": 1127 }, { "epoch": 0.007497482560709336, "grad_norm": 0.8316916823387146, "learning_rate": 5.625912583034115e-06, "loss": 0.7128, "step": 1128 }, { "epoch": 0.007504129265107128, "grad_norm": 0.9080417156219482, "learning_rate": 5.5714041344953445e-06, "loss": 0.663, "step": 1129 }, { "epoch": 0.00751077596950492, "grad_norm": 0.9159788489341736, "learning_rate": 5.51714545026264e-06, "loss": 0.8107, "step": 1130 }, { "epoch": 0.0075174226739027124, "grad_norm": 1.1240578889846802, "learning_rate": 5.463136835362148e-06, "loss": 0.6353, "step": 1131 }, { "epoch": 0.007524069378300504, "grad_norm": 1.2321254014968872, "learning_rate": 5.409378593414194e-06, "loss": 0.995, "step": 1132 }, { "epoch": 0.007530716082698296, "grad_norm": 0.9713672399520874, "learning_rate": 5.355871026631554e-06, "loss": 0.7501, "step": 1133 }, { "epoch": 0.007537362787096088, "grad_norm": 1.0002257823944092, "learning_rate": 5.302614435817793e-06, "loss": 0.8235, "step": 1134 }, { "epoch": 0.00754400949149388, "grad_norm": 1.2293847799301147, "learning_rate": 5.249609120365578e-06, "loss": 0.731, "step": 1135 }, { "epoch": 0.007550656195891672, "grad_norm": 0.8931113481521606, "learning_rate": 5.196855378254989e-06, "loss": 0.7542, "step": 1136 }, { "epoch": 0.007557302900289464, "grad_norm": 1.4215387105941772, "learning_rate": 5.144353506051797e-06, "loss": 0.751, "step": 1137 }, { "epoch": 0.007563949604687256, "grad_norm": 1.4619780778884888, "learning_rate": 5.0921037989058614e-06, "loss": 0.8514, "step": 1138 }, { "epoch": 0.007570596309085048, "grad_norm": 2.04518723487854, "learning_rate": 5.0401065505494445e-06, "loss": 1.0564, "step": 1139 }, { "epoch": 0.00757724301348284, "grad_norm": 1.1913663148880005, "learning_rate": 4.988362053295564e-06, "loss": 0.8242, "step": 1140 }, { "epoch": 0.007583889717880632, "grad_norm": 1.0685111284255981, "learning_rate": 4.9368705980363415e-06, "loss": 0.5443, "step": 1141 }, { "epoch": 0.007590536422278424, "grad_norm": 1.0553430318832397, "learning_rate": 4.885632474241347e-06, "loss": 0.624, "step": 1142 }, { "epoch": 0.007597183126676216, "grad_norm": 1.1717983484268188, "learning_rate": 4.834647969956052e-06, "loss": 0.7475, "step": 1143 }, { "epoch": 0.007603829831074008, "grad_norm": 1.592492938041687, "learning_rate": 4.783917371800101e-06, "loss": 0.7852, "step": 1144 }, { "epoch": 0.007610476535471799, "grad_norm": 1.7482496500015259, "learning_rate": 4.733440964965791e-06, "loss": 1.0631, "step": 1145 }, { "epoch": 0.007617123239869592, "grad_norm": 1.701235055923462, "learning_rate": 4.683219033216402e-06, "loss": 0.9337, "step": 1146 }, { "epoch": 0.007623769944267384, "grad_norm": 2.070103168487549, "learning_rate": 4.633251858884657e-06, "loss": 0.814, "step": 1147 }, { "epoch": 0.0076304166486651755, "grad_norm": 2.205002784729004, "learning_rate": 4.583539722871094e-06, "loss": 0.7706, "step": 1148 }, { "epoch": 0.007637063353062967, "grad_norm": 2.1740975379943848, "learning_rate": 4.534082904642495e-06, "loss": 0.8495, "step": 1149 }, { "epoch": 0.00764371005746076, "grad_norm": 3.0469541549682617, "learning_rate": 4.484881682230341e-06, "loss": 0.7312, "step": 1150 }, { "epoch": 0.007650356761858552, "grad_norm": 0.3347352147102356, "learning_rate": 4.435936332229229e-06, "loss": 0.8578, "step": 1151 }, { "epoch": 0.007657003466256343, "grad_norm": 0.734880805015564, "learning_rate": 4.3872471297952965e-06, "loss": 0.9773, "step": 1152 }, { "epoch": 0.007663650170654135, "grad_norm": 0.7428154349327087, "learning_rate": 4.3388143486447045e-06, "loss": 0.8733, "step": 1153 }, { "epoch": 0.007670296875051928, "grad_norm": 0.5158607363700867, "learning_rate": 4.290638261052099e-06, "loss": 0.9492, "step": 1154 }, { "epoch": 0.007676943579449719, "grad_norm": 0.5684289932250977, "learning_rate": 4.242719137849077e-06, "loss": 0.8136, "step": 1155 }, { "epoch": 0.007683590283847511, "grad_norm": 0.7612115144729614, "learning_rate": 4.1950572484226345e-06, "loss": 0.914, "step": 1156 }, { "epoch": 0.007690236988245303, "grad_norm": 0.6157774925231934, "learning_rate": 4.147652860713685e-06, "loss": 0.947, "step": 1157 }, { "epoch": 0.0076968836926430955, "grad_norm": 0.7647441029548645, "learning_rate": 4.100506241215562e-06, "loss": 0.8584, "step": 1158 }, { "epoch": 0.007703530397040887, "grad_norm": 0.7151400446891785, "learning_rate": 4.0536176549724806e-06, "loss": 0.6786, "step": 1159 }, { "epoch": 0.007710177101438679, "grad_norm": 0.6203306913375854, "learning_rate": 4.00698736557808e-06, "loss": 0.9418, "step": 1160 }, { "epoch": 0.007716823805836471, "grad_norm": 0.7037209868431091, "learning_rate": 3.960615635173925e-06, "loss": 0.8577, "step": 1161 }, { "epoch": 0.007723470510234263, "grad_norm": 0.6116876006126404, "learning_rate": 3.914502724448061e-06, "loss": 0.9221, "step": 1162 }, { "epoch": 0.007730117214632055, "grad_norm": 0.8219679594039917, "learning_rate": 3.868648892633497e-06, "loss": 0.8342, "step": 1163 }, { "epoch": 0.007736763919029847, "grad_norm": 0.6384825110435486, "learning_rate": 3.823054397506781e-06, "loss": 0.8674, "step": 1164 }, { "epoch": 0.007743410623427639, "grad_norm": 0.6831299662590027, "learning_rate": 3.7777194953865667e-06, "loss": 0.7916, "step": 1165 }, { "epoch": 0.007750057327825431, "grad_norm": 0.7946675419807434, "learning_rate": 3.7326444411321547e-06, "loss": 0.8928, "step": 1166 }, { "epoch": 0.007756704032223223, "grad_norm": 0.6539303064346313, "learning_rate": 3.6878294881420363e-06, "loss": 0.9344, "step": 1167 }, { "epoch": 0.007763350736621015, "grad_norm": 1.1968379020690918, "learning_rate": 3.6432748883524935e-06, "loss": 0.9727, "step": 1168 }, { "epoch": 0.007769997441018807, "grad_norm": 0.6433477401733398, "learning_rate": 3.598980892236198e-06, "loss": 0.6713, "step": 1169 }, { "epoch": 0.007776644145416599, "grad_norm": 0.9618440866470337, "learning_rate": 3.5549477488007854e-06, "loss": 1.0181, "step": 1170 }, { "epoch": 0.007783290849814391, "grad_norm": 0.8468819856643677, "learning_rate": 3.511175705587433e-06, "loss": 0.8858, "step": 1171 }, { "epoch": 0.0077899375542121825, "grad_norm": 0.7299633026123047, "learning_rate": 3.4676650086695016e-06, "loss": 0.7895, "step": 1172 }, { "epoch": 0.007796584258609975, "grad_norm": 0.9545450210571289, "learning_rate": 3.4244159026511566e-06, "loss": 0.8403, "step": 1173 }, { "epoch": 0.007803230963007767, "grad_norm": 0.9891371726989746, "learning_rate": 3.3814286306659502e-06, "loss": 0.9864, "step": 1174 }, { "epoch": 0.0078098776674055586, "grad_norm": 0.7887678146362305, "learning_rate": 3.3387034343755065e-06, "loss": 0.9753, "step": 1175 }, { "epoch": 0.00781652437180335, "grad_norm": 0.9488683938980103, "learning_rate": 3.2962405539681217e-06, "loss": 0.6611, "step": 1176 }, { "epoch": 0.007823171076201143, "grad_norm": 1.0425969362258911, "learning_rate": 3.25404022815744e-06, "loss": 0.7946, "step": 1177 }, { "epoch": 0.007829817780598934, "grad_norm": 1.1102572679519653, "learning_rate": 3.2121026941811015e-06, "loss": 0.4979, "step": 1178 }, { "epoch": 0.007836464484996726, "grad_norm": 0.8967680931091309, "learning_rate": 3.1704281877993903e-06, "loss": 0.8126, "step": 1179 }, { "epoch": 0.007843111189394519, "grad_norm": 0.7562099099159241, "learning_rate": 3.1290169432939553e-06, "loss": 0.7026, "step": 1180 }, { "epoch": 0.00784975789379231, "grad_norm": 0.9412071108818054, "learning_rate": 3.087869193466458e-06, "loss": 0.8846, "step": 1181 }, { "epoch": 0.007856404598190102, "grad_norm": 1.510528564453125, "learning_rate": 3.0469851696372564e-06, "loss": 1.1582, "step": 1182 }, { "epoch": 0.007863051302587895, "grad_norm": 1.3257495164871216, "learning_rate": 3.0063651016441428e-06, "loss": 0.7229, "step": 1183 }, { "epoch": 0.007869698006985686, "grad_norm": 1.0000684261322021, "learning_rate": 2.9660092178409927e-06, "loss": 0.8639, "step": 1184 }, { "epoch": 0.007876344711383479, "grad_norm": 1.1794826984405518, "learning_rate": 2.9259177450965682e-06, "loss": 0.8521, "step": 1185 }, { "epoch": 0.00788299141578127, "grad_norm": 1.2365461587905884, "learning_rate": 2.8860909087931543e-06, "loss": 0.7315, "step": 1186 }, { "epoch": 0.007889638120179062, "grad_norm": 1.2363101243972778, "learning_rate": 2.8465289328253376e-06, "loss": 0.873, "step": 1187 }, { "epoch": 0.007896284824576855, "grad_norm": 1.118465781211853, "learning_rate": 2.8072320395987285e-06, "loss": 0.8294, "step": 1188 }, { "epoch": 0.007902931528974646, "grad_norm": 1.2116254568099976, "learning_rate": 2.7682004500287464e-06, "loss": 0.7468, "step": 1189 }, { "epoch": 0.007909578233372438, "grad_norm": 1.4756786823272705, "learning_rate": 2.7294343835393368e-06, "loss": 0.7626, "step": 1190 }, { "epoch": 0.00791622493777023, "grad_norm": 1.321200966835022, "learning_rate": 2.690934058061756e-06, "loss": 0.9202, "step": 1191 }, { "epoch": 0.007922871642168022, "grad_norm": 1.2931435108184814, "learning_rate": 2.6526996900333277e-06, "loss": 0.7292, "step": 1192 }, { "epoch": 0.007929518346565814, "grad_norm": 1.1711453199386597, "learning_rate": 2.614731494396283e-06, "loss": 0.7659, "step": 1193 }, { "epoch": 0.007936165050963607, "grad_norm": 1.4512163400650024, "learning_rate": 2.577029684596466e-06, "loss": 0.7325, "step": 1194 }, { "epoch": 0.007942811755361398, "grad_norm": 1.4844986200332642, "learning_rate": 2.539594472582213e-06, "loss": 0.7598, "step": 1195 }, { "epoch": 0.00794945845975919, "grad_norm": 1.782923936843872, "learning_rate": 2.5024260688030987e-06, "loss": 0.659, "step": 1196 }, { "epoch": 0.007956105164156981, "grad_norm": 2.0441620349884033, "learning_rate": 2.465524682208814e-06, "loss": 0.9054, "step": 1197 }, { "epoch": 0.007962751868554774, "grad_norm": 2.2920122146606445, "learning_rate": 2.4288905202479283e-06, "loss": 0.9065, "step": 1198 }, { "epoch": 0.007969398572952566, "grad_norm": 2.007251501083374, "learning_rate": 2.3925237888667572e-06, "loss": 0.8426, "step": 1199 }, { "epoch": 0.007976045277350357, "grad_norm": 2.4775331020355225, "learning_rate": 2.3564246925082357e-06, "loss": 0.6222, "step": 1200 }, { "epoch": 0.00798269198174815, "grad_norm": 0.31705576181411743, "learning_rate": 2.320593434110696e-06, "loss": 1.0729, "step": 1201 }, { "epoch": 0.007989338686145942, "grad_norm": 0.6680214405059814, "learning_rate": 2.2850302151067814e-06, "loss": 0.884, "step": 1202 }, { "epoch": 0.007995985390543733, "grad_norm": 0.5763593316078186, "learning_rate": 2.2497352354222902e-06, "loss": 0.7567, "step": 1203 }, { "epoch": 0.008002632094941526, "grad_norm": 0.5514530539512634, "learning_rate": 2.214708693475065e-06, "loss": 0.768, "step": 1204 }, { "epoch": 0.008009278799339317, "grad_norm": 0.6078202128410339, "learning_rate": 2.179950786173879e-06, "loss": 0.839, "step": 1205 }, { "epoch": 0.00801592550373711, "grad_norm": 0.630172610282898, "learning_rate": 2.145461708917312e-06, "loss": 0.8652, "step": 1206 }, { "epoch": 0.008022572208134902, "grad_norm": 0.7208753824234009, "learning_rate": 2.1112416555926497e-06, "loss": 0.9369, "step": 1207 }, { "epoch": 0.008029218912532693, "grad_norm": 0.8084291815757751, "learning_rate": 2.077290818574834e-06, "loss": 0.7762, "step": 1208 }, { "epoch": 0.008035865616930486, "grad_norm": 0.5719032287597656, "learning_rate": 2.043609388725326e-06, "loss": 0.9558, "step": 1209 }, { "epoch": 0.008042512321328278, "grad_norm": 0.6796174645423889, "learning_rate": 2.01019755539108e-06, "loss": 1.0185, "step": 1210 }, { "epoch": 0.008049159025726069, "grad_norm": 0.6527438759803772, "learning_rate": 1.9770555064034467e-06, "loss": 0.7037, "step": 1211 }, { "epoch": 0.008055805730123862, "grad_norm": 0.776106595993042, "learning_rate": 1.944183428077145e-06, "loss": 0.8553, "step": 1212 }, { "epoch": 0.008062452434521653, "grad_norm": 0.5876369476318359, "learning_rate": 1.911581505209176e-06, "loss": 0.9049, "step": 1213 }, { "epoch": 0.008069099138919445, "grad_norm": 0.7080634832382202, "learning_rate": 1.8792499210778191e-06, "loss": 0.8793, "step": 1214 }, { "epoch": 0.008075745843317238, "grad_norm": 0.9152222275733948, "learning_rate": 1.8471888574415951e-06, "loss": 0.6502, "step": 1215 }, { "epoch": 0.008082392547715029, "grad_norm": 0.7787750959396362, "learning_rate": 1.8153984945382452e-06, "loss": 0.8039, "step": 1216 }, { "epoch": 0.008089039252112821, "grad_norm": 0.8649187088012695, "learning_rate": 1.783879011083689e-06, "loss": 0.8385, "step": 1217 }, { "epoch": 0.008095685956510614, "grad_norm": 0.6336601376533508, "learning_rate": 1.7526305842710532e-06, "loss": 0.8588, "step": 1218 }, { "epoch": 0.008102332660908405, "grad_norm": 0.6925593614578247, "learning_rate": 1.7216533897696675e-06, "loss": 0.9039, "step": 1219 }, { "epoch": 0.008108979365306197, "grad_norm": 0.7388715147972107, "learning_rate": 1.6909476017240912e-06, "loss": 0.8403, "step": 1220 }, { "epoch": 0.008115626069703988, "grad_norm": 0.7469501495361328, "learning_rate": 1.6605133927530825e-06, "loss": 0.7552, "step": 1221 }, { "epoch": 0.00812227277410178, "grad_norm": 0.8448594212532043, "learning_rate": 1.6303509339486823e-06, "loss": 0.8702, "step": 1222 }, { "epoch": 0.008128919478499573, "grad_norm": 0.8358093500137329, "learning_rate": 1.6004603948752473e-06, "loss": 0.8922, "step": 1223 }, { "epoch": 0.008135566182897364, "grad_norm": 1.0381255149841309, "learning_rate": 1.5708419435684462e-06, "loss": 0.7692, "step": 1224 }, { "epoch": 0.008142212887295157, "grad_norm": 0.8204221129417419, "learning_rate": 1.5414957465343882e-06, "loss": 0.8635, "step": 1225 }, { "epoch": 0.00814885959169295, "grad_norm": 1.3811793327331543, "learning_rate": 1.512421968748623e-06, "loss": 1.0606, "step": 1226 }, { "epoch": 0.00815550629609074, "grad_norm": 0.7987362742424011, "learning_rate": 1.4836207736552642e-06, "loss": 0.8876, "step": 1227 }, { "epoch": 0.008162153000488533, "grad_norm": 1.040132761001587, "learning_rate": 1.455092323166024e-06, "loss": 0.8985, "step": 1228 }, { "epoch": 0.008168799704886326, "grad_norm": 0.9249858260154724, "learning_rate": 1.4268367776593405e-06, "loss": 0.8616, "step": 1229 }, { "epoch": 0.008175446409284116, "grad_norm": 1.1802650690078735, "learning_rate": 1.3988542959794627e-06, "loss": 0.6785, "step": 1230 }, { "epoch": 0.008182093113681909, "grad_norm": 1.0302826166152954, "learning_rate": 1.3711450354355449e-06, "loss": 0.6362, "step": 1231 }, { "epoch": 0.0081887398180797, "grad_norm": 1.121963381767273, "learning_rate": 1.3437091518007816e-06, "loss": 1.023, "step": 1232 }, { "epoch": 0.008195386522477493, "grad_norm": 1.0118569135665894, "learning_rate": 1.3165467993115244e-06, "loss": 0.7098, "step": 1233 }, { "epoch": 0.008202033226875285, "grad_norm": 1.052306890487671, "learning_rate": 1.2896581306664047e-06, "loss": 0.7376, "step": 1234 }, { "epoch": 0.008208679931273076, "grad_norm": 1.0704679489135742, "learning_rate": 1.2630432970255013e-06, "loss": 1.0065, "step": 1235 }, { "epoch": 0.008215326635670869, "grad_norm": 1.216379165649414, "learning_rate": 1.2367024480094691e-06, "loss": 0.515, "step": 1236 }, { "epoch": 0.008221973340068661, "grad_norm": 1.3317914009094238, "learning_rate": 1.2106357316986838e-06, "loss": 0.7376, "step": 1237 }, { "epoch": 0.008228620044466452, "grad_norm": 1.7680779695510864, "learning_rate": 1.1848432946324594e-06, "loss": 0.9229, "step": 1238 }, { "epoch": 0.008235266748864245, "grad_norm": 1.3019485473632812, "learning_rate": 1.1593252818081658e-06, "loss": 1.0404, "step": 1239 }, { "epoch": 0.008241913453262036, "grad_norm": 1.3333247900009155, "learning_rate": 1.1340818366804729e-06, "loss": 0.6831, "step": 1240 }, { "epoch": 0.008248560157659828, "grad_norm": 1.3237649202346802, "learning_rate": 1.1091131011604804e-06, "loss": 0.7026, "step": 1241 }, { "epoch": 0.00825520686205762, "grad_norm": 1.5985360145568848, "learning_rate": 1.084419215614979e-06, "loss": 0.854, "step": 1242 }, { "epoch": 0.008261853566455412, "grad_norm": 1.1438730955123901, "learning_rate": 1.0600003188656117e-06, "loss": 0.5545, "step": 1243 }, { "epoch": 0.008268500270853204, "grad_norm": 1.2783243656158447, "learning_rate": 1.0358565481881356e-06, "loss": 0.4873, "step": 1244 }, { "epoch": 0.008275146975250997, "grad_norm": 1.4403537511825562, "learning_rate": 1.0119880393116176e-06, "loss": 0.7985, "step": 1245 }, { "epoch": 0.008281793679648788, "grad_norm": 1.6538810729980469, "learning_rate": 9.883949264176962e-07, "loss": 0.8637, "step": 1246 }, { "epoch": 0.00828844038404658, "grad_norm": 1.6638885736465454, "learning_rate": 9.65077342139814e-07, "loss": 0.7477, "step": 1247 }, { "epoch": 0.008295087088444371, "grad_norm": 1.9725651741027832, "learning_rate": 9.420354175624591e-07, "loss": 0.9379, "step": 1248 }, { "epoch": 0.008301733792842164, "grad_norm": 2.747512102127075, "learning_rate": 9.19269282220464e-07, "loss": 0.8132, "step": 1249 }, { "epoch": 0.008308380497239956, "grad_norm": 3.339989423751831, "learning_rate": 8.967790640982465e-07, "loss": 0.9292, "step": 1250 }, { "epoch": 0.008315027201637747, "grad_norm": 0.36648422479629517, "learning_rate": 8.745648896290981e-07, "loss": 0.9416, "step": 1251 }, { "epoch": 0.00832167390603554, "grad_norm": 0.429190456867218, "learning_rate": 8.52626883694474e-07, "loss": 0.9262, "step": 1252 }, { "epoch": 0.008328320610433333, "grad_norm": 0.6208512187004089, "learning_rate": 8.309651696233045e-07, "loss": 0.9098, "step": 1253 }, { "epoch": 0.008334967314831123, "grad_norm": 0.5772883892059326, "learning_rate": 8.095798691912737e-07, "loss": 0.8938, "step": 1254 }, { "epoch": 0.008341614019228916, "grad_norm": 0.6024505496025085, "learning_rate": 7.884711026201585e-07, "loss": 0.6757, "step": 1255 }, { "epoch": 0.008348260723626709, "grad_norm": 0.47945520281791687, "learning_rate": 7.676389885771518e-07, "loss": 0.769, "step": 1256 }, { "epoch": 0.0083549074280245, "grad_norm": 0.5948473215103149, "learning_rate": 7.470836441741736e-07, "loss": 0.8862, "step": 1257 }, { "epoch": 0.008361554132422292, "grad_norm": 0.5873749852180481, "learning_rate": 7.268051849672441e-07, "loss": 0.8728, "step": 1258 }, { "epoch": 0.008368200836820083, "grad_norm": 0.5595722794532776, "learning_rate": 7.068037249557957e-07, "loss": 0.8866, "step": 1259 }, { "epoch": 0.008374847541217876, "grad_norm": 0.6838152408599854, "learning_rate": 6.870793765820782e-07, "loss": 0.7628, "step": 1260 }, { "epoch": 0.008381494245615668, "grad_norm": 0.6558036804199219, "learning_rate": 6.676322507304877e-07, "loss": 0.9166, "step": 1261 }, { "epoch": 0.008388140950013459, "grad_norm": 0.6153919100761414, "learning_rate": 6.484624567269615e-07, "loss": 0.7852, "step": 1262 }, { "epoch": 0.008394787654411252, "grad_norm": 0.7325903177261353, "learning_rate": 6.29570102338356e-07, "loss": 0.9253, "step": 1263 }, { "epoch": 0.008401434358809044, "grad_norm": 0.7558137774467468, "learning_rate": 6.109552937718588e-07, "loss": 0.6489, "step": 1264 }, { "epoch": 0.008408081063206835, "grad_norm": 0.6297517418861389, "learning_rate": 5.92618135674361e-07, "loss": 0.9842, "step": 1265 }, { "epoch": 0.008414727767604628, "grad_norm": 0.7743880748748779, "learning_rate": 5.745587311318968e-07, "loss": 0.8271, "step": 1266 }, { "epoch": 0.008421374472002419, "grad_norm": 0.7147794365882874, "learning_rate": 5.567771816690381e-07, "loss": 0.9824, "step": 1267 }, { "epoch": 0.008428021176400211, "grad_norm": 0.831110954284668, "learning_rate": 5.392735872483623e-07, "loss": 0.8822, "step": 1268 }, { "epoch": 0.008434667880798004, "grad_norm": 0.7674936652183533, "learning_rate": 5.220480462698462e-07, "loss": 0.7066, "step": 1269 }, { "epoch": 0.008441314585195795, "grad_norm": 0.8405891060829163, "learning_rate": 5.051006555703453e-07, "loss": 0.7432, "step": 1270 }, { "epoch": 0.008447961289593587, "grad_norm": 0.8006777167320251, "learning_rate": 4.884315104230264e-07, "loss": 0.8399, "step": 1271 }, { "epoch": 0.00845460799399138, "grad_norm": 0.8176466226577759, "learning_rate": 4.7204070453685244e-07, "loss": 0.6448, "step": 1272 }, { "epoch": 0.00846125469838917, "grad_norm": 0.74027019739151, "learning_rate": 4.5592833005603796e-07, "loss": 0.8987, "step": 1273 }, { "epoch": 0.008467901402786963, "grad_norm": 0.9606915712356567, "learning_rate": 4.4009447755954944e-07, "loss": 1.0994, "step": 1274 }, { "epoch": 0.008474548107184754, "grad_norm": 1.3926945924758911, "learning_rate": 4.2453923606057265e-07, "loss": 0.7428, "step": 1275 }, { "epoch": 0.008481194811582547, "grad_norm": 0.9520140290260315, "learning_rate": 4.0926269300603503e-07, "loss": 0.9258, "step": 1276 }, { "epoch": 0.00848784151598034, "grad_norm": 0.8461837768554688, "learning_rate": 3.9426493427611177e-07, "loss": 0.7099, "step": 1277 }, { "epoch": 0.00849448822037813, "grad_norm": 0.9905888438224792, "learning_rate": 3.795460441837095e-07, "loss": 0.8662, "step": 1278 }, { "epoch": 0.008501134924775923, "grad_norm": 0.9792956113815308, "learning_rate": 3.651061054740501e-07, "loss": 0.8404, "step": 1279 }, { "epoch": 0.008507781629173716, "grad_norm": 1.0338367223739624, "learning_rate": 3.5094519932415417e-07, "loss": 0.7575, "step": 1280 }, { "epoch": 0.008514428333571506, "grad_norm": 1.1163334846496582, "learning_rate": 3.370634053424082e-07, "loss": 0.7396, "step": 1281 }, { "epoch": 0.008521075037969299, "grad_norm": 1.0261398553848267, "learning_rate": 3.234608015681151e-07, "loss": 0.7362, "step": 1282 }, { "epoch": 0.008527721742367092, "grad_norm": 0.927574872970581, "learning_rate": 3.1013746447104975e-07, "loss": 0.8644, "step": 1283 }, { "epoch": 0.008534368446764883, "grad_norm": 1.1198316812515259, "learning_rate": 2.970934689510485e-07, "loss": 0.7248, "step": 1284 }, { "epoch": 0.008541015151162675, "grad_norm": 1.0062837600708008, "learning_rate": 2.843288883375539e-07, "loss": 0.7071, "step": 1285 }, { "epoch": 0.008547661855560466, "grad_norm": 1.5024248361587524, "learning_rate": 2.71843794389226e-07, "loss": 0.7523, "step": 1286 }, { "epoch": 0.008554308559958259, "grad_norm": 1.3791800737380981, "learning_rate": 2.596382572935374e-07, "loss": 0.7903, "step": 1287 }, { "epoch": 0.008560955264356051, "grad_norm": 1.1591501235961914, "learning_rate": 2.477123456663788e-07, "loss": 0.7717, "step": 1288 }, { "epoch": 0.008567601968753842, "grad_norm": 1.3647840023040771, "learning_rate": 2.3606612655166504e-07, "loss": 0.7711, "step": 1289 }, { "epoch": 0.008574248673151635, "grad_norm": 1.3846338987350464, "learning_rate": 2.2469966542096322e-07, "loss": 0.95, "step": 1290 }, { "epoch": 0.008580895377549427, "grad_norm": 1.1583385467529297, "learning_rate": 2.1361302617312619e-07, "loss": 0.5463, "step": 1291 }, { "epoch": 0.008587542081947218, "grad_norm": 1.452928900718689, "learning_rate": 2.028062711339318e-07, "loss": 0.8323, "step": 1292 }, { "epoch": 0.00859418878634501, "grad_norm": 1.6725733280181885, "learning_rate": 1.922794610557277e-07, "loss": 0.9138, "step": 1293 }, { "epoch": 0.008600835490742802, "grad_norm": 1.342020034790039, "learning_rate": 1.8203265511710365e-07, "loss": 0.7097, "step": 1294 }, { "epoch": 0.008607482195140594, "grad_norm": 1.666353464126587, "learning_rate": 1.7206591092253642e-07, "loss": 0.8405, "step": 1295 }, { "epoch": 0.008614128899538387, "grad_norm": 2.023453712463379, "learning_rate": 1.623792845020955e-07, "loss": 0.9479, "step": 1296 }, { "epoch": 0.008620775603936178, "grad_norm": 2.3681578636169434, "learning_rate": 1.529728303110989e-07, "loss": 0.8534, "step": 1297 }, { "epoch": 0.00862742230833397, "grad_norm": 1.3655821084976196, "learning_rate": 1.4384660122983007e-07, "loss": 0.6382, "step": 1298 }, { "epoch": 0.008634069012731763, "grad_norm": 2.0608959197998047, "learning_rate": 1.3500064856321603e-07, "loss": 0.7469, "step": 1299 }, { "epoch": 0.008640715717129554, "grad_norm": 3.251729726791382, "learning_rate": 1.264350220405719e-07, "loss": 1.0694, "step": 1300 }, { "epoch": 0.008647362421527346, "grad_norm": 0.3500362038612366, "learning_rate": 1.1814976981529002e-07, "loss": 1.0851, "step": 1301 }, { "epoch": 0.008654009125925137, "grad_norm": 0.7794168591499329, "learning_rate": 1.1014493846457919e-07, "loss": 1.0559, "step": 1302 }, { "epoch": 0.00866065583032293, "grad_norm": 0.49550482630729675, "learning_rate": 1.0242057298922581e-07, "loss": 0.791, "step": 1303 }, { "epoch": 0.008667302534720723, "grad_norm": 0.5143431425094604, "learning_rate": 9.497671681329423e-08, "loss": 0.7753, "step": 1304 }, { "epoch": 0.008673949239118513, "grad_norm": 0.49252620339393616, "learning_rate": 8.781341178393244e-08, "loss": 0.6776, "step": 1305 }, { "epoch": 0.008680595943516306, "grad_norm": 0.4753943681716919, "learning_rate": 8.093069817109445e-08, "loss": 0.9467, "step": 1306 }, { "epoch": 0.008687242647914099, "grad_norm": 0.5471718311309814, "learning_rate": 7.432861466734608e-08, "loss": 0.9961, "step": 1307 }, { "epoch": 0.00869388935231189, "grad_norm": 0.5803403258323669, "learning_rate": 6.800719838763182e-08, "loss": 0.7374, "step": 1308 }, { "epoch": 0.008700536056709682, "grad_norm": 0.7243188619613647, "learning_rate": 6.196648486906375e-08, "loss": 0.7798, "step": 1309 }, { "epoch": 0.008707182761107475, "grad_norm": 0.8324836492538452, "learning_rate": 5.620650807073857e-08, "loss": 1.129, "step": 1310 }, { "epoch": 0.008713829465505266, "grad_norm": 0.7195192575454712, "learning_rate": 5.072730037351536e-08, "loss": 0.8921, "step": 1311 }, { "epoch": 0.008720476169903058, "grad_norm": 0.6834086775779724, "learning_rate": 4.552889257987136e-08, "loss": 0.7839, "step": 1312 }, { "epoch": 0.008727122874300849, "grad_norm": 0.7733263373374939, "learning_rate": 4.06113139137021e-08, "loss": 0.8451, "step": 1313 }, { "epoch": 0.008733769578698642, "grad_norm": 0.7354872226715088, "learning_rate": 3.5974592020165954e-08, "loss": 0.8272, "step": 1314 }, { "epoch": 0.008740416283096434, "grad_norm": 0.646259069442749, "learning_rate": 3.161875296553429e-08, "loss": 0.7925, "step": 1315 }, { "epoch": 0.008747062987494225, "grad_norm": 0.7745428085327148, "learning_rate": 2.7543821237030475e-08, "loss": 0.9565, "step": 1316 }, { "epoch": 0.008753709691892018, "grad_norm": 0.6693745851516724, "learning_rate": 2.3749819742702185e-08, "loss": 0.7318, "step": 1317 }, { "epoch": 0.00876035639628981, "grad_norm": 0.731195330619812, "learning_rate": 2.0236769811299294e-08, "loss": 0.7627, "step": 1318 }, { "epoch": 0.008767003100687601, "grad_norm": 0.7857605814933777, "learning_rate": 1.7004691192135104e-08, "loss": 0.8653, "step": 1319 }, { "epoch": 0.008773649805085394, "grad_norm": 0.7077126502990723, "learning_rate": 1.4053602054991955e-08, "loss": 0.8137, "step": 1320 }, { "epoch": 0.008780296509483185, "grad_norm": 0.7648908495903015, "learning_rate": 1.1383518990015773e-08, "loss": 0.741, "step": 1321 }, { "epoch": 0.008786943213880977, "grad_norm": 0.9375500679016113, "learning_rate": 8.99445700761059e-09, "loss": 0.9017, "step": 1322 }, { "epoch": 0.00879358991827877, "grad_norm": 0.8353835940361023, "learning_rate": 6.886429538377481e-09, "loss": 0.7284, "step": 1323 }, { "epoch": 0.00880023662267656, "grad_norm": 1.0171085596084595, "learning_rate": 5.0594484330090955e-09, "loss": 0.7838, "step": 1324 }, { "epoch": 0.008806883327074353, "grad_norm": 0.786939263343811, "learning_rate": 3.513523962256349e-09, "loss": 0.8177, "step": 1325 }, { "epoch": 0.008813530031472146, "grad_norm": 0.8693004250526428, "learning_rate": 2.2486648168396075e-09, "loss": 0.8491, "step": 1326 }, { "epoch": 0.008820176735869937, "grad_norm": 0.9159846305847168, "learning_rate": 1.2648781074209304e-09, "loss": 0.7735, "step": 1327 }, { "epoch": 0.00882682344026773, "grad_norm": 1.1475635766983032, "learning_rate": 5.621693645541104e-10, "loss": 0.6395, "step": 1328 }, { "epoch": 0.00883347014466552, "grad_norm": 0.9265028238296509, "learning_rate": 1.4054253866246925e-10, "loss": 0.8467, "step": 1329 }, { "epoch": 0.008840116849063313, "grad_norm": 0.9815409779548645, "learning_rate": 0.0, "loss": 0.793, "step": 1330 } ], "logging_steps": 1, "max_steps": 1330, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 333, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.701942326080307e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }