{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0556576529272775, "eval_steps": 358, "global_step": 1431, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.8894236846455274e-05, "grad_norm": 1.1662758588790894, "learning_rate": 2e-05, "loss": 11.8744, "step": 1 }, { "epoch": 3.8894236846455274e-05, "eval_loss": 11.888253211975098, "eval_runtime": 141.0277, "eval_samples_per_second": 76.765, "eval_steps_per_second": 38.383, "step": 1 }, { "epoch": 7.778847369291055e-05, "grad_norm": 0.9679811000823975, "learning_rate": 4e-05, "loss": 11.8993, "step": 2 }, { "epoch": 0.00011668271053936583, "grad_norm": 1.1977771520614624, "learning_rate": 6e-05, "loss": 11.8772, "step": 3 }, { "epoch": 0.0001555769473858211, "grad_norm": 0.9583384394645691, "learning_rate": 8e-05, "loss": 11.8819, "step": 4 }, { "epoch": 0.0001944711842322764, "grad_norm": 0.9274106621742249, "learning_rate": 0.0001, "loss": 11.8968, "step": 5 }, { "epoch": 0.00023336542107873167, "grad_norm": 0.9043604731559753, "learning_rate": 0.00012, "loss": 11.8907, "step": 6 }, { "epoch": 0.00027225965792518694, "grad_norm": 1.1428650617599487, "learning_rate": 0.00014, "loss": 11.8915, "step": 7 }, { "epoch": 0.0003111538947716422, "grad_norm": 0.9495051503181458, "learning_rate": 0.00016, "loss": 11.8453, "step": 8 }, { "epoch": 0.0003500481316180975, "grad_norm": 1.1615668535232544, "learning_rate": 0.00018, "loss": 11.8653, "step": 9 }, { "epoch": 0.0003889423684645528, "grad_norm": 1.0326706171035767, "learning_rate": 0.0002, "loss": 11.8705, "step": 10 }, { "epoch": 0.00042783660531100804, "grad_norm": 1.2258696556091309, "learning_rate": 0.00019999975561113358, "loss": 11.884, "step": 11 }, { "epoch": 0.00046673084215746334, "grad_norm": 0.68500155210495, "learning_rate": 0.00019999902244572878, "loss": 11.876, "step": 12 }, { "epoch": 0.0005056250790039186, "grad_norm": 1.1503891944885254, "learning_rate": 0.00019999780050736918, "loss": 11.8235, "step": 13 }, { "epoch": 0.0005445193158503739, "grad_norm": 0.9620792269706726, "learning_rate": 0.00019999608980202736, "loss": 11.8692, "step": 14 }, { "epoch": 0.0005834135526968291, "grad_norm": 1.0733370780944824, "learning_rate": 0.00019999389033806478, "loss": 11.8313, "step": 15 }, { "epoch": 0.0006223077895432844, "grad_norm": 1.0507014989852905, "learning_rate": 0.00019999120212623207, "loss": 11.843, "step": 16 }, { "epoch": 0.0006612020263897397, "grad_norm": 1.0197004079818726, "learning_rate": 0.0001999880251796685, "loss": 11.8083, "step": 17 }, { "epoch": 0.000700096263236195, "grad_norm": 1.0033892393112183, "learning_rate": 0.0001999843595139023, "loss": 11.8158, "step": 18 }, { "epoch": 0.0007389905000826502, "grad_norm": 0.928495466709137, "learning_rate": 0.00019998020514685045, "loss": 11.8076, "step": 19 }, { "epoch": 0.0007778847369291056, "grad_norm": 0.973197340965271, "learning_rate": 0.00019997556209881858, "loss": 11.8024, "step": 20 }, { "epoch": 0.0008167789737755608, "grad_norm": 1.0069469213485718, "learning_rate": 0.00019997043039250083, "loss": 11.7835, "step": 21 }, { "epoch": 0.0008556732106220161, "grad_norm": 0.9379525780677795, "learning_rate": 0.00019996481005297987, "loss": 11.7693, "step": 22 }, { "epoch": 0.0008945674474684713, "grad_norm": 0.9633327722549438, "learning_rate": 0.00019995870110772667, "loss": 11.7792, "step": 23 }, { "epoch": 0.0009334616843149267, "grad_norm": 0.7901795506477356, "learning_rate": 0.00019995210358660038, "loss": 11.8143, "step": 24 }, { "epoch": 0.0009723559211613819, "grad_norm": 0.9895073175430298, "learning_rate": 0.0001999450175218482, "loss": 11.7626, "step": 25 }, { "epoch": 0.0010112501580078373, "grad_norm": 0.93720942735672, "learning_rate": 0.0001999374429481053, "loss": 11.7931, "step": 26 }, { "epoch": 0.0010501443948542925, "grad_norm": 0.872528076171875, "learning_rate": 0.00019992937990239443, "loss": 11.7729, "step": 27 }, { "epoch": 0.0010890386317007478, "grad_norm": 1.1012974977493286, "learning_rate": 0.00019992082842412603, "loss": 11.7914, "step": 28 }, { "epoch": 0.001127932868547203, "grad_norm": 1.0047305822372437, "learning_rate": 0.00019991178855509776, "loss": 11.7445, "step": 29 }, { "epoch": 0.0011668271053936583, "grad_norm": 0.9516699910163879, "learning_rate": 0.00019990226033949452, "loss": 11.7589, "step": 30 }, { "epoch": 0.0012057213422401135, "grad_norm": 0.8217305541038513, "learning_rate": 0.00019989224382388813, "loss": 11.7902, "step": 31 }, { "epoch": 0.0012446155790865688, "grad_norm": 0.9108098149299622, "learning_rate": 0.00019988173905723705, "loss": 11.7662, "step": 32 }, { "epoch": 0.0012835098159330242, "grad_norm": 0.7646684646606445, "learning_rate": 0.00019987074609088622, "loss": 11.7422, "step": 33 }, { "epoch": 0.0013224040527794795, "grad_norm": 0.8126352429389954, "learning_rate": 0.00019985926497856688, "loss": 11.7323, "step": 34 }, { "epoch": 0.0013612982896259347, "grad_norm": 0.8516243100166321, "learning_rate": 0.0001998472957763961, "loss": 11.7802, "step": 35 }, { "epoch": 0.00140019252647239, "grad_norm": 0.8620424866676331, "learning_rate": 0.00019983483854287668, "loss": 11.7233, "step": 36 }, { "epoch": 0.0014390867633188452, "grad_norm": 0.856157660484314, "learning_rate": 0.00019982189333889682, "loss": 11.7062, "step": 37 }, { "epoch": 0.0014779810001653005, "grad_norm": 0.8806939125061035, "learning_rate": 0.00019980846022772978, "loss": 11.717, "step": 38 }, { "epoch": 0.0015168752370117557, "grad_norm": 0.834979236125946, "learning_rate": 0.00019979453927503364, "loss": 11.7402, "step": 39 }, { "epoch": 0.0015557694738582112, "grad_norm": 0.6993672847747803, "learning_rate": 0.0001997801305488509, "loss": 11.8186, "step": 40 }, { "epoch": 0.0015946637107046664, "grad_norm": 0.8203376531600952, "learning_rate": 0.0001997652341196082, "loss": 11.694, "step": 41 }, { "epoch": 0.0016335579475511217, "grad_norm": 0.703465461730957, "learning_rate": 0.00019974985006011595, "loss": 11.7357, "step": 42 }, { "epoch": 0.001672452184397577, "grad_norm": 0.7644296884536743, "learning_rate": 0.00019973397844556806, "loss": 11.6971, "step": 43 }, { "epoch": 0.0017113464212440322, "grad_norm": 0.7772036790847778, "learning_rate": 0.0001997176193535414, "loss": 11.7109, "step": 44 }, { "epoch": 0.0017502406580904874, "grad_norm": 0.6891399025917053, "learning_rate": 0.0001997007728639956, "loss": 11.7127, "step": 45 }, { "epoch": 0.0017891348949369426, "grad_norm": 0.5980880856513977, "learning_rate": 0.00019968343905927256, "loss": 11.6656, "step": 46 }, { "epoch": 0.0018280291317833979, "grad_norm": 0.6086148619651794, "learning_rate": 0.000199665618024096, "loss": 11.661, "step": 47 }, { "epoch": 0.0018669233686298534, "grad_norm": 0.8212317824363708, "learning_rate": 0.00019964730984557122, "loss": 11.6534, "step": 48 }, { "epoch": 0.0019058176054763086, "grad_norm": 0.5709652304649353, "learning_rate": 0.00019962851461318454, "loss": 11.6627, "step": 49 }, { "epoch": 0.0019447118423227638, "grad_norm": 0.7298044562339783, "learning_rate": 0.00019960923241880277, "loss": 11.6767, "step": 50 }, { "epoch": 0.001983606079169219, "grad_norm": 0.589881956577301, "learning_rate": 0.0001995894633566731, "loss": 11.6827, "step": 51 }, { "epoch": 0.0020225003160156746, "grad_norm": 0.6468479037284851, "learning_rate": 0.00019956920752342225, "loss": 11.6714, "step": 52 }, { "epoch": 0.0020613945528621296, "grad_norm": 0.5610571503639221, "learning_rate": 0.00019954846501805623, "loss": 11.6745, "step": 53 }, { "epoch": 0.002100288789708585, "grad_norm": 0.5923136472702026, "learning_rate": 0.00019952723594195978, "loss": 11.6542, "step": 54 }, { "epoch": 0.00213918302655504, "grad_norm": 0.44219300150871277, "learning_rate": 0.00019950552039889585, "loss": 11.6558, "step": 55 }, { "epoch": 0.0021780772634014955, "grad_norm": 0.5585126280784607, "learning_rate": 0.0001994833184950053, "loss": 11.6534, "step": 56 }, { "epoch": 0.0022169715002479506, "grad_norm": 0.41971588134765625, "learning_rate": 0.00019946063033880596, "loss": 11.6404, "step": 57 }, { "epoch": 0.002255865737094406, "grad_norm": 0.4337867498397827, "learning_rate": 0.00019943745604119258, "loss": 11.6692, "step": 58 }, { "epoch": 0.0022947599739408615, "grad_norm": 0.4114167392253876, "learning_rate": 0.00019941379571543596, "loss": 11.6514, "step": 59 }, { "epoch": 0.0023336542107873165, "grad_norm": 0.4948188364505768, "learning_rate": 0.0001993896494771825, "loss": 11.63, "step": 60 }, { "epoch": 0.002372548447633772, "grad_norm": 0.37826672196388245, "learning_rate": 0.00019936501744445356, "loss": 11.6152, "step": 61 }, { "epoch": 0.002411442684480227, "grad_norm": 0.39225053787231445, "learning_rate": 0.00019933989973764508, "loss": 11.6293, "step": 62 }, { "epoch": 0.0024503369213266825, "grad_norm": 0.3635271191596985, "learning_rate": 0.00019931429647952685, "loss": 11.6844, "step": 63 }, { "epoch": 0.0024892311581731375, "grad_norm": 0.3496195077896118, "learning_rate": 0.00019928820779524186, "loss": 11.6328, "step": 64 }, { "epoch": 0.002528125395019593, "grad_norm": 0.3208658695220947, "learning_rate": 0.0001992616338123058, "loss": 11.6904, "step": 65 }, { "epoch": 0.0025670196318660484, "grad_norm": 0.2557298243045807, "learning_rate": 0.00019923457466060636, "loss": 11.6284, "step": 66 }, { "epoch": 0.0026059138687125035, "grad_norm": 0.3118024170398712, "learning_rate": 0.0001992070304724027, "loss": 11.6056, "step": 67 }, { "epoch": 0.002644808105558959, "grad_norm": 0.3541165590286255, "learning_rate": 0.0001991790013823246, "loss": 11.6168, "step": 68 }, { "epoch": 0.002683702342405414, "grad_norm": 0.22280801832675934, "learning_rate": 0.0001991504875273721, "loss": 11.6403, "step": 69 }, { "epoch": 0.0027225965792518694, "grad_norm": 0.2828383445739746, "learning_rate": 0.00019912148904691453, "loss": 11.6285, "step": 70 }, { "epoch": 0.0027614908160983245, "grad_norm": 0.28467994928359985, "learning_rate": 0.00019909200608268999, "loss": 11.6314, "step": 71 }, { "epoch": 0.00280038505294478, "grad_norm": 0.3228735029697418, "learning_rate": 0.00019906203877880464, "loss": 11.6323, "step": 72 }, { "epoch": 0.0028392792897912354, "grad_norm": 0.28654950857162476, "learning_rate": 0.00019903158728173205, "loss": 11.6147, "step": 73 }, { "epoch": 0.0028781735266376904, "grad_norm": 0.36778876185417175, "learning_rate": 0.0001990006517403123, "loss": 11.6292, "step": 74 }, { "epoch": 0.002917067763484146, "grad_norm": 0.2530018091201782, "learning_rate": 0.00019896923230575144, "loss": 11.6101, "step": 75 }, { "epoch": 0.002955962000330601, "grad_norm": 0.3366001844406128, "learning_rate": 0.0001989373291316207, "loss": 11.6587, "step": 76 }, { "epoch": 0.0029948562371770564, "grad_norm": 0.3933141827583313, "learning_rate": 0.00019890494237385562, "loss": 11.6827, "step": 77 }, { "epoch": 0.0030337504740235114, "grad_norm": 0.23036500811576843, "learning_rate": 0.00019887207219075552, "loss": 11.6536, "step": 78 }, { "epoch": 0.003072644710869967, "grad_norm": 0.2456391602754593, "learning_rate": 0.00019883871874298254, "loss": 11.5873, "step": 79 }, { "epoch": 0.0031115389477164223, "grad_norm": 0.24947503209114075, "learning_rate": 0.00019880488219356087, "loss": 11.6148, "step": 80 }, { "epoch": 0.0031504331845628774, "grad_norm": 0.33319100737571716, "learning_rate": 0.00019877056270787603, "loss": 11.5873, "step": 81 }, { "epoch": 0.003189327421409333, "grad_norm": 0.38822099566459656, "learning_rate": 0.00019873576045367406, "loss": 11.6151, "step": 82 }, { "epoch": 0.003228221658255788, "grad_norm": 0.25635385513305664, "learning_rate": 0.00019870047560106063, "loss": 11.6253, "step": 83 }, { "epoch": 0.0032671158951022433, "grad_norm": 0.2719604969024658, "learning_rate": 0.0001986647083225002, "loss": 11.5815, "step": 84 }, { "epoch": 0.0033060101319486983, "grad_norm": 0.2555078864097595, "learning_rate": 0.0001986284587928153, "loss": 11.5739, "step": 85 }, { "epoch": 0.003344904368795154, "grad_norm": 0.3243318498134613, "learning_rate": 0.00019859172718918552, "loss": 11.6231, "step": 86 }, { "epoch": 0.003383798605641609, "grad_norm": 0.29934969544410706, "learning_rate": 0.00019855451369114676, "loss": 11.6195, "step": 87 }, { "epoch": 0.0034226928424880643, "grad_norm": 0.34273651242256165, "learning_rate": 0.00019851681848059039, "loss": 11.6021, "step": 88 }, { "epoch": 0.0034615870793345198, "grad_norm": 0.2454499751329422, "learning_rate": 0.0001984786417417621, "loss": 11.6107, "step": 89 }, { "epoch": 0.003500481316180975, "grad_norm": 0.2224448025226593, "learning_rate": 0.0001984399836612614, "loss": 11.5895, "step": 90 }, { "epoch": 0.0035393755530274303, "grad_norm": 0.31493574380874634, "learning_rate": 0.0001984008444280403, "loss": 11.5723, "step": 91 }, { "epoch": 0.0035782697898738853, "grad_norm": 0.42259109020233154, "learning_rate": 0.00019836122423340266, "loss": 11.5755, "step": 92 }, { "epoch": 0.0036171640267203408, "grad_norm": 0.36388954520225525, "learning_rate": 0.0001983211232710032, "loss": 11.576, "step": 93 }, { "epoch": 0.0036560582635667958, "grad_norm": 0.26240676641464233, "learning_rate": 0.00019828054173684644, "loss": 11.6007, "step": 94 }, { "epoch": 0.0036949525004132512, "grad_norm": 0.3404752314090729, "learning_rate": 0.00019823947982928596, "loss": 11.5708, "step": 95 }, { "epoch": 0.0037338467372597067, "grad_norm": 0.27526772022247314, "learning_rate": 0.0001981979377490232, "loss": 11.6016, "step": 96 }, { "epoch": 0.0037727409741061617, "grad_norm": 0.3854866027832031, "learning_rate": 0.00019815591569910654, "loss": 11.6739, "step": 97 }, { "epoch": 0.003811635210952617, "grad_norm": 0.3097264766693115, "learning_rate": 0.00019811341388493046, "loss": 11.5611, "step": 98 }, { "epoch": 0.0038505294477990722, "grad_norm": 0.29225629568099976, "learning_rate": 0.00019807043251423437, "loss": 11.5627, "step": 99 }, { "epoch": 0.0038894236846455277, "grad_norm": 0.2493603229522705, "learning_rate": 0.0001980269717971016, "loss": 11.5669, "step": 100 }, { "epoch": 0.003928317921491983, "grad_norm": 0.22307652235031128, "learning_rate": 0.00019798303194595846, "loss": 11.5798, "step": 101 }, { "epoch": 0.003967212158338438, "grad_norm": 0.283069908618927, "learning_rate": 0.0001979386131755732, "loss": 11.5932, "step": 102 }, { "epoch": 0.004006106395184893, "grad_norm": 0.24232083559036255, "learning_rate": 0.00019789371570305483, "loss": 11.5431, "step": 103 }, { "epoch": 0.004045000632031349, "grad_norm": 0.2204054594039917, "learning_rate": 0.00019784833974785223, "loss": 11.5386, "step": 104 }, { "epoch": 0.004083894868877804, "grad_norm": 0.2796245217323303, "learning_rate": 0.00019780248553175297, "loss": 11.6006, "step": 105 }, { "epoch": 0.004122789105724259, "grad_norm": 0.25308501720428467, "learning_rate": 0.0001977561532788822, "loss": 11.5659, "step": 106 }, { "epoch": 0.004161683342570714, "grad_norm": 0.2785077393054962, "learning_rate": 0.00019770934321570175, "loss": 11.5672, "step": 107 }, { "epoch": 0.00420057757941717, "grad_norm": 0.24214982986450195, "learning_rate": 0.00019766205557100868, "loss": 11.5405, "step": 108 }, { "epoch": 0.004239471816263625, "grad_norm": 0.27231502532958984, "learning_rate": 0.00019761429057593453, "loss": 11.5551, "step": 109 }, { "epoch": 0.00427836605311008, "grad_norm": 0.2523971199989319, "learning_rate": 0.00019756604846394394, "loss": 11.577, "step": 110 }, { "epoch": 0.004317260289956536, "grad_norm": 0.3524065613746643, "learning_rate": 0.0001975173294708336, "loss": 11.5515, "step": 111 }, { "epoch": 0.004356154526802991, "grad_norm": 0.24760575592517853, "learning_rate": 0.0001974681338347311, "loss": 11.6259, "step": 112 }, { "epoch": 0.004395048763649446, "grad_norm": 0.2742294371128082, "learning_rate": 0.00019741846179609378, "loss": 11.5397, "step": 113 }, { "epoch": 0.004433943000495901, "grad_norm": 0.3052935004234314, "learning_rate": 0.00019736831359770752, "loss": 11.5505, "step": 114 }, { "epoch": 0.004472837237342357, "grad_norm": 0.24488042294979095, "learning_rate": 0.00019731768948468549, "loss": 11.5381, "step": 115 }, { "epoch": 0.004511731474188812, "grad_norm": 0.25622838735580444, "learning_rate": 0.00019726658970446712, "loss": 11.5123, "step": 116 }, { "epoch": 0.004550625711035267, "grad_norm": 0.3100413680076599, "learning_rate": 0.00019721501450681674, "loss": 11.5561, "step": 117 }, { "epoch": 0.004589519947881723, "grad_norm": 0.3365084230899811, "learning_rate": 0.00019716296414382247, "loss": 11.5776, "step": 118 }, { "epoch": 0.004628414184728178, "grad_norm": 0.23528669774532318, "learning_rate": 0.00019711043886989484, "loss": 11.5334, "step": 119 }, { "epoch": 0.004667308421574633, "grad_norm": 0.18646976351737976, "learning_rate": 0.00019705743894176574, "loss": 11.5241, "step": 120 }, { "epoch": 0.004706202658421088, "grad_norm": 0.1631908267736435, "learning_rate": 0.00019700396461848696, "loss": 11.5248, "step": 121 }, { "epoch": 0.004745096895267544, "grad_norm": 0.27178144454956055, "learning_rate": 0.00019695001616142915, "loss": 11.6115, "step": 122 }, { "epoch": 0.004783991132113999, "grad_norm": 0.37406253814697266, "learning_rate": 0.00019689559383428032, "loss": 11.53, "step": 123 }, { "epoch": 0.004822885368960454, "grad_norm": 0.16064143180847168, "learning_rate": 0.0001968406979030447, "loss": 11.5086, "step": 124 }, { "epoch": 0.00486177960580691, "grad_norm": 0.2815974950790405, "learning_rate": 0.00019678532863604136, "loss": 11.5819, "step": 125 }, { "epoch": 0.004900673842653365, "grad_norm": 0.27079054713249207, "learning_rate": 0.00019672948630390294, "loss": 11.5352, "step": 126 }, { "epoch": 0.00493956807949982, "grad_norm": 0.19512930512428284, "learning_rate": 0.0001966731711795744, "loss": 11.5129, "step": 127 }, { "epoch": 0.004978462316346275, "grad_norm": 0.1568814218044281, "learning_rate": 0.00019661638353831142, "loss": 11.5294, "step": 128 }, { "epoch": 0.005017356553192731, "grad_norm": 0.20038583874702454, "learning_rate": 0.0001965591236576794, "loss": 11.5469, "step": 129 }, { "epoch": 0.005056250790039186, "grad_norm": 0.18642742931842804, "learning_rate": 0.0001965013918175519, "loss": 11.5535, "step": 130 }, { "epoch": 0.005095145026885641, "grad_norm": 0.23478096723556519, "learning_rate": 0.00019644318830010926, "loss": 11.5114, "step": 131 }, { "epoch": 0.005134039263732097, "grad_norm": 0.20690174400806427, "learning_rate": 0.00019638451338983736, "loss": 11.5184, "step": 132 }, { "epoch": 0.005172933500578552, "grad_norm": 0.2394125759601593, "learning_rate": 0.00019632536737352607, "loss": 11.5161, "step": 133 }, { "epoch": 0.005211827737425007, "grad_norm": 0.1465281844139099, "learning_rate": 0.00019626575054026795, "loss": 11.5224, "step": 134 }, { "epoch": 0.005250721974271462, "grad_norm": 0.22588512301445007, "learning_rate": 0.0001962056631814568, "loss": 11.4984, "step": 135 }, { "epoch": 0.005289616211117918, "grad_norm": 0.30839601159095764, "learning_rate": 0.00019614510559078625, "loss": 11.662, "step": 136 }, { "epoch": 0.005328510447964373, "grad_norm": 0.17439375817775726, "learning_rate": 0.00019608407806424833, "loss": 11.5528, "step": 137 }, { "epoch": 0.005367404684810828, "grad_norm": 0.15316593647003174, "learning_rate": 0.000196022580900132, "loss": 11.523, "step": 138 }, { "epoch": 0.005406298921657284, "grad_norm": 0.3325675427913666, "learning_rate": 0.0001959606143990217, "loss": 11.6042, "step": 139 }, { "epoch": 0.005445193158503739, "grad_norm": 0.12072927504777908, "learning_rate": 0.00019589817886379586, "loss": 11.5029, "step": 140 }, { "epoch": 0.005484087395350194, "grad_norm": 0.22607655823230743, "learning_rate": 0.00019583527459962553, "loss": 11.5007, "step": 141 }, { "epoch": 0.005522981632196649, "grad_norm": 0.18810729682445526, "learning_rate": 0.00019577190191397273, "loss": 11.5657, "step": 142 }, { "epoch": 0.005561875869043105, "grad_norm": 0.1946786642074585, "learning_rate": 0.00019570806111658898, "loss": 11.5044, "step": 143 }, { "epoch": 0.00560077010588956, "grad_norm": 0.2539864778518677, "learning_rate": 0.00019564375251951397, "loss": 11.5446, "step": 144 }, { "epoch": 0.005639664342736015, "grad_norm": 0.3369433879852295, "learning_rate": 0.00019557897643707376, "loss": 11.6018, "step": 145 }, { "epoch": 0.005678558579582471, "grad_norm": 0.1706317812204361, "learning_rate": 0.0001955137331858794, "loss": 11.5318, "step": 146 }, { "epoch": 0.005717452816428926, "grad_norm": 0.23957085609436035, "learning_rate": 0.00019544802308482542, "loss": 11.5257, "step": 147 }, { "epoch": 0.005756347053275381, "grad_norm": 0.2712627649307251, "learning_rate": 0.0001953818464550881, "loss": 11.5593, "step": 148 }, { "epoch": 0.005795241290121836, "grad_norm": 0.19239541888237, "learning_rate": 0.0001953152036201241, "loss": 11.5102, "step": 149 }, { "epoch": 0.005834135526968292, "grad_norm": 0.18179698288440704, "learning_rate": 0.00019524809490566877, "loss": 11.5242, "step": 150 }, { "epoch": 0.005873029763814747, "grad_norm": 0.2665189802646637, "learning_rate": 0.00019518052063973457, "loss": 11.5768, "step": 151 }, { "epoch": 0.005911924000661202, "grad_norm": 0.29264965653419495, "learning_rate": 0.00019511248115260945, "loss": 11.5513, "step": 152 }, { "epoch": 0.005950818237507658, "grad_norm": 0.28770771622657776, "learning_rate": 0.00019504397677685528, "loss": 11.5175, "step": 153 }, { "epoch": 0.005989712474354113, "grad_norm": 0.11623543500900269, "learning_rate": 0.00019497500784730615, "loss": 11.5372, "step": 154 }, { "epoch": 0.006028606711200568, "grad_norm": 0.1338321566581726, "learning_rate": 0.00019490557470106686, "loss": 11.543, "step": 155 }, { "epoch": 0.006067500948047023, "grad_norm": 0.1583881825208664, "learning_rate": 0.00019483567767751122, "loss": 11.4993, "step": 156 }, { "epoch": 0.006106395184893479, "grad_norm": 0.22257395088672638, "learning_rate": 0.00019476531711828027, "loss": 11.4959, "step": 157 }, { "epoch": 0.006145289421739934, "grad_norm": 0.20336033403873444, "learning_rate": 0.00019469449336728073, "loss": 11.4997, "step": 158 }, { "epoch": 0.006184183658586389, "grad_norm": 0.16265328228473663, "learning_rate": 0.00019462320677068336, "loss": 11.5433, "step": 159 }, { "epoch": 0.006223077895432845, "grad_norm": 0.17649494111537933, "learning_rate": 0.00019455145767692118, "loss": 11.509, "step": 160 }, { "epoch": 0.0062619721322793, "grad_norm": 0.18802830576896667, "learning_rate": 0.00019447924643668777, "loss": 11.5124, "step": 161 }, { "epoch": 0.006300866369125755, "grad_norm": 0.19875842332839966, "learning_rate": 0.0001944065734029356, "loss": 11.5046, "step": 162 }, { "epoch": 0.00633976060597221, "grad_norm": 0.30943620204925537, "learning_rate": 0.00019433343893087422, "loss": 11.5691, "step": 163 }, { "epoch": 0.006378654842818666, "grad_norm": 0.1045689508318901, "learning_rate": 0.0001942598433779687, "loss": 11.5418, "step": 164 }, { "epoch": 0.006417549079665121, "grad_norm": 0.18484659492969513, "learning_rate": 0.00019418578710393773, "loss": 11.5109, "step": 165 }, { "epoch": 0.006456443316511576, "grad_norm": 0.15224042534828186, "learning_rate": 0.00019411127047075187, "loss": 11.484, "step": 166 }, { "epoch": 0.006495337553358032, "grad_norm": 0.20575569570064545, "learning_rate": 0.0001940362938426318, "loss": 11.534, "step": 167 }, { "epoch": 0.006534231790204487, "grad_norm": 0.1405501365661621, "learning_rate": 0.00019396085758604663, "loss": 11.4943, "step": 168 }, { "epoch": 0.006573126027050942, "grad_norm": 0.11041560769081116, "learning_rate": 0.00019388496206971197, "loss": 11.5116, "step": 169 }, { "epoch": 0.006612020263897397, "grad_norm": 0.1681826114654541, "learning_rate": 0.0001938086076645882, "loss": 11.5459, "step": 170 }, { "epoch": 0.006650914500743853, "grad_norm": 0.09526358544826508, "learning_rate": 0.00019373179474387858, "loss": 11.4893, "step": 171 }, { "epoch": 0.006689808737590308, "grad_norm": 0.26661109924316406, "learning_rate": 0.0001936545236830277, "loss": 11.497, "step": 172 }, { "epoch": 0.006728702974436763, "grad_norm": 0.12802819907665253, "learning_rate": 0.00019357679485971921, "loss": 11.5208, "step": 173 }, { "epoch": 0.006767597211283218, "grad_norm": 0.1455337256193161, "learning_rate": 0.0001934986086538743, "loss": 11.4953, "step": 174 }, { "epoch": 0.006806491448129674, "grad_norm": 0.13375167548656464, "learning_rate": 0.00019341996544764976, "loss": 11.4877, "step": 175 }, { "epoch": 0.006845385684976129, "grad_norm": 0.21124620735645294, "learning_rate": 0.00019334086562543604, "loss": 11.5414, "step": 176 }, { "epoch": 0.006884279921822584, "grad_norm": 0.1629767268896103, "learning_rate": 0.00019326130957385547, "loss": 11.4901, "step": 177 }, { "epoch": 0.0069231741586690395, "grad_norm": 0.14561820030212402, "learning_rate": 0.00019318129768176032, "loss": 11.4858, "step": 178 }, { "epoch": 0.0069620683955154946, "grad_norm": 0.17226560413837433, "learning_rate": 0.0001931008303402309, "loss": 11.4949, "step": 179 }, { "epoch": 0.00700096263236195, "grad_norm": 0.13079045712947845, "learning_rate": 0.00019301990794257367, "loss": 11.499, "step": 180 }, { "epoch": 0.007039856869208405, "grad_norm": 0.12004582583904266, "learning_rate": 0.00019293853088431925, "loss": 11.4715, "step": 181 }, { "epoch": 0.0070787511060548605, "grad_norm": 0.16466955840587616, "learning_rate": 0.00019285669956322062, "loss": 11.5104, "step": 182 }, { "epoch": 0.0071176453429013155, "grad_norm": 0.1606258898973465, "learning_rate": 0.00019277441437925107, "loss": 11.5074, "step": 183 }, { "epoch": 0.007156539579747771, "grad_norm": 0.18595169484615326, "learning_rate": 0.0001926916757346022, "loss": 11.491, "step": 184 }, { "epoch": 0.0071954338165942265, "grad_norm": 0.14777301251888275, "learning_rate": 0.0001926084840336821, "loss": 11.5086, "step": 185 }, { "epoch": 0.0072343280534406815, "grad_norm": 0.1498323529958725, "learning_rate": 0.0001925248396831133, "loss": 11.4873, "step": 186 }, { "epoch": 0.0072732222902871365, "grad_norm": 0.25931549072265625, "learning_rate": 0.00019244074309173077, "loss": 11.5798, "step": 187 }, { "epoch": 0.0073121165271335916, "grad_norm": 0.20122070610523224, "learning_rate": 0.0001923561946705799, "loss": 11.5076, "step": 188 }, { "epoch": 0.0073510107639800475, "grad_norm": 0.15286849439144135, "learning_rate": 0.00019227119483291455, "loss": 11.5127, "step": 189 }, { "epoch": 0.0073899050008265025, "grad_norm": 0.18253396451473236, "learning_rate": 0.000192185743994195, "loss": 11.5885, "step": 190 }, { "epoch": 0.0074287992376729575, "grad_norm": 0.16337540745735168, "learning_rate": 0.0001920998425720859, "loss": 11.5003, "step": 191 }, { "epoch": 0.007467693474519413, "grad_norm": 0.17238447070121765, "learning_rate": 0.00019201349098645434, "loss": 11.5201, "step": 192 }, { "epoch": 0.0075065877113658684, "grad_norm": 0.12057357281446457, "learning_rate": 0.00019192668965936757, "loss": 11.4665, "step": 193 }, { "epoch": 0.0075454819482123235, "grad_norm": 0.11605846136808395, "learning_rate": 0.00019183943901509117, "loss": 11.4775, "step": 194 }, { "epoch": 0.0075843761850587785, "grad_norm": 0.10071561485528946, "learning_rate": 0.00019175173948008688, "loss": 11.5028, "step": 195 }, { "epoch": 0.007623270421905234, "grad_norm": 0.18784727156162262, "learning_rate": 0.0001916635914830105, "loss": 11.4792, "step": 196 }, { "epoch": 0.0076621646587516894, "grad_norm": 0.1296684890985489, "learning_rate": 0.00019157499545470978, "loss": 11.5104, "step": 197 }, { "epoch": 0.0077010588955981445, "grad_norm": 0.14614522457122803, "learning_rate": 0.0001914859518282224, "loss": 11.5186, "step": 198 }, { "epoch": 0.0077399531324446, "grad_norm": 0.13492991030216217, "learning_rate": 0.0001913964610387738, "loss": 11.492, "step": 199 }, { "epoch": 0.007778847369291055, "grad_norm": 0.19949865341186523, "learning_rate": 0.00019130652352377498, "loss": 11.5476, "step": 200 }, { "epoch": 0.007817741606137511, "grad_norm": 0.16694244742393494, "learning_rate": 0.00019121613972282052, "loss": 11.4981, "step": 201 }, { "epoch": 0.007856635842983966, "grad_norm": 0.1532403826713562, "learning_rate": 0.00019112531007768632, "loss": 11.4917, "step": 202 }, { "epoch": 0.007895530079830421, "grad_norm": 0.3644372224807739, "learning_rate": 0.0001910340350323274, "loss": 11.6989, "step": 203 }, { "epoch": 0.007934424316676876, "grad_norm": 0.1219576820731163, "learning_rate": 0.00019094231503287597, "loss": 11.4976, "step": 204 }, { "epoch": 0.007973318553523331, "grad_norm": 0.3214426338672638, "learning_rate": 0.00019085015052763886, "loss": 11.5762, "step": 205 }, { "epoch": 0.008012212790369786, "grad_norm": 0.29731321334838867, "learning_rate": 0.00019075754196709572, "loss": 11.5047, "step": 206 }, { "epoch": 0.008051107027216241, "grad_norm": 0.11155258864164352, "learning_rate": 0.0001906644898038965, "loss": 11.5168, "step": 207 }, { "epoch": 0.008090001264062698, "grad_norm": 0.18182918429374695, "learning_rate": 0.00019057099449285952, "loss": 11.5551, "step": 208 }, { "epoch": 0.008128895500909153, "grad_norm": 0.14495058357715607, "learning_rate": 0.00019047705649096903, "loss": 11.5106, "step": 209 }, { "epoch": 0.008167789737755608, "grad_norm": 0.12827762961387634, "learning_rate": 0.00019038267625737305, "loss": 11.5301, "step": 210 }, { "epoch": 0.008206683974602063, "grad_norm": 0.22183631360530853, "learning_rate": 0.00019028785425338114, "loss": 11.5512, "step": 211 }, { "epoch": 0.008245578211448518, "grad_norm": 0.2280638962984085, "learning_rate": 0.00019019259094246214, "loss": 11.5021, "step": 212 }, { "epoch": 0.008284472448294973, "grad_norm": 0.09937578439712524, "learning_rate": 0.0001900968867902419, "loss": 11.4953, "step": 213 }, { "epoch": 0.008323366685141428, "grad_norm": 0.1843242347240448, "learning_rate": 0.00019000074226450107, "loss": 11.5246, "step": 214 }, { "epoch": 0.008362260921987885, "grad_norm": 0.17141276597976685, "learning_rate": 0.00018990415783517257, "loss": 11.4873, "step": 215 }, { "epoch": 0.00840115515883434, "grad_norm": 0.12739145755767822, "learning_rate": 0.00018980713397433969, "loss": 11.4796, "step": 216 }, { "epoch": 0.008440049395680795, "grad_norm": 0.16645096242427826, "learning_rate": 0.00018970967115623338, "loss": 11.5137, "step": 217 }, { "epoch": 0.00847894363252725, "grad_norm": 0.16383016109466553, "learning_rate": 0.00018961176985723025, "loss": 11.534, "step": 218 }, { "epoch": 0.008517837869373705, "grad_norm": 0.11116816848516464, "learning_rate": 0.00018951343055585, "loss": 11.4808, "step": 219 }, { "epoch": 0.00855673210622016, "grad_norm": 0.2168058156967163, "learning_rate": 0.0001894146537327533, "loss": 11.5187, "step": 220 }, { "epoch": 0.008595626343066615, "grad_norm": 0.2552565634250641, "learning_rate": 0.00018931543987073918, "loss": 11.4963, "step": 221 }, { "epoch": 0.008634520579913072, "grad_norm": 0.15340295433998108, "learning_rate": 0.00018921578945474296, "loss": 11.4824, "step": 222 }, { "epoch": 0.008673414816759527, "grad_norm": 0.17909100651741028, "learning_rate": 0.0001891157029718337, "loss": 11.4815, "step": 223 }, { "epoch": 0.008712309053605982, "grad_norm": 0.11708688735961914, "learning_rate": 0.00018901518091121183, "loss": 11.4783, "step": 224 }, { "epoch": 0.008751203290452437, "grad_norm": 0.15015773475170135, "learning_rate": 0.00018891422376420675, "loss": 11.4911, "step": 225 }, { "epoch": 0.008790097527298892, "grad_norm": 0.24841494858264923, "learning_rate": 0.00018881283202427457, "loss": 11.5282, "step": 226 }, { "epoch": 0.008828991764145347, "grad_norm": 0.15825675427913666, "learning_rate": 0.00018871100618699554, "loss": 11.5035, "step": 227 }, { "epoch": 0.008867886000991802, "grad_norm": 0.23037609457969666, "learning_rate": 0.00018860874675007166, "loss": 11.5717, "step": 228 }, { "epoch": 0.008906780237838259, "grad_norm": 0.1274932324886322, "learning_rate": 0.00018850605421332425, "loss": 11.5099, "step": 229 }, { "epoch": 0.008945674474684714, "grad_norm": 0.15414027869701385, "learning_rate": 0.00018840292907869164, "loss": 11.5021, "step": 230 }, { "epoch": 0.008984568711531169, "grad_norm": 0.14080509543418884, "learning_rate": 0.00018829937185022648, "loss": 11.4605, "step": 231 }, { "epoch": 0.009023462948377624, "grad_norm": 0.1649276167154312, "learning_rate": 0.00018819538303409343, "loss": 11.5563, "step": 232 }, { "epoch": 0.00906235718522408, "grad_norm": 0.20261478424072266, "learning_rate": 0.0001880909631385667, "loss": 11.4842, "step": 233 }, { "epoch": 0.009101251422070534, "grad_norm": 0.15972651541233063, "learning_rate": 0.00018798611267402746, "loss": 11.5308, "step": 234 }, { "epoch": 0.00914014565891699, "grad_norm": 0.12716291844844818, "learning_rate": 0.00018788083215296147, "loss": 11.5493, "step": 235 }, { "epoch": 0.009179039895763446, "grad_norm": 0.11277750879526138, "learning_rate": 0.00018777512208995644, "loss": 11.4873, "step": 236 }, { "epoch": 0.009217934132609901, "grad_norm": 0.2449299842119217, "learning_rate": 0.00018766898300169964, "loss": 11.4818, "step": 237 }, { "epoch": 0.009256828369456356, "grad_norm": 0.24685290455818176, "learning_rate": 0.0001875624154069753, "loss": 11.5448, "step": 238 }, { "epoch": 0.009295722606302811, "grad_norm": 0.11273160576820374, "learning_rate": 0.00018745541982666204, "loss": 11.4993, "step": 239 }, { "epoch": 0.009334616843149266, "grad_norm": 0.19707056879997253, "learning_rate": 0.0001873479967837305, "loss": 11.4912, "step": 240 }, { "epoch": 0.009373511079995721, "grad_norm": 0.19559109210968018, "learning_rate": 0.00018724014680324057, "loss": 11.5583, "step": 241 }, { "epoch": 0.009412405316842176, "grad_norm": 0.12464620918035507, "learning_rate": 0.00018713187041233896, "loss": 11.5049, "step": 242 }, { "epoch": 0.009451299553688633, "grad_norm": 0.16976694762706757, "learning_rate": 0.00018702316814025652, "loss": 11.531, "step": 243 }, { "epoch": 0.009490193790535088, "grad_norm": 0.12237662822008133, "learning_rate": 0.00018691404051830577, "loss": 11.5151, "step": 244 }, { "epoch": 0.009529088027381543, "grad_norm": 0.23064935207366943, "learning_rate": 0.0001868044880798782, "loss": 11.4778, "step": 245 }, { "epoch": 0.009567982264227998, "grad_norm": 0.23447439074516296, "learning_rate": 0.0001866945113604418, "loss": 11.486, "step": 246 }, { "epoch": 0.009606876501074453, "grad_norm": 0.133781298995018, "learning_rate": 0.00018658411089753822, "loss": 11.4716, "step": 247 }, { "epoch": 0.009645770737920908, "grad_norm": 0.13565121591091156, "learning_rate": 0.00018647328723078038, "loss": 11.4709, "step": 248 }, { "epoch": 0.009684664974767363, "grad_norm": 0.17978216707706451, "learning_rate": 0.0001863620409018497, "loss": 11.4723, "step": 249 }, { "epoch": 0.00972355921161382, "grad_norm": 0.23136495053768158, "learning_rate": 0.00018625037245449338, "loss": 11.4649, "step": 250 }, { "epoch": 0.009762453448460275, "grad_norm": 0.17434270679950714, "learning_rate": 0.00018613828243452206, "loss": 11.4591, "step": 251 }, { "epoch": 0.00980134768530673, "grad_norm": 0.12544648349285126, "learning_rate": 0.00018602577138980664, "loss": 11.5083, "step": 252 }, { "epoch": 0.009840241922153185, "grad_norm": 0.14478613436222076, "learning_rate": 0.00018591283987027615, "loss": 11.4851, "step": 253 }, { "epoch": 0.00987913615899964, "grad_norm": 0.12133371084928513, "learning_rate": 0.00018579948842791473, "loss": 11.5108, "step": 254 }, { "epoch": 0.009918030395846095, "grad_norm": 0.1275811493396759, "learning_rate": 0.00018568571761675893, "loss": 11.4951, "step": 255 }, { "epoch": 0.00995692463269255, "grad_norm": 0.11182014644145966, "learning_rate": 0.00018557152799289516, "loss": 11.4864, "step": 256 }, { "epoch": 0.009995818869539007, "grad_norm": 0.09808533638715744, "learning_rate": 0.00018545692011445692, "loss": 11.4967, "step": 257 }, { "epoch": 0.010034713106385462, "grad_norm": 0.17356520891189575, "learning_rate": 0.00018534189454162193, "loss": 11.4949, "step": 258 }, { "epoch": 0.010073607343231917, "grad_norm": 0.17073096334934235, "learning_rate": 0.0001852264518366096, "loss": 11.5942, "step": 259 }, { "epoch": 0.010112501580078372, "grad_norm": 0.08817508071660995, "learning_rate": 0.0001851105925636782, "loss": 11.4701, "step": 260 }, { "epoch": 0.010151395816924827, "grad_norm": 0.11892060935497284, "learning_rate": 0.00018499431728912202, "loss": 11.5024, "step": 261 }, { "epoch": 0.010190290053771282, "grad_norm": 0.2091784030199051, "learning_rate": 0.0001848776265812687, "loss": 11.5089, "step": 262 }, { "epoch": 0.010229184290617737, "grad_norm": 0.11762546002864838, "learning_rate": 0.0001847605210104765, "loss": 11.501, "step": 263 }, { "epoch": 0.010268078527464194, "grad_norm": 0.170832097530365, "learning_rate": 0.00018464300114913131, "loss": 11.5595, "step": 264 }, { "epoch": 0.010306972764310649, "grad_norm": 0.1387489289045334, "learning_rate": 0.00018452506757164408, "loss": 11.4831, "step": 265 }, { "epoch": 0.010345867001157104, "grad_norm": 0.2430105209350586, "learning_rate": 0.00018440672085444785, "loss": 11.4895, "step": 266 }, { "epoch": 0.010384761238003559, "grad_norm": 0.1432645618915558, "learning_rate": 0.000184287961575995, "loss": 11.4757, "step": 267 }, { "epoch": 0.010423655474850014, "grad_norm": 0.2623089849948883, "learning_rate": 0.00018416879031675455, "loss": 11.4758, "step": 268 }, { "epoch": 0.010462549711696469, "grad_norm": 0.20094339549541473, "learning_rate": 0.00018404920765920896, "loss": 11.5302, "step": 269 }, { "epoch": 0.010501443948542924, "grad_norm": 0.20814162492752075, "learning_rate": 0.0001839292141878517, "loss": 11.517, "step": 270 }, { "epoch": 0.01054033818538938, "grad_norm": 0.2994871139526367, "learning_rate": 0.00018380881048918405, "loss": 11.5087, "step": 271 }, { "epoch": 0.010579232422235836, "grad_norm": 0.16648192703723907, "learning_rate": 0.0001836879971517126, "loss": 11.485, "step": 272 }, { "epoch": 0.01061812665908229, "grad_norm": 0.1507142335176468, "learning_rate": 0.00018356677476594598, "loss": 11.4861, "step": 273 }, { "epoch": 0.010657020895928746, "grad_norm": 0.1451101005077362, "learning_rate": 0.00018344514392439222, "loss": 11.4866, "step": 274 }, { "epoch": 0.0106959151327752, "grad_norm": 0.240260511636734, "learning_rate": 0.00018332310522155577, "loss": 11.5395, "step": 275 }, { "epoch": 0.010734809369621656, "grad_norm": 0.14388932287693024, "learning_rate": 0.00018320065925393468, "loss": 11.4862, "step": 276 }, { "epoch": 0.010773703606468111, "grad_norm": 0.10203976929187775, "learning_rate": 0.00018307780662001757, "loss": 11.4856, "step": 277 }, { "epoch": 0.010812597843314568, "grad_norm": 0.22282658517360687, "learning_rate": 0.00018295454792028072, "loss": 11.5122, "step": 278 }, { "epoch": 0.010851492080161023, "grad_norm": 0.2494766116142273, "learning_rate": 0.00018283088375718523, "loss": 11.5375, "step": 279 }, { "epoch": 0.010890386317007478, "grad_norm": 0.2778933048248291, "learning_rate": 0.000182706814735174, "loss": 11.4991, "step": 280 }, { "epoch": 0.010929280553853933, "grad_norm": 0.27196282148361206, "learning_rate": 0.00018258234146066875, "loss": 11.4813, "step": 281 }, { "epoch": 0.010968174790700388, "grad_norm": 0.40496769547462463, "learning_rate": 0.00018245746454206719, "loss": 11.5469, "step": 282 }, { "epoch": 0.011007069027546843, "grad_norm": 0.271714985370636, "learning_rate": 0.00018233218458973984, "loss": 11.4572, "step": 283 }, { "epoch": 0.011045963264393298, "grad_norm": 0.15667541325092316, "learning_rate": 0.00018220650221602723, "loss": 11.4822, "step": 284 }, { "epoch": 0.011084857501239755, "grad_norm": 0.1764007955789566, "learning_rate": 0.00018208041803523682, "loss": 11.4946, "step": 285 }, { "epoch": 0.01112375173808621, "grad_norm": 0.23075497150421143, "learning_rate": 0.00018195393266363997, "loss": 11.488, "step": 286 }, { "epoch": 0.011162645974932665, "grad_norm": 0.215908482670784, "learning_rate": 0.00018182704671946908, "loss": 11.4877, "step": 287 }, { "epoch": 0.01120154021177912, "grad_norm": 0.17155992984771729, "learning_rate": 0.00018169976082291436, "loss": 11.475, "step": 288 }, { "epoch": 0.011240434448625575, "grad_norm": 0.47240957617759705, "learning_rate": 0.0001815720755961209, "loss": 11.4831, "step": 289 }, { "epoch": 0.01127932868547203, "grad_norm": 0.14636549353599548, "learning_rate": 0.00018144399166318572, "loss": 11.4903, "step": 290 }, { "epoch": 0.011318222922318485, "grad_norm": 0.29348665475845337, "learning_rate": 0.0001813155096501545, "loss": 11.553, "step": 291 }, { "epoch": 0.011357117159164942, "grad_norm": 0.1544867604970932, "learning_rate": 0.00018118663018501873, "loss": 11.4704, "step": 292 }, { "epoch": 0.011396011396011397, "grad_norm": 0.15028752386569977, "learning_rate": 0.00018105735389771255, "loss": 11.5807, "step": 293 }, { "epoch": 0.011434905632857852, "grad_norm": 0.15231017768383026, "learning_rate": 0.0001809276814201097, "loss": 11.4876, "step": 294 }, { "epoch": 0.011473799869704307, "grad_norm": 0.2768847942352295, "learning_rate": 0.0001807976133860203, "loss": 11.5484, "step": 295 }, { "epoch": 0.011512694106550762, "grad_norm": 0.30941978096961975, "learning_rate": 0.00018066715043118796, "loss": 11.5026, "step": 296 }, { "epoch": 0.011551588343397217, "grad_norm": 0.2332240790128708, "learning_rate": 0.00018053629319328662, "loss": 11.5484, "step": 297 }, { "epoch": 0.011590482580243672, "grad_norm": 0.2580146789550781, "learning_rate": 0.00018040504231191723, "loss": 11.5982, "step": 298 }, { "epoch": 0.011629376817090128, "grad_norm": 0.30972573161125183, "learning_rate": 0.00018027339842860491, "loss": 11.6134, "step": 299 }, { "epoch": 0.011668271053936584, "grad_norm": 0.20016920566558838, "learning_rate": 0.00018014136218679567, "loss": 11.4852, "step": 300 }, { "epoch": 0.011707165290783039, "grad_norm": 0.20848527550697327, "learning_rate": 0.00018000893423185326, "loss": 11.4712, "step": 301 }, { "epoch": 0.011746059527629494, "grad_norm": 0.47000396251678467, "learning_rate": 0.00017987611521105596, "loss": 11.5168, "step": 302 }, { "epoch": 0.011784953764475949, "grad_norm": 0.18191461265087128, "learning_rate": 0.00017974290577359368, "loss": 11.471, "step": 303 }, { "epoch": 0.011823848001322404, "grad_norm": 0.1994115114212036, "learning_rate": 0.00017960930657056438, "loss": 11.4781, "step": 304 }, { "epoch": 0.011862742238168859, "grad_norm": 0.14457939565181732, "learning_rate": 0.0001794753182549713, "loss": 11.4769, "step": 305 }, { "epoch": 0.011901636475015315, "grad_norm": 0.19035786390304565, "learning_rate": 0.00017934094148171944, "loss": 11.4802, "step": 306 }, { "epoch": 0.01194053071186177, "grad_norm": 0.2894309461116791, "learning_rate": 0.0001792061769076126, "loss": 11.4748, "step": 307 }, { "epoch": 0.011979424948708225, "grad_norm": 0.14909781515598297, "learning_rate": 0.00017907102519134992, "loss": 11.4956, "step": 308 }, { "epoch": 0.01201831918555468, "grad_norm": 0.40601280331611633, "learning_rate": 0.000178935486993523, "loss": 11.4823, "step": 309 }, { "epoch": 0.012057213422401136, "grad_norm": 0.3468347489833832, "learning_rate": 0.0001787995629766123, "loss": 11.5814, "step": 310 }, { "epoch": 0.01209610765924759, "grad_norm": 0.18564815819263458, "learning_rate": 0.00017866325380498416, "loss": 11.493, "step": 311 }, { "epoch": 0.012135001896094046, "grad_norm": 0.2548716068267822, "learning_rate": 0.00017852656014488748, "loss": 11.4803, "step": 312 }, { "epoch": 0.012173896132940502, "grad_norm": 0.30909988284111023, "learning_rate": 0.00017838948266445042, "loss": 11.4792, "step": 313 }, { "epoch": 0.012212790369786957, "grad_norm": 0.15396501123905182, "learning_rate": 0.00017825202203367718, "loss": 11.4803, "step": 314 }, { "epoch": 0.012251684606633412, "grad_norm": 0.23552362620830536, "learning_rate": 0.00017811417892444473, "loss": 11.4715, "step": 315 }, { "epoch": 0.012290578843479867, "grad_norm": 0.18171647191047668, "learning_rate": 0.00017797595401049948, "loss": 11.46, "step": 316 }, { "epoch": 0.012329473080326322, "grad_norm": 0.13703273236751556, "learning_rate": 0.000177837347967454, "loss": 11.5141, "step": 317 }, { "epoch": 0.012368367317172778, "grad_norm": 0.21532994508743286, "learning_rate": 0.0001776983614727838, "loss": 11.4668, "step": 318 }, { "epoch": 0.012407261554019233, "grad_norm": 0.1492289900779724, "learning_rate": 0.00017755899520582394, "loss": 11.4488, "step": 319 }, { "epoch": 0.01244615579086569, "grad_norm": 0.26667195558547974, "learning_rate": 0.0001774192498477657, "loss": 11.529, "step": 320 }, { "epoch": 0.012485050027712144, "grad_norm": 0.12595298886299133, "learning_rate": 0.00017727912608165317, "loss": 11.477, "step": 321 }, { "epoch": 0.0125239442645586, "grad_norm": 0.18781131505966187, "learning_rate": 0.00017713862459238025, "loss": 11.578, "step": 322 }, { "epoch": 0.012562838501405054, "grad_norm": 0.18776938319206238, "learning_rate": 0.00017699774606668684, "loss": 11.4694, "step": 323 }, { "epoch": 0.01260173273825151, "grad_norm": 0.16067877411842346, "learning_rate": 0.0001768564911931559, "loss": 11.4986, "step": 324 }, { "epoch": 0.012640626975097964, "grad_norm": 0.17368176579475403, "learning_rate": 0.00017671486066220965, "loss": 11.4754, "step": 325 }, { "epoch": 0.01267952121194442, "grad_norm": 0.1696547567844391, "learning_rate": 0.00017657285516610674, "loss": 11.4623, "step": 326 }, { "epoch": 0.012718415448790876, "grad_norm": 0.17458011209964752, "learning_rate": 0.00017643047539893836, "loss": 11.4611, "step": 327 }, { "epoch": 0.012757309685637331, "grad_norm": 0.17413705587387085, "learning_rate": 0.00017628772205662506, "loss": 11.4753, "step": 328 }, { "epoch": 0.012796203922483786, "grad_norm": 0.3501397669315338, "learning_rate": 0.00017614459583691346, "loss": 11.4867, "step": 329 }, { "epoch": 0.012835098159330241, "grad_norm": 0.1743520200252533, "learning_rate": 0.0001760010974393726, "loss": 11.4719, "step": 330 }, { "epoch": 0.012873992396176696, "grad_norm": 0.1871674358844757, "learning_rate": 0.00017585722756539073, "loss": 11.4499, "step": 331 }, { "epoch": 0.012912886633023151, "grad_norm": 0.2043357491493225, "learning_rate": 0.00017571298691817177, "loss": 11.4926, "step": 332 }, { "epoch": 0.012951780869869606, "grad_norm": 0.12393887341022491, "learning_rate": 0.0001755683762027318, "loss": 11.4711, "step": 333 }, { "epoch": 0.012990675106716063, "grad_norm": 0.12969204783439636, "learning_rate": 0.0001754233961258959, "loss": 11.4762, "step": 334 }, { "epoch": 0.013029569343562518, "grad_norm": 0.19811223447322845, "learning_rate": 0.00017527804739629437, "loss": 11.5223, "step": 335 }, { "epoch": 0.013068463580408973, "grad_norm": 0.2735598385334015, "learning_rate": 0.0001751323307243594, "loss": 11.4442, "step": 336 }, { "epoch": 0.013107357817255428, "grad_norm": 0.6070407629013062, "learning_rate": 0.00017498624682232166, "loss": 11.4486, "step": 337 }, { "epoch": 0.013146252054101883, "grad_norm": 0.11541075259447098, "learning_rate": 0.00017483979640420678, "loss": 11.5045, "step": 338 }, { "epoch": 0.013185146290948338, "grad_norm": 0.15650533139705658, "learning_rate": 0.0001746929801858317, "loss": 11.523, "step": 339 }, { "epoch": 0.013224040527794793, "grad_norm": 0.46195220947265625, "learning_rate": 0.00017454579888480148, "loss": 11.5236, "step": 340 }, { "epoch": 0.013262934764641248, "grad_norm": 0.17538081109523773, "learning_rate": 0.00017439825322050547, "loss": 11.4582, "step": 341 }, { "epoch": 0.013301829001487705, "grad_norm": 0.2557459771633148, "learning_rate": 0.0001742503439141141, "loss": 11.4396, "step": 342 }, { "epoch": 0.01334072323833416, "grad_norm": 0.13369663059711456, "learning_rate": 0.00017410207168857511, "loss": 11.4582, "step": 343 }, { "epoch": 0.013379617475180615, "grad_norm": 0.15669149160385132, "learning_rate": 0.00017395343726861012, "loss": 11.4719, "step": 344 }, { "epoch": 0.01341851171202707, "grad_norm": 0.17750877141952515, "learning_rate": 0.00017380444138071104, "loss": 11.4899, "step": 345 }, { "epoch": 0.013457405948873525, "grad_norm": 0.1812746524810791, "learning_rate": 0.0001736550847531366, "loss": 11.4777, "step": 346 }, { "epoch": 0.01349630018571998, "grad_norm": 0.22354459762573242, "learning_rate": 0.0001735053681159088, "loss": 11.4815, "step": 347 }, { "epoch": 0.013535194422566435, "grad_norm": 0.21211768686771393, "learning_rate": 0.00017335529220080916, "loss": 11.4873, "step": 348 }, { "epoch": 0.013574088659412892, "grad_norm": 0.1898476928472519, "learning_rate": 0.00017320485774137537, "loss": 11.4736, "step": 349 }, { "epoch": 0.013612982896259347, "grad_norm": 0.15847232937812805, "learning_rate": 0.00017305406547289754, "loss": 11.4438, "step": 350 }, { "epoch": 0.013651877133105802, "grad_norm": 0.17429223656654358, "learning_rate": 0.00017290291613241473, "loss": 11.4899, "step": 351 }, { "epoch": 0.013690771369952257, "grad_norm": 0.16740380227565765, "learning_rate": 0.00017275141045871125, "loss": 11.4782, "step": 352 }, { "epoch": 0.013729665606798712, "grad_norm": 0.15039494633674622, "learning_rate": 0.0001725995491923131, "loss": 11.4772, "step": 353 }, { "epoch": 0.013768559843645167, "grad_norm": 0.12922844290733337, "learning_rate": 0.00017244733307548432, "loss": 11.4517, "step": 354 }, { "epoch": 0.013807454080491622, "grad_norm": 0.15773743391036987, "learning_rate": 0.00017229476285222342, "loss": 11.5229, "step": 355 }, { "epoch": 0.013846348317338079, "grad_norm": 0.23168423771858215, "learning_rate": 0.00017214183926825966, "loss": 11.4593, "step": 356 }, { "epoch": 0.013885242554184534, "grad_norm": 0.14979079365730286, "learning_rate": 0.00017198856307104946, "loss": 11.4771, "step": 357 }, { "epoch": 0.013924136791030989, "grad_norm": 0.3258258104324341, "learning_rate": 0.00017183493500977278, "loss": 11.4833, "step": 358 }, { "epoch": 0.013924136791030989, "eval_loss": 11.507861137390137, "eval_runtime": 138.2894, "eval_samples_per_second": 78.285, "eval_steps_per_second": 39.143, "step": 358 }, { "epoch": 0.013963031027877444, "grad_norm": 0.3205585479736328, "learning_rate": 0.0001716809558353293, "loss": 11.5049, "step": 359 }, { "epoch": 0.0140019252647239, "grad_norm": 0.12936124205589294, "learning_rate": 0.00017152662630033505, "loss": 11.4437, "step": 360 }, { "epoch": 0.014040819501570354, "grad_norm": 0.13870011270046234, "learning_rate": 0.00017137194715911833, "loss": 11.4819, "step": 361 }, { "epoch": 0.01407971373841681, "grad_norm": 0.10452631115913391, "learning_rate": 0.00017121691916771638, "loss": 11.4855, "step": 362 }, { "epoch": 0.014118607975263266, "grad_norm": 0.30000603199005127, "learning_rate": 0.0001710615430838715, "loss": 11.4618, "step": 363 }, { "epoch": 0.014157502212109721, "grad_norm": 0.23039597272872925, "learning_rate": 0.0001709058196670274, "loss": 11.5625, "step": 364 }, { "epoch": 0.014196396448956176, "grad_norm": 0.12431518733501434, "learning_rate": 0.00017074974967832545, "loss": 11.4401, "step": 365 }, { "epoch": 0.014235290685802631, "grad_norm": 0.29899483919143677, "learning_rate": 0.00017059333388060098, "loss": 11.4839, "step": 366 }, { "epoch": 0.014274184922649086, "grad_norm": 0.11857055872678757, "learning_rate": 0.00017043657303837963, "loss": 11.4696, "step": 367 }, { "epoch": 0.014313079159495541, "grad_norm": 0.15218877792358398, "learning_rate": 0.00017027946791787352, "loss": 11.5398, "step": 368 }, { "epoch": 0.014351973396341996, "grad_norm": 0.09413671493530273, "learning_rate": 0.00017012201928697738, "loss": 11.4368, "step": 369 }, { "epoch": 0.014390867633188453, "grad_norm": 0.1152544692158699, "learning_rate": 0.00016996422791526515, "loss": 11.4649, "step": 370 }, { "epoch": 0.014429761870034908, "grad_norm": 0.1539829671382904, "learning_rate": 0.0001698060945739859, "loss": 11.4294, "step": 371 }, { "epoch": 0.014468656106881363, "grad_norm": 0.38121601939201355, "learning_rate": 0.00016964762003606016, "loss": 11.5103, "step": 372 }, { "epoch": 0.014507550343727818, "grad_norm": 0.11009401828050613, "learning_rate": 0.0001694888050760762, "loss": 11.4647, "step": 373 }, { "epoch": 0.014546444580574273, "grad_norm": 0.14690318703651428, "learning_rate": 0.0001693296504702862, "loss": 11.4769, "step": 374 }, { "epoch": 0.014585338817420728, "grad_norm": 0.132308691740036, "learning_rate": 0.00016917015699660244, "loss": 11.4758, "step": 375 }, { "epoch": 0.014624233054267183, "grad_norm": 0.12044641375541687, "learning_rate": 0.00016901032543459348, "loss": 11.4626, "step": 376 }, { "epoch": 0.01466312729111364, "grad_norm": 0.14883580803871155, "learning_rate": 0.0001688501565654804, "loss": 11.471, "step": 377 }, { "epoch": 0.014702021527960095, "grad_norm": 0.23667500913143158, "learning_rate": 0.000168689651172133, "loss": 11.4532, "step": 378 }, { "epoch": 0.01474091576480655, "grad_norm": 0.38999703526496887, "learning_rate": 0.0001685288100390659, "loss": 11.4697, "step": 379 }, { "epoch": 0.014779810001653005, "grad_norm": 0.09403635561466217, "learning_rate": 0.00016836763395243468, "loss": 11.4562, "step": 380 }, { "epoch": 0.01481870423849946, "grad_norm": 0.206887885928154, "learning_rate": 0.00016820612370003221, "loss": 11.5378, "step": 381 }, { "epoch": 0.014857598475345915, "grad_norm": 0.12810903787612915, "learning_rate": 0.00016804428007128466, "loss": 11.4557, "step": 382 }, { "epoch": 0.01489649271219237, "grad_norm": 0.2048032134771347, "learning_rate": 0.00016788210385724762, "loss": 11.5238, "step": 383 }, { "epoch": 0.014935386949038827, "grad_norm": 0.2907865047454834, "learning_rate": 0.00016771959585060232, "loss": 11.4953, "step": 384 }, { "epoch": 0.014974281185885282, "grad_norm": 0.6095656752586365, "learning_rate": 0.0001675567568456517, "loss": 11.5224, "step": 385 }, { "epoch": 0.015013175422731737, "grad_norm": 0.19066348671913147, "learning_rate": 0.00016739358763831656, "loss": 11.4465, "step": 386 }, { "epoch": 0.015052069659578192, "grad_norm": 0.15955707430839539, "learning_rate": 0.0001672300890261317, "loss": 11.5322, "step": 387 }, { "epoch": 0.015090963896424647, "grad_norm": 0.2904532849788666, "learning_rate": 0.00016706626180824186, "loss": 11.4777, "step": 388 }, { "epoch": 0.015129858133271102, "grad_norm": 0.32818281650543213, "learning_rate": 0.000166902106785398, "loss": 11.4286, "step": 389 }, { "epoch": 0.015168752370117557, "grad_norm": 0.11760245263576508, "learning_rate": 0.00016673762475995343, "loss": 11.4806, "step": 390 }, { "epoch": 0.015207646606964014, "grad_norm": 0.5074398517608643, "learning_rate": 0.00016657281653585955, "loss": 11.5177, "step": 391 }, { "epoch": 0.015246540843810469, "grad_norm": 0.11384806782007217, "learning_rate": 0.00016640768291866227, "loss": 11.4524, "step": 392 }, { "epoch": 0.015285435080656924, "grad_norm": 0.22359219193458557, "learning_rate": 0.00016624222471549798, "loss": 11.477, "step": 393 }, { "epoch": 0.015324329317503379, "grad_norm": 0.22272583842277527, "learning_rate": 0.0001660764427350895, "loss": 11.4525, "step": 394 }, { "epoch": 0.015363223554349834, "grad_norm": 0.1591319441795349, "learning_rate": 0.0001659103377877423, "loss": 11.5647, "step": 395 }, { "epoch": 0.015402117791196289, "grad_norm": 0.18074019253253937, "learning_rate": 0.0001657439106853403, "loss": 11.4354, "step": 396 }, { "epoch": 0.015441012028042744, "grad_norm": 0.3558606505393982, "learning_rate": 0.00016557716224134216, "loss": 11.5447, "step": 397 }, { "epoch": 0.0154799062648892, "grad_norm": 0.1546933948993683, "learning_rate": 0.00016541009327077715, "loss": 11.4561, "step": 398 }, { "epoch": 0.015518800501735656, "grad_norm": 0.24904824793338776, "learning_rate": 0.00016524270459024115, "loss": 11.5131, "step": 399 }, { "epoch": 0.01555769473858211, "grad_norm": 0.13351157307624817, "learning_rate": 0.0001650749970178928, "loss": 11.4785, "step": 400 }, { "epoch": 0.015596588975428566, "grad_norm": 0.2609478533267975, "learning_rate": 0.00016490697137344938, "loss": 11.4535, "step": 401 }, { "epoch": 0.015635483212275023, "grad_norm": 0.12605337798595428, "learning_rate": 0.00016473862847818277, "loss": 11.4694, "step": 402 }, { "epoch": 0.015674377449121476, "grad_norm": 0.14125953614711761, "learning_rate": 0.00016456996915491562, "loss": 11.4653, "step": 403 }, { "epoch": 0.015713271685967933, "grad_norm": 0.14558717608451843, "learning_rate": 0.0001644009942280171, "loss": 11.4657, "step": 404 }, { "epoch": 0.015752165922814386, "grad_norm": 0.2568672001361847, "learning_rate": 0.00016423170452339905, "loss": 11.4466, "step": 405 }, { "epoch": 0.015791060159660843, "grad_norm": 0.2427850365638733, "learning_rate": 0.00016406210086851184, "loss": 11.478, "step": 406 }, { "epoch": 0.015829954396507296, "grad_norm": 0.18654537200927734, "learning_rate": 0.00016389218409234037, "loss": 11.4737, "step": 407 }, { "epoch": 0.015868848633353753, "grad_norm": 0.1696103811264038, "learning_rate": 0.00016372195502540002, "loss": 11.4878, "step": 408 }, { "epoch": 0.01590774287020021, "grad_norm": 0.2038358896970749, "learning_rate": 0.00016355141449973256, "loss": 11.4679, "step": 409 }, { "epoch": 0.015946637107046663, "grad_norm": 0.131368950009346, "learning_rate": 0.0001633805633489021, "loss": 11.4821, "step": 410 }, { "epoch": 0.01598553134389312, "grad_norm": 0.12556558847427368, "learning_rate": 0.000163209402407991, "loss": 11.4256, "step": 411 }, { "epoch": 0.016024425580739573, "grad_norm": 0.24933062493801117, "learning_rate": 0.00016303793251359585, "loss": 11.5033, "step": 412 }, { "epoch": 0.01606331981758603, "grad_norm": 0.12761083245277405, "learning_rate": 0.00016286615450382332, "loss": 11.4297, "step": 413 }, { "epoch": 0.016102214054432483, "grad_norm": 0.25256770849227905, "learning_rate": 0.00016269406921828606, "loss": 11.5313, "step": 414 }, { "epoch": 0.01614110829127894, "grad_norm": 0.10969125479459763, "learning_rate": 0.0001625216774980986, "loss": 11.5064, "step": 415 }, { "epoch": 0.016180002528125396, "grad_norm": 0.169768288731575, "learning_rate": 0.00016234898018587337, "loss": 11.4491, "step": 416 }, { "epoch": 0.01621889676497185, "grad_norm": 0.157431498169899, "learning_rate": 0.0001621759781257163, "loss": 11.4606, "step": 417 }, { "epoch": 0.016257791001818307, "grad_norm": 0.3962075114250183, "learning_rate": 0.0001620026721632229, "loss": 11.4564, "step": 418 }, { "epoch": 0.01629668523866476, "grad_norm": 0.12321560829877853, "learning_rate": 0.00016182906314547423, "loss": 11.4394, "step": 419 }, { "epoch": 0.016335579475511217, "grad_norm": 0.12217842787504196, "learning_rate": 0.00016165515192103245, "loss": 11.4775, "step": 420 }, { "epoch": 0.01637447371235767, "grad_norm": 0.27666154503822327, "learning_rate": 0.00016148093933993692, "loss": 11.4748, "step": 421 }, { "epoch": 0.016413367949204127, "grad_norm": 0.1424492746591568, "learning_rate": 0.00016130642625369993, "loss": 11.5012, "step": 422 }, { "epoch": 0.016452262186050583, "grad_norm": 0.16903488337993622, "learning_rate": 0.0001611316135153026, "loss": 11.4579, "step": 423 }, { "epoch": 0.016491156422897037, "grad_norm": 0.20407697558403015, "learning_rate": 0.00016095650197919063, "loss": 11.5047, "step": 424 }, { "epoch": 0.016530050659743493, "grad_norm": 0.3471042811870575, "learning_rate": 0.00016078109250127027, "loss": 11.4789, "step": 425 }, { "epoch": 0.016568944896589947, "grad_norm": 0.30747127532958984, "learning_rate": 0.00016060538593890396, "loss": 11.4888, "step": 426 }, { "epoch": 0.016607839133436404, "grad_norm": 0.4272612929344177, "learning_rate": 0.00016042938315090628, "loss": 11.4779, "step": 427 }, { "epoch": 0.016646733370282857, "grad_norm": 0.12549056112766266, "learning_rate": 0.00016025308499753962, "loss": 11.4958, "step": 428 }, { "epoch": 0.016685627607129314, "grad_norm": 0.13514985144138336, "learning_rate": 0.00016007649234051012, "loss": 11.4351, "step": 429 }, { "epoch": 0.01672452184397577, "grad_norm": 0.14483191072940826, "learning_rate": 0.0001598996060429634, "loss": 11.4877, "step": 430 }, { "epoch": 0.016763416080822224, "grad_norm": 0.20620040595531464, "learning_rate": 0.00015972242696948023, "loss": 11.4457, "step": 431 }, { "epoch": 0.01680231031766868, "grad_norm": 0.13124777376651764, "learning_rate": 0.00015954495598607252, "loss": 11.4917, "step": 432 }, { "epoch": 0.016841204554515134, "grad_norm": 0.08985606580972672, "learning_rate": 0.00015936719396017889, "loss": 11.4537, "step": 433 }, { "epoch": 0.01688009879136159, "grad_norm": 0.139922633767128, "learning_rate": 0.00015918914176066054, "loss": 11.501, "step": 434 }, { "epoch": 0.016918993028208044, "grad_norm": 0.10267654806375504, "learning_rate": 0.00015901080025779702, "loss": 11.4442, "step": 435 }, { "epoch": 0.0169578872650545, "grad_norm": 0.4352187514305115, "learning_rate": 0.00015883217032328182, "loss": 11.4614, "step": 436 }, { "epoch": 0.016996781501900957, "grad_norm": 0.19934622943401337, "learning_rate": 0.0001586532528302183, "loss": 11.5283, "step": 437 }, { "epoch": 0.01703567573874741, "grad_norm": 0.09342128038406372, "learning_rate": 0.00015847404865311536, "loss": 11.4648, "step": 438 }, { "epoch": 0.017074569975593867, "grad_norm": 0.3421262204647064, "learning_rate": 0.00015829455866788313, "loss": 11.4942, "step": 439 }, { "epoch": 0.01711346421244032, "grad_norm": 0.3187176585197449, "learning_rate": 0.0001581147837518286, "loss": 11.4361, "step": 440 }, { "epoch": 0.017152358449286777, "grad_norm": 0.19588756561279297, "learning_rate": 0.00015793472478365162, "loss": 11.4523, "step": 441 }, { "epoch": 0.01719125268613323, "grad_norm": 0.2834365963935852, "learning_rate": 0.00015775438264344026, "loss": 11.4632, "step": 442 }, { "epoch": 0.017230146922979687, "grad_norm": 0.15775611996650696, "learning_rate": 0.0001575737582126668, "loss": 11.5202, "step": 443 }, { "epoch": 0.017269041159826144, "grad_norm": 0.12323816865682602, "learning_rate": 0.0001573928523741832, "loss": 11.4652, "step": 444 }, { "epoch": 0.017307935396672598, "grad_norm": 0.21613965928554535, "learning_rate": 0.00015721166601221698, "loss": 11.4788, "step": 445 }, { "epoch": 0.017346829633519054, "grad_norm": 0.2511967718601227, "learning_rate": 0.00015703020001236665, "loss": 11.4588, "step": 446 }, { "epoch": 0.017385723870365508, "grad_norm": 0.08450320363044739, "learning_rate": 0.00015684845526159767, "loss": 11.4585, "step": 447 }, { "epoch": 0.017424618107211964, "grad_norm": 0.1496826708316803, "learning_rate": 0.00015666643264823787, "loss": 11.4661, "step": 448 }, { "epoch": 0.017463512344058418, "grad_norm": 0.13807488977909088, "learning_rate": 0.00015648413306197325, "loss": 11.4589, "step": 449 }, { "epoch": 0.017502406580904874, "grad_norm": 0.1394951492547989, "learning_rate": 0.00015630155739384364, "loss": 11.5089, "step": 450 }, { "epoch": 0.01754130081775133, "grad_norm": 0.42306429147720337, "learning_rate": 0.00015611870653623825, "loss": 11.4751, "step": 451 }, { "epoch": 0.017580195054597784, "grad_norm": 0.4221981465816498, "learning_rate": 0.00015593558138289132, "loss": 11.4538, "step": 452 }, { "epoch": 0.01761908929144424, "grad_norm": 0.11176911741495132, "learning_rate": 0.00015575218282887782, "loss": 11.4456, "step": 453 }, { "epoch": 0.017657983528290695, "grad_norm": 0.10232798010110855, "learning_rate": 0.00015556851177060907, "loss": 11.5036, "step": 454 }, { "epoch": 0.01769687776513715, "grad_norm": 0.18220168352127075, "learning_rate": 0.0001553845691058283, "loss": 11.5132, "step": 455 }, { "epoch": 0.017735772001983605, "grad_norm": 0.31858009099960327, "learning_rate": 0.00015520035573360626, "loss": 11.5271, "step": 456 }, { "epoch": 0.01777466623883006, "grad_norm": 0.12854431569576263, "learning_rate": 0.000155015872554337, "loss": 11.4812, "step": 457 }, { "epoch": 0.017813560475676518, "grad_norm": 0.13666190207004547, "learning_rate": 0.0001548311204697331, "loss": 11.4507, "step": 458 }, { "epoch": 0.01785245471252297, "grad_norm": 0.29696568846702576, "learning_rate": 0.00015464610038282167, "loss": 11.4584, "step": 459 }, { "epoch": 0.017891348949369428, "grad_norm": 0.14931274950504303, "learning_rate": 0.00015446081319793969, "loss": 11.4221, "step": 460 }, { "epoch": 0.01793024318621588, "grad_norm": 0.23341301083564758, "learning_rate": 0.00015427525982072962, "loss": 11.4627, "step": 461 }, { "epoch": 0.017969137423062338, "grad_norm": 0.13812541961669922, "learning_rate": 0.0001540894411581351, "loss": 11.5192, "step": 462 }, { "epoch": 0.01800803165990879, "grad_norm": 0.2535600960254669, "learning_rate": 0.0001539033581183964, "loss": 11.4715, "step": 463 }, { "epoch": 0.01804692589675525, "grad_norm": 0.1264931857585907, "learning_rate": 0.0001537170116110459, "loss": 11.4935, "step": 464 }, { "epoch": 0.018085820133601705, "grad_norm": 0.22763359546661377, "learning_rate": 0.00015353040254690393, "loss": 11.4827, "step": 465 }, { "epoch": 0.01812471437044816, "grad_norm": 0.14794450998306274, "learning_rate": 0.00015334353183807397, "loss": 11.4729, "step": 466 }, { "epoch": 0.018163608607294615, "grad_norm": 0.196041539311409, "learning_rate": 0.00015315640039793846, "loss": 11.5024, "step": 467 }, { "epoch": 0.01820250284414107, "grad_norm": 0.10622090846300125, "learning_rate": 0.00015296900914115417, "loss": 11.4356, "step": 468 }, { "epoch": 0.018241397080987525, "grad_norm": 0.31031596660614014, "learning_rate": 0.00015278135898364793, "loss": 11.466, "step": 469 }, { "epoch": 0.01828029131783398, "grad_norm": 0.14226101338863373, "learning_rate": 0.00015259345084261185, "loss": 11.4444, "step": 470 }, { "epoch": 0.018319185554680435, "grad_norm": 0.08401184529066086, "learning_rate": 0.00015240528563649906, "loss": 11.4541, "step": 471 }, { "epoch": 0.018358079791526892, "grad_norm": 0.1299443542957306, "learning_rate": 0.00015221686428501928, "loss": 11.4389, "step": 472 }, { "epoch": 0.018396974028373345, "grad_norm": 0.5548118352890015, "learning_rate": 0.00015202818770913409, "loss": 11.4713, "step": 473 }, { "epoch": 0.018435868265219802, "grad_norm": 0.20359492301940918, "learning_rate": 0.00015183925683105254, "loss": 11.5051, "step": 474 }, { "epoch": 0.018474762502066255, "grad_norm": 0.15279248356819153, "learning_rate": 0.00015165007257422668, "loss": 11.5072, "step": 475 }, { "epoch": 0.018513656738912712, "grad_norm": 0.17459337413311005, "learning_rate": 0.00015146063586334707, "loss": 11.4696, "step": 476 }, { "epoch": 0.018552550975759165, "grad_norm": 0.24277138710021973, "learning_rate": 0.00015127094762433816, "loss": 11.445, "step": 477 }, { "epoch": 0.018591445212605622, "grad_norm": 0.36216112971305847, "learning_rate": 0.00015108100878435387, "loss": 11.4581, "step": 478 }, { "epoch": 0.01863033944945208, "grad_norm": 0.157160684466362, "learning_rate": 0.0001508908202717729, "loss": 11.459, "step": 479 }, { "epoch": 0.018669233686298532, "grad_norm": 0.07062587141990662, "learning_rate": 0.00015070038301619437, "loss": 11.4427, "step": 480 }, { "epoch": 0.01870812792314499, "grad_norm": 0.21761757135391235, "learning_rate": 0.00015050969794843316, "loss": 11.4747, "step": 481 }, { "epoch": 0.018747022159991442, "grad_norm": 0.27131345868110657, "learning_rate": 0.0001503187660005154, "loss": 11.5052, "step": 482 }, { "epoch": 0.0187859163968379, "grad_norm": 0.12191030383110046, "learning_rate": 0.00015012758810567403, "loss": 11.4567, "step": 483 }, { "epoch": 0.018824810633684352, "grad_norm": 0.1149320974946022, "learning_rate": 0.00014993616519834397, "loss": 11.4384, "step": 484 }, { "epoch": 0.01886370487053081, "grad_norm": 0.3711194396018982, "learning_rate": 0.00014974449821415778, "loss": 11.4758, "step": 485 }, { "epoch": 0.018902599107377266, "grad_norm": 0.1323329359292984, "learning_rate": 0.00014955258808994096, "loss": 11.489, "step": 486 }, { "epoch": 0.01894149334422372, "grad_norm": 0.24308934807777405, "learning_rate": 0.00014936043576370747, "loss": 11.4823, "step": 487 }, { "epoch": 0.018980387581070176, "grad_norm": 0.13251838088035583, "learning_rate": 0.00014916804217465516, "loss": 11.4414, "step": 488 }, { "epoch": 0.01901928181791663, "grad_norm": 0.1579715609550476, "learning_rate": 0.000148975408263161, "loss": 11.4515, "step": 489 }, { "epoch": 0.019058176054763086, "grad_norm": 0.22576065361499786, "learning_rate": 0.00014878253497077663, "loss": 11.5119, "step": 490 }, { "epoch": 0.01909707029160954, "grad_norm": 0.13290148973464966, "learning_rate": 0.00014858942324022384, "loss": 11.5112, "step": 491 }, { "epoch": 0.019135964528455996, "grad_norm": 0.12317763268947601, "learning_rate": 0.0001483960740153897, "loss": 11.4426, "step": 492 }, { "epoch": 0.019174858765302453, "grad_norm": 0.16715236008167267, "learning_rate": 0.0001482024882413222, "loss": 11.4811, "step": 493 }, { "epoch": 0.019213753002148906, "grad_norm": 0.2382892221212387, "learning_rate": 0.00014800866686422547, "loss": 11.4744, "step": 494 }, { "epoch": 0.019252647238995363, "grad_norm": 0.1738983541727066, "learning_rate": 0.00014781461083145526, "loss": 11.4577, "step": 495 }, { "epoch": 0.019291541475841816, "grad_norm": 0.14401042461395264, "learning_rate": 0.00014762032109151427, "loss": 11.4428, "step": 496 }, { "epoch": 0.019330435712688273, "grad_norm": 0.20653113722801208, "learning_rate": 0.00014742579859404744, "loss": 11.4352, "step": 497 }, { "epoch": 0.019369329949534726, "grad_norm": 0.11518168449401855, "learning_rate": 0.00014723104428983745, "loss": 11.4464, "step": 498 }, { "epoch": 0.019408224186381183, "grad_norm": 0.22878611087799072, "learning_rate": 0.00014703605913079997, "loss": 11.4389, "step": 499 }, { "epoch": 0.01944711842322764, "grad_norm": 0.3510136902332306, "learning_rate": 0.00014684084406997903, "loss": 11.4845, "step": 500 }, { "epoch": 0.019486012660074093, "grad_norm": 0.17691604793071747, "learning_rate": 0.00014664540006154237, "loss": 11.4395, "step": 501 }, { "epoch": 0.01952490689692055, "grad_norm": 0.14631284773349762, "learning_rate": 0.00014644972806077683, "loss": 11.512, "step": 502 }, { "epoch": 0.019563801133767003, "grad_norm": 0.16179683804512024, "learning_rate": 0.00014625382902408356, "loss": 11.4485, "step": 503 }, { "epoch": 0.01960269537061346, "grad_norm": 0.15515045821666718, "learning_rate": 0.00014605770390897337, "loss": 11.4329, "step": 504 }, { "epoch": 0.019641589607459913, "grad_norm": 0.10927967727184296, "learning_rate": 0.00014586135367406224, "loss": 11.4433, "step": 505 }, { "epoch": 0.01968048384430637, "grad_norm": 0.0937536209821701, "learning_rate": 0.00014566477927906632, "loss": 11.4613, "step": 506 }, { "epoch": 0.019719378081152827, "grad_norm": 0.09222274273633957, "learning_rate": 0.00014546798168479756, "loss": 11.4234, "step": 507 }, { "epoch": 0.01975827231799928, "grad_norm": 0.20182093977928162, "learning_rate": 0.0001452709618531587, "loss": 11.4662, "step": 508 }, { "epoch": 0.019797166554845737, "grad_norm": 0.09872374683618546, "learning_rate": 0.00014507372074713888, "loss": 11.4806, "step": 509 }, { "epoch": 0.01983606079169219, "grad_norm": 0.2067841738462448, "learning_rate": 0.00014487625933080867, "loss": 11.4736, "step": 510 }, { "epoch": 0.019874955028538647, "grad_norm": 0.07817395031452179, "learning_rate": 0.00014467857856931545, "loss": 11.4249, "step": 511 }, { "epoch": 0.0199138492653851, "grad_norm": 0.1415766328573227, "learning_rate": 0.00014448067942887885, "loss": 11.4739, "step": 512 }, { "epoch": 0.019952743502231557, "grad_norm": 0.14361786842346191, "learning_rate": 0.0001442825628767858, "loss": 11.4741, "step": 513 }, { "epoch": 0.019991637739078014, "grad_norm": 0.18572257459163666, "learning_rate": 0.00014408422988138584, "loss": 11.469, "step": 514 }, { "epoch": 0.020030531975924467, "grad_norm": 0.09005368500947952, "learning_rate": 0.00014388568141208651, "loss": 11.4595, "step": 515 }, { "epoch": 0.020069426212770924, "grad_norm": 0.1343315690755844, "learning_rate": 0.00014368691843934852, "loss": 11.4687, "step": 516 }, { "epoch": 0.020108320449617377, "grad_norm": 0.08616956323385239, "learning_rate": 0.00014348794193468103, "loss": 11.4523, "step": 517 }, { "epoch": 0.020147214686463834, "grad_norm": 0.15260371565818787, "learning_rate": 0.00014328875287063686, "loss": 11.4406, "step": 518 }, { "epoch": 0.020186108923310287, "grad_norm": 0.1616385281085968, "learning_rate": 0.0001430893522208078, "loss": 11.4426, "step": 519 }, { "epoch": 0.020225003160156744, "grad_norm": 0.07586033642292023, "learning_rate": 0.0001428897409598199, "loss": 11.4365, "step": 520 }, { "epoch": 0.0202638973970032, "grad_norm": 0.3101730942726135, "learning_rate": 0.00014268992006332846, "loss": 11.5896, "step": 521 }, { "epoch": 0.020302791633849654, "grad_norm": 0.13230843842029572, "learning_rate": 0.0001424898905080136, "loss": 11.4672, "step": 522 }, { "epoch": 0.02034168587069611, "grad_norm": 0.1998041868209839, "learning_rate": 0.0001422896532715752, "loss": 11.4787, "step": 523 }, { "epoch": 0.020380580107542564, "grad_norm": 0.14576083421707153, "learning_rate": 0.00014208920933272826, "loss": 11.4604, "step": 524 }, { "epoch": 0.02041947434438902, "grad_norm": 0.14003677666187286, "learning_rate": 0.0001418885596711982, "loss": 11.45, "step": 525 }, { "epoch": 0.020458368581235474, "grad_norm": 0.1250571757555008, "learning_rate": 0.00014168770526771585, "loss": 11.4468, "step": 526 }, { "epoch": 0.02049726281808193, "grad_norm": 0.1381315439939499, "learning_rate": 0.00014148664710401278, "loss": 11.4912, "step": 527 }, { "epoch": 0.020536157054928388, "grad_norm": 0.1761142462491989, "learning_rate": 0.0001412853861628166, "loss": 11.4575, "step": 528 }, { "epoch": 0.02057505129177484, "grad_norm": 0.2214204967021942, "learning_rate": 0.00014108392342784587, "loss": 11.4479, "step": 529 }, { "epoch": 0.020613945528621298, "grad_norm": 0.1192215159535408, "learning_rate": 0.00014088225988380566, "loss": 11.4633, "step": 530 }, { "epoch": 0.02065283976546775, "grad_norm": 0.13961835205554962, "learning_rate": 0.00014068039651638246, "loss": 11.4547, "step": 531 }, { "epoch": 0.020691734002314208, "grad_norm": 0.13457316160202026, "learning_rate": 0.00014047833431223938, "loss": 11.4693, "step": 532 }, { "epoch": 0.02073062823916066, "grad_norm": 0.16662298142910004, "learning_rate": 0.0001402760742590116, "loss": 11.4613, "step": 533 }, { "epoch": 0.020769522476007118, "grad_norm": 0.10683729499578476, "learning_rate": 0.00014007361734530115, "loss": 11.4844, "step": 534 }, { "epoch": 0.020808416712853574, "grad_norm": 0.12592609226703644, "learning_rate": 0.00013987096456067236, "loss": 11.4334, "step": 535 }, { "epoch": 0.020847310949700028, "grad_norm": 0.10339102149009705, "learning_rate": 0.0001396681168956469, "loss": 11.4507, "step": 536 }, { "epoch": 0.020886205186546485, "grad_norm": 0.2094239443540573, "learning_rate": 0.00013946507534169905, "loss": 11.4672, "step": 537 }, { "epoch": 0.020925099423392938, "grad_norm": 0.2979714870452881, "learning_rate": 0.0001392618408912506, "loss": 11.4472, "step": 538 }, { "epoch": 0.020963993660239395, "grad_norm": 0.19053438305854797, "learning_rate": 0.00013905841453766638, "loss": 11.4709, "step": 539 }, { "epoch": 0.021002887897085848, "grad_norm": 0.1641671061515808, "learning_rate": 0.00013885479727524915, "loss": 11.528, "step": 540 }, { "epoch": 0.021041782133932305, "grad_norm": 0.27010658383369446, "learning_rate": 0.00013865099009923462, "loss": 11.4699, "step": 541 }, { "epoch": 0.02108067637077876, "grad_norm": 0.18193985521793365, "learning_rate": 0.00013844699400578696, "loss": 11.4778, "step": 542 }, { "epoch": 0.021119570607625215, "grad_norm": 0.25620508193969727, "learning_rate": 0.00013824280999199365, "loss": 11.5205, "step": 543 }, { "epoch": 0.02115846484447167, "grad_norm": 0.17457740008831024, "learning_rate": 0.00013803843905586067, "loss": 11.4563, "step": 544 }, { "epoch": 0.021197359081318125, "grad_norm": 0.12833549082279205, "learning_rate": 0.00013783388219630764, "loss": 11.4535, "step": 545 }, { "epoch": 0.02123625331816458, "grad_norm": 0.15961208939552307, "learning_rate": 0.00013762914041316298, "loss": 11.4438, "step": 546 }, { "epoch": 0.021275147555011035, "grad_norm": 0.1915200799703598, "learning_rate": 0.0001374242147071589, "loss": 11.4228, "step": 547 }, { "epoch": 0.02131404179185749, "grad_norm": 0.3410807251930237, "learning_rate": 0.0001372191060799266, "loss": 11.5251, "step": 548 }, { "epoch": 0.02135293602870395, "grad_norm": 0.164834663271904, "learning_rate": 0.00013701381553399145, "loss": 11.4629, "step": 549 }, { "epoch": 0.0213918302655504, "grad_norm": 0.3576444983482361, "learning_rate": 0.00013680834407276784, "loss": 11.5089, "step": 550 }, { "epoch": 0.02143072450239686, "grad_norm": 0.1609504222869873, "learning_rate": 0.0001366026927005546, "loss": 11.4675, "step": 551 }, { "epoch": 0.021469618739243312, "grad_norm": 0.14706267416477203, "learning_rate": 0.0001363968624225298, "loss": 11.4673, "step": 552 }, { "epoch": 0.02150851297608977, "grad_norm": 0.19631259143352509, "learning_rate": 0.000136190854244746, "loss": 11.4307, "step": 553 }, { "epoch": 0.021547407212936222, "grad_norm": 0.14005130529403687, "learning_rate": 0.00013598466917412534, "loss": 11.5272, "step": 554 }, { "epoch": 0.02158630144978268, "grad_norm": 0.1432911604642868, "learning_rate": 0.0001357783082184545, "loss": 11.4875, "step": 555 }, { "epoch": 0.021625195686629135, "grad_norm": 0.1323319524526596, "learning_rate": 0.00013557177238637986, "loss": 11.4616, "step": 556 }, { "epoch": 0.02166408992347559, "grad_norm": 0.34612029790878296, "learning_rate": 0.0001353650626874026, "loss": 11.4691, "step": 557 }, { "epoch": 0.021702984160322045, "grad_norm": 0.1030290499329567, "learning_rate": 0.00013515818013187377, "loss": 11.4687, "step": 558 }, { "epoch": 0.0217418783971685, "grad_norm": 0.22259925305843353, "learning_rate": 0.00013495112573098914, "loss": 11.4703, "step": 559 }, { "epoch": 0.021780772634014955, "grad_norm": 0.18742729723453522, "learning_rate": 0.00013474390049678453, "loss": 11.5275, "step": 560 }, { "epoch": 0.02181966687086141, "grad_norm": 0.17428499460220337, "learning_rate": 0.00013453650544213076, "loss": 11.4421, "step": 561 }, { "epoch": 0.021858561107707866, "grad_norm": 0.10771626234054565, "learning_rate": 0.00013432894158072872, "loss": 11.4584, "step": 562 }, { "epoch": 0.021897455344554322, "grad_norm": 0.12291907519102097, "learning_rate": 0.00013412120992710425, "loss": 11.4563, "step": 563 }, { "epoch": 0.021936349581400776, "grad_norm": 0.15500085055828094, "learning_rate": 0.0001339133114966035, "loss": 11.4747, "step": 564 }, { "epoch": 0.021975243818247232, "grad_norm": 0.16445863246917725, "learning_rate": 0.00013370524730538767, "loss": 11.4798, "step": 565 }, { "epoch": 0.022014138055093686, "grad_norm": 0.16945667564868927, "learning_rate": 0.00013349701837042817, "loss": 11.4582, "step": 566 }, { "epoch": 0.022053032291940142, "grad_norm": 0.2130759209394455, "learning_rate": 0.00013328862570950175, "loss": 11.4807, "step": 567 }, { "epoch": 0.022091926528786596, "grad_norm": 0.22122079133987427, "learning_rate": 0.0001330800703411853, "loss": 11.4573, "step": 568 }, { "epoch": 0.022130820765633052, "grad_norm": 0.214433491230011, "learning_rate": 0.0001328713532848509, "loss": 11.4302, "step": 569 }, { "epoch": 0.02216971500247951, "grad_norm": 0.10128531605005264, "learning_rate": 0.00013266247556066122, "loss": 11.4357, "step": 570 }, { "epoch": 0.022208609239325963, "grad_norm": 0.23014956712722778, "learning_rate": 0.000132453438189564, "loss": 11.4452, "step": 571 }, { "epoch": 0.02224750347617242, "grad_norm": 0.1748920977115631, "learning_rate": 0.00013224424219328735, "loss": 11.4576, "step": 572 }, { "epoch": 0.022286397713018873, "grad_norm": 0.09783193469047546, "learning_rate": 0.0001320348885943347, "loss": 11.4592, "step": 573 }, { "epoch": 0.02232529194986533, "grad_norm": 0.168840229511261, "learning_rate": 0.00013182537841597989, "loss": 11.457, "step": 574 }, { "epoch": 0.022364186186711783, "grad_norm": 0.13013891875743866, "learning_rate": 0.00013161571268226197, "loss": 11.4647, "step": 575 }, { "epoch": 0.02240308042355824, "grad_norm": 0.14634005725383759, "learning_rate": 0.00013140589241798033, "loss": 11.4808, "step": 576 }, { "epoch": 0.022441974660404696, "grad_norm": 0.1694815754890442, "learning_rate": 0.0001311959186486898, "loss": 11.5279, "step": 577 }, { "epoch": 0.02248086889725115, "grad_norm": 0.09830515831708908, "learning_rate": 0.0001309857924006953, "loss": 11.4687, "step": 578 }, { "epoch": 0.022519763134097606, "grad_norm": 0.17134815454483032, "learning_rate": 0.0001307755147010472, "loss": 11.5015, "step": 579 }, { "epoch": 0.02255865737094406, "grad_norm": 0.13167834281921387, "learning_rate": 0.0001305650865775361, "loss": 11.4763, "step": 580 }, { "epoch": 0.022597551607790516, "grad_norm": 0.37757420539855957, "learning_rate": 0.00013035450905868773, "loss": 11.4619, "step": 581 }, { "epoch": 0.02263644584463697, "grad_norm": 0.2738214433193207, "learning_rate": 0.00013014378317375818, "loss": 11.471, "step": 582 }, { "epoch": 0.022675340081483426, "grad_norm": 0.08914168179035187, "learning_rate": 0.00012993290995272862, "loss": 11.4365, "step": 583 }, { "epoch": 0.022714234318329883, "grad_norm": 0.15012338757514954, "learning_rate": 0.00012972189042630044, "loss": 11.4608, "step": 584 }, { "epoch": 0.022753128555176336, "grad_norm": 0.22025415301322937, "learning_rate": 0.00012951072562589005, "loss": 11.4312, "step": 585 }, { "epoch": 0.022792022792022793, "grad_norm": 0.1591462343931198, "learning_rate": 0.000129299416583624, "loss": 11.4272, "step": 586 }, { "epoch": 0.022830917028869246, "grad_norm": 0.12218027561903, "learning_rate": 0.0001290879643323338, "loss": 11.449, "step": 587 }, { "epoch": 0.022869811265715703, "grad_norm": 0.15239384770393372, "learning_rate": 0.00012887636990555098, "loss": 11.4613, "step": 588 }, { "epoch": 0.022908705502562157, "grad_norm": 0.20182852447032928, "learning_rate": 0.00012866463433750205, "loss": 11.4654, "step": 589 }, { "epoch": 0.022947599739408613, "grad_norm": 0.19629980623722076, "learning_rate": 0.00012845275866310324, "loss": 11.4934, "step": 590 }, { "epoch": 0.02298649397625507, "grad_norm": 0.15712539851665497, "learning_rate": 0.0001282407439179557, "loss": 11.543, "step": 591 }, { "epoch": 0.023025388213101523, "grad_norm": 0.10479318350553513, "learning_rate": 0.00012802859113834035, "loss": 11.4756, "step": 592 }, { "epoch": 0.02306428244994798, "grad_norm": 0.11372707784175873, "learning_rate": 0.00012781630136121262, "loss": 11.4431, "step": 593 }, { "epoch": 0.023103176686794433, "grad_norm": 0.1052485853433609, "learning_rate": 0.00012760387562419772, "loss": 11.4533, "step": 594 }, { "epoch": 0.02314207092364089, "grad_norm": 0.28067412972450256, "learning_rate": 0.00012739131496558535, "loss": 11.4839, "step": 595 }, { "epoch": 0.023180965160487343, "grad_norm": 0.09815080463886261, "learning_rate": 0.00012717862042432472, "loss": 11.4441, "step": 596 }, { "epoch": 0.0232198593973338, "grad_norm": 0.10377902537584305, "learning_rate": 0.00012696579304001933, "loss": 11.447, "step": 597 }, { "epoch": 0.023258753634180257, "grad_norm": 0.1069532036781311, "learning_rate": 0.00012675283385292212, "loss": 11.4245, "step": 598 }, { "epoch": 0.02329764787102671, "grad_norm": 0.1295267790555954, "learning_rate": 0.00012653974390393008, "loss": 11.4429, "step": 599 }, { "epoch": 0.023336542107873167, "grad_norm": 0.15152481198310852, "learning_rate": 0.00012632652423457948, "loss": 11.4667, "step": 600 }, { "epoch": 0.02337543634471962, "grad_norm": 0.17180782556533813, "learning_rate": 0.00012611317588704057, "loss": 11.4346, "step": 601 }, { "epoch": 0.023414330581566077, "grad_norm": 0.16724589467048645, "learning_rate": 0.00012589969990411257, "loss": 11.4887, "step": 602 }, { "epoch": 0.02345322481841253, "grad_norm": 0.09064344316720963, "learning_rate": 0.0001256860973292186, "loss": 11.4553, "step": 603 }, { "epoch": 0.023492119055258987, "grad_norm": 0.134121835231781, "learning_rate": 0.00012547236920640043, "loss": 11.47, "step": 604 }, { "epoch": 0.023531013292105444, "grad_norm": 0.13513854146003723, "learning_rate": 0.00012525851658031352, "loss": 11.4928, "step": 605 }, { "epoch": 0.023569907528951897, "grad_norm": 0.1374712586402893, "learning_rate": 0.0001250445404962219, "loss": 11.4613, "step": 606 }, { "epoch": 0.023608801765798354, "grad_norm": 0.1524314433336258, "learning_rate": 0.00012483044199999307, "loss": 11.4366, "step": 607 }, { "epoch": 0.023647696002644807, "grad_norm": 0.25216957926750183, "learning_rate": 0.00012461622213809275, "loss": 11.4638, "step": 608 }, { "epoch": 0.023686590239491264, "grad_norm": 0.1467767208814621, "learning_rate": 0.00012440188195757997, "loss": 11.4825, "step": 609 }, { "epoch": 0.023725484476337717, "grad_norm": 0.10582228749990463, "learning_rate": 0.00012418742250610174, "loss": 11.4393, "step": 610 }, { "epoch": 0.023764378713184174, "grad_norm": 0.12060561031103134, "learning_rate": 0.00012397284483188817, "loss": 11.4632, "step": 611 }, { "epoch": 0.02380327295003063, "grad_norm": 0.09976198524236679, "learning_rate": 0.00012375814998374712, "loss": 11.4355, "step": 612 }, { "epoch": 0.023842167186877084, "grad_norm": 0.1368313729763031, "learning_rate": 0.0001235433390110592, "loss": 11.4946, "step": 613 }, { "epoch": 0.02388106142372354, "grad_norm": 0.11875221133232117, "learning_rate": 0.00012332841296377266, "loss": 11.4815, "step": 614 }, { "epoch": 0.023919955660569994, "grad_norm": 0.14994442462921143, "learning_rate": 0.00012311337289239806, "loss": 11.4628, "step": 615 }, { "epoch": 0.02395884989741645, "grad_norm": 0.28800103068351746, "learning_rate": 0.00012289821984800346, "loss": 11.4616, "step": 616 }, { "epoch": 0.023997744134262904, "grad_norm": 0.11221929639577866, "learning_rate": 0.00012268295488220907, "loss": 11.5016, "step": 617 }, { "epoch": 0.02403663837110936, "grad_norm": 0.20300208032131195, "learning_rate": 0.000122467579047182, "loss": 11.4818, "step": 618 }, { "epoch": 0.024075532607955818, "grad_norm": 0.1258939653635025, "learning_rate": 0.00012225209339563145, "loss": 11.4389, "step": 619 }, { "epoch": 0.02411442684480227, "grad_norm": 0.10672712326049805, "learning_rate": 0.0001220364989808033, "loss": 11.4435, "step": 620 }, { "epoch": 0.024153321081648728, "grad_norm": 0.14902956783771515, "learning_rate": 0.00012182079685647498, "loss": 11.5028, "step": 621 }, { "epoch": 0.02419221531849518, "grad_norm": 0.15215426683425903, "learning_rate": 0.00012160498807695054, "loss": 11.4287, "step": 622 }, { "epoch": 0.024231109555341638, "grad_norm": 0.21524302661418915, "learning_rate": 0.00012138907369705519, "loss": 11.4368, "step": 623 }, { "epoch": 0.02427000379218809, "grad_norm": 0.12461920082569122, "learning_rate": 0.0001211730547721303, "loss": 11.4415, "step": 624 }, { "epoch": 0.024308898029034548, "grad_norm": 0.14802490174770355, "learning_rate": 0.00012095693235802835, "loss": 11.4592, "step": 625 }, { "epoch": 0.024347792265881005, "grad_norm": 0.11762988567352295, "learning_rate": 0.00012074070751110751, "loss": 11.4407, "step": 626 }, { "epoch": 0.024386686502727458, "grad_norm": 0.13906416296958923, "learning_rate": 0.00012052438128822674, "loss": 11.4441, "step": 627 }, { "epoch": 0.024425580739573915, "grad_norm": 0.10678276419639587, "learning_rate": 0.00012030795474674042, "loss": 11.431, "step": 628 }, { "epoch": 0.024464474976420368, "grad_norm": 0.15330135822296143, "learning_rate": 0.0001200914289444933, "loss": 11.4387, "step": 629 }, { "epoch": 0.024503369213266825, "grad_norm": 0.0899004265666008, "learning_rate": 0.00011987480493981527, "loss": 11.4281, "step": 630 }, { "epoch": 0.024542263450113278, "grad_norm": 0.1445421576499939, "learning_rate": 0.00011965808379151625, "loss": 11.4977, "step": 631 }, { "epoch": 0.024581157686959735, "grad_norm": 0.2019146978855133, "learning_rate": 0.00011944126655888095, "loss": 11.4775, "step": 632 }, { "epoch": 0.02462005192380619, "grad_norm": 0.12725169956684113, "learning_rate": 0.0001192243543016637, "loss": 11.4343, "step": 633 }, { "epoch": 0.024658946160652645, "grad_norm": 0.12219946831464767, "learning_rate": 0.00011900734808008333, "loss": 11.4318, "step": 634 }, { "epoch": 0.024697840397499102, "grad_norm": 0.3171999454498291, "learning_rate": 0.00011879024895481799, "loss": 11.4481, "step": 635 }, { "epoch": 0.024736734634345555, "grad_norm": 0.11464212089776993, "learning_rate": 0.00011857305798699976, "loss": 11.5213, "step": 636 }, { "epoch": 0.024775628871192012, "grad_norm": 0.09886327385902405, "learning_rate": 0.00011835577623820979, "loss": 11.436, "step": 637 }, { "epoch": 0.024814523108038465, "grad_norm": 0.1506219506263733, "learning_rate": 0.00011813840477047289, "loss": 11.4729, "step": 638 }, { "epoch": 0.024853417344884922, "grad_norm": 0.07243680208921432, "learning_rate": 0.00011792094464625232, "loss": 11.4474, "step": 639 }, { "epoch": 0.02489231158173138, "grad_norm": 0.15325765311717987, "learning_rate": 0.00011770339692844483, "loss": 11.4927, "step": 640 }, { "epoch": 0.024931205818577832, "grad_norm": 0.10868734866380692, "learning_rate": 0.00011748576268037524, "loss": 11.4738, "step": 641 }, { "epoch": 0.02497010005542429, "grad_norm": 0.1233552023768425, "learning_rate": 0.00011726804296579117, "loss": 11.5065, "step": 642 }, { "epoch": 0.025008994292270742, "grad_norm": 0.17249184846878052, "learning_rate": 0.00011705023884885821, "loss": 11.4504, "step": 643 }, { "epoch": 0.0250478885291172, "grad_norm": 0.08547560125589371, "learning_rate": 0.00011683235139415436, "loss": 11.4288, "step": 644 }, { "epoch": 0.025086782765963652, "grad_norm": 0.10097157210111618, "learning_rate": 0.00011661438166666497, "loss": 11.4405, "step": 645 }, { "epoch": 0.02512567700281011, "grad_norm": 0.19517868757247925, "learning_rate": 0.00011639633073177754, "loss": 11.4526, "step": 646 }, { "epoch": 0.025164571239656566, "grad_norm": 0.16102519631385803, "learning_rate": 0.0001161781996552765, "loss": 11.5228, "step": 647 }, { "epoch": 0.02520346547650302, "grad_norm": 0.10687874257564545, "learning_rate": 0.00011595998950333793, "loss": 11.4531, "step": 648 }, { "epoch": 0.025242359713349476, "grad_norm": 0.08028004318475723, "learning_rate": 0.00011574170134252452, "loss": 11.4638, "step": 649 }, { "epoch": 0.02528125395019593, "grad_norm": 0.17443528771400452, "learning_rate": 0.00011552333623978017, "loss": 11.472, "step": 650 }, { "epoch": 0.025320148187042386, "grad_norm": 0.2097606062889099, "learning_rate": 0.0001153048952624249, "loss": 11.4881, "step": 651 }, { "epoch": 0.02535904242388884, "grad_norm": 0.101004958152771, "learning_rate": 0.00011508637947814951, "loss": 11.4213, "step": 652 }, { "epoch": 0.025397936660735296, "grad_norm": 0.12250139564275742, "learning_rate": 0.00011486778995501057, "loss": 11.4527, "step": 653 }, { "epoch": 0.025436830897581753, "grad_norm": 0.17433437705039978, "learning_rate": 0.00011464912776142494, "loss": 11.475, "step": 654 }, { "epoch": 0.025475725134428206, "grad_norm": 0.08881186693906784, "learning_rate": 0.00011443039396616475, "loss": 11.4489, "step": 655 }, { "epoch": 0.025514619371274663, "grad_norm": 0.14085331559181213, "learning_rate": 0.00011421158963835208, "loss": 11.4693, "step": 656 }, { "epoch": 0.025553513608121116, "grad_norm": 0.1215318888425827, "learning_rate": 0.00011399271584745381, "loss": 11.449, "step": 657 }, { "epoch": 0.025592407844967573, "grad_norm": 0.07946611195802689, "learning_rate": 0.0001137737736632762, "loss": 11.4205, "step": 658 }, { "epoch": 0.025631302081814026, "grad_norm": 0.10181257873773575, "learning_rate": 0.00011355476415595998, "loss": 11.4513, "step": 659 }, { "epoch": 0.025670196318660483, "grad_norm": 0.10591235756874084, "learning_rate": 0.00011333568839597481, "loss": 11.4448, "step": 660 }, { "epoch": 0.02570909055550694, "grad_norm": 0.3380816876888275, "learning_rate": 0.00011311654745411425, "loss": 11.4624, "step": 661 }, { "epoch": 0.025747984792353393, "grad_norm": 0.21232765913009644, "learning_rate": 0.0001128973424014904, "loss": 11.4867, "step": 662 }, { "epoch": 0.02578687902919985, "grad_norm": 0.267391175031662, "learning_rate": 0.00011267807430952877, "loss": 11.5121, "step": 663 }, { "epoch": 0.025825773266046303, "grad_norm": 0.13013480603694916, "learning_rate": 0.00011245874424996293, "loss": 11.4034, "step": 664 }, { "epoch": 0.02586466750289276, "grad_norm": 0.09451703727245331, "learning_rate": 0.00011223935329482942, "loss": 11.4235, "step": 665 }, { "epoch": 0.025903561739739213, "grad_norm": 0.14881646633148193, "learning_rate": 0.00011201990251646237, "loss": 11.4617, "step": 666 }, { "epoch": 0.02594245597658567, "grad_norm": 0.07941895723342896, "learning_rate": 0.0001118003929874883, "loss": 11.4343, "step": 667 }, { "epoch": 0.025981350213432126, "grad_norm": 0.15021832287311554, "learning_rate": 0.00011158082578082089, "loss": 11.4173, "step": 668 }, { "epoch": 0.02602024445027858, "grad_norm": 0.07973194122314453, "learning_rate": 0.00011136120196965582, "loss": 11.452, "step": 669 }, { "epoch": 0.026059138687125036, "grad_norm": 0.08654632419347763, "learning_rate": 0.00011114152262746528, "loss": 11.4528, "step": 670 }, { "epoch": 0.02609803292397149, "grad_norm": 0.12764911353588104, "learning_rate": 0.00011092178882799308, "loss": 11.4542, "step": 671 }, { "epoch": 0.026136927160817947, "grad_norm": 0.11200135201215744, "learning_rate": 0.00011070200164524907, "loss": 11.442, "step": 672 }, { "epoch": 0.0261758213976644, "grad_norm": 0.12071491032838821, "learning_rate": 0.00011048216215350402, "loss": 11.4551, "step": 673 }, { "epoch": 0.026214715634510857, "grad_norm": 0.07602666318416595, "learning_rate": 0.00011026227142728443, "loss": 11.4547, "step": 674 }, { "epoch": 0.026253609871357313, "grad_norm": 0.17223294079303741, "learning_rate": 0.00011004233054136725, "loss": 11.4654, "step": 675 }, { "epoch": 0.026292504108203767, "grad_norm": 0.11711843311786652, "learning_rate": 0.0001098223405707745, "loss": 11.4441, "step": 676 }, { "epoch": 0.026331398345050223, "grad_norm": 0.16260488331317902, "learning_rate": 0.00010960230259076818, "loss": 11.4418, "step": 677 }, { "epoch": 0.026370292581896677, "grad_norm": 0.14485377073287964, "learning_rate": 0.00010938221767684498, "loss": 11.4393, "step": 678 }, { "epoch": 0.026409186818743133, "grad_norm": 0.15744422376155853, "learning_rate": 0.00010916208690473091, "loss": 11.4608, "step": 679 }, { "epoch": 0.026448081055589587, "grad_norm": 0.13384109735488892, "learning_rate": 0.00010894191135037619, "loss": 11.4703, "step": 680 }, { "epoch": 0.026486975292436044, "grad_norm": 0.2098294049501419, "learning_rate": 0.00010872169208994992, "loss": 11.4493, "step": 681 }, { "epoch": 0.026525869529282497, "grad_norm": 0.15324105322360992, "learning_rate": 0.00010850143019983474, "loss": 11.4974, "step": 682 }, { "epoch": 0.026564763766128954, "grad_norm": 0.12925155460834503, "learning_rate": 0.00010828112675662176, "loss": 11.4796, "step": 683 }, { "epoch": 0.02660365800297541, "grad_norm": 0.16498637199401855, "learning_rate": 0.00010806078283710522, "loss": 11.4605, "step": 684 }, { "epoch": 0.026642552239821864, "grad_norm": 0.16534031927585602, "learning_rate": 0.00010784039951827702, "loss": 11.4634, "step": 685 }, { "epoch": 0.02668144647666832, "grad_norm": 0.2693954110145569, "learning_rate": 0.00010761997787732183, "loss": 11.4592, "step": 686 }, { "epoch": 0.026720340713514774, "grad_norm": 0.14219005405902863, "learning_rate": 0.00010739951899161153, "loss": 11.4712, "step": 687 }, { "epoch": 0.02675923495036123, "grad_norm": 0.1442575603723526, "learning_rate": 0.00010717902393870007, "loss": 11.451, "step": 688 }, { "epoch": 0.026798129187207684, "grad_norm": 0.14794135093688965, "learning_rate": 0.00010695849379631813, "loss": 11.4786, "step": 689 }, { "epoch": 0.02683702342405414, "grad_norm": 0.10855800658464432, "learning_rate": 0.000106737929642368, "loss": 11.418, "step": 690 }, { "epoch": 0.026875917660900597, "grad_norm": 0.0762091651558876, "learning_rate": 0.00010651733255491809, "loss": 11.4227, "step": 691 }, { "epoch": 0.02691481189774705, "grad_norm": 0.1832919418811798, "learning_rate": 0.0001062967036121979, "loss": 11.4328, "step": 692 }, { "epoch": 0.026953706134593507, "grad_norm": 0.1116456463932991, "learning_rate": 0.00010607604389259256, "loss": 11.4808, "step": 693 }, { "epoch": 0.02699260037143996, "grad_norm": 0.15306049585342407, "learning_rate": 0.00010585535447463761, "loss": 11.4594, "step": 694 }, { "epoch": 0.027031494608286417, "grad_norm": 0.13958840072155, "learning_rate": 0.0001056346364370138, "loss": 11.4539, "step": 695 }, { "epoch": 0.02707038884513287, "grad_norm": 0.11846836656332016, "learning_rate": 0.00010541389085854176, "loss": 11.4531, "step": 696 }, { "epoch": 0.027109283081979327, "grad_norm": 0.2525574862957001, "learning_rate": 0.00010519311881817673, "loss": 11.6172, "step": 697 }, { "epoch": 0.027148177318825784, "grad_norm": 0.13093318045139313, "learning_rate": 0.00010497232139500329, "loss": 11.437, "step": 698 }, { "epoch": 0.027187071555672238, "grad_norm": 0.14582161605358124, "learning_rate": 0.00010475149966823004, "loss": 11.4334, "step": 699 }, { "epoch": 0.027225965792518694, "grad_norm": 0.0936809629201889, "learning_rate": 0.00010453065471718444, "loss": 11.4259, "step": 700 }, { "epoch": 0.027264860029365148, "grad_norm": 0.1881120502948761, "learning_rate": 0.00010430978762130743, "loss": 11.4332, "step": 701 }, { "epoch": 0.027303754266211604, "grad_norm": 0.14588621258735657, "learning_rate": 0.00010408889946014819, "loss": 11.4863, "step": 702 }, { "epoch": 0.027342648503058058, "grad_norm": 0.1064009740948677, "learning_rate": 0.00010386799131335889, "loss": 11.4396, "step": 703 }, { "epoch": 0.027381542739904514, "grad_norm": 0.12732961773872375, "learning_rate": 0.0001036470642606893, "loss": 11.4598, "step": 704 }, { "epoch": 0.02742043697675097, "grad_norm": 0.1523655652999878, "learning_rate": 0.00010342611938198174, "loss": 11.4805, "step": 705 }, { "epoch": 0.027459331213597424, "grad_norm": 0.15525731444358826, "learning_rate": 0.00010320515775716555, "loss": 11.4902, "step": 706 }, { "epoch": 0.02749822545044388, "grad_norm": 0.08067545294761658, "learning_rate": 0.00010298418046625189, "loss": 11.4337, "step": 707 }, { "epoch": 0.027537119687290335, "grad_norm": 0.19358405470848083, "learning_rate": 0.00010276318858932863, "loss": 11.4824, "step": 708 }, { "epoch": 0.02757601392413679, "grad_norm": 0.155364990234375, "learning_rate": 0.00010254218320655482, "loss": 11.4997, "step": 709 }, { "epoch": 0.027614908160983245, "grad_norm": 0.10002454370260239, "learning_rate": 0.00010232116539815558, "loss": 11.4413, "step": 710 }, { "epoch": 0.0276538023978297, "grad_norm": 0.12226614356040955, "learning_rate": 0.0001021001362444167, "loss": 11.4367, "step": 711 }, { "epoch": 0.027692696634676158, "grad_norm": 0.11758933961391449, "learning_rate": 0.00010187909682567956, "loss": 11.4641, "step": 712 }, { "epoch": 0.02773159087152261, "grad_norm": 0.27798619866371155, "learning_rate": 0.00010165804822233549, "loss": 11.438, "step": 713 }, { "epoch": 0.027770485108369068, "grad_norm": 0.2517690360546112, "learning_rate": 0.00010143699151482094, "loss": 11.5115, "step": 714 }, { "epoch": 0.02780937934521552, "grad_norm": 0.12701858580112457, "learning_rate": 0.00010121592778361184, "loss": 11.4634, "step": 715 }, { "epoch": 0.027848273582061978, "grad_norm": 0.10672388970851898, "learning_rate": 0.00010099485810921848, "loss": 11.4631, "step": 716 }, { "epoch": 0.027848273582061978, "eval_loss": 11.487701416015625, "eval_runtime": 136.741, "eval_samples_per_second": 79.172, "eval_steps_per_second": 39.586, "step": 716 }, { "epoch": 0.02788716781890843, "grad_norm": 0.08253801614046097, "learning_rate": 0.00010077378357218021, "loss": 11.4232, "step": 717 }, { "epoch": 0.02792606205575489, "grad_norm": 0.13256096839904785, "learning_rate": 0.00010055270525306016, "loss": 11.4315, "step": 718 }, { "epoch": 0.027964956292601345, "grad_norm": 0.17937391996383667, "learning_rate": 0.00010033162423243987, "loss": 11.4851, "step": 719 }, { "epoch": 0.0280038505294478, "grad_norm": 0.20057813823223114, "learning_rate": 0.0001001105415909142, "loss": 11.5311, "step": 720 }, { "epoch": 0.028042744766294255, "grad_norm": 0.08090359717607498, "learning_rate": 9.988945840908583e-05, "loss": 11.4295, "step": 721 }, { "epoch": 0.02808163900314071, "grad_norm": 0.082366943359375, "learning_rate": 9.966837576756016e-05, "loss": 11.4418, "step": 722 }, { "epoch": 0.028120533239987165, "grad_norm": 0.15174053609371185, "learning_rate": 9.944729474693987e-05, "loss": 11.4378, "step": 723 }, { "epoch": 0.02815942747683362, "grad_norm": 0.09956763684749603, "learning_rate": 9.92262164278198e-05, "loss": 11.4256, "step": 724 }, { "epoch": 0.028198321713680075, "grad_norm": 0.06715458631515503, "learning_rate": 9.900514189078155e-05, "loss": 11.444, "step": 725 }, { "epoch": 0.028237215950526532, "grad_norm": 0.10695479065179825, "learning_rate": 9.878407221638816e-05, "loss": 11.4519, "step": 726 }, { "epoch": 0.028276110187372985, "grad_norm": 0.08852813392877579, "learning_rate": 9.85630084851791e-05, "loss": 11.4474, "step": 727 }, { "epoch": 0.028315004424219442, "grad_norm": 0.1369630992412567, "learning_rate": 9.834195177766452e-05, "loss": 11.4339, "step": 728 }, { "epoch": 0.028353898661065895, "grad_norm": 0.09969980269670486, "learning_rate": 9.81209031743205e-05, "loss": 11.4579, "step": 729 }, { "epoch": 0.028392792897912352, "grad_norm": 0.15188638865947723, "learning_rate": 9.789986375558331e-05, "loss": 11.5502, "step": 730 }, { "epoch": 0.028431687134758805, "grad_norm": 0.08629871159791946, "learning_rate": 9.767883460184443e-05, "loss": 11.4381, "step": 731 }, { "epoch": 0.028470581371605262, "grad_norm": 0.19648370146751404, "learning_rate": 9.74578167934452e-05, "loss": 11.4772, "step": 732 }, { "epoch": 0.02850947560845172, "grad_norm": 0.10891472548246384, "learning_rate": 9.723681141067139e-05, "loss": 11.4388, "step": 733 }, { "epoch": 0.028548369845298172, "grad_norm": 0.1772705316543579, "learning_rate": 9.701581953374815e-05, "loss": 11.4936, "step": 734 }, { "epoch": 0.02858726408214463, "grad_norm": 0.29078036546707153, "learning_rate": 9.679484224283449e-05, "loss": 11.5355, "step": 735 }, { "epoch": 0.028626158318991082, "grad_norm": 0.10268472880125046, "learning_rate": 9.657388061801828e-05, "loss": 11.4682, "step": 736 }, { "epoch": 0.02866505255583754, "grad_norm": 0.11252088099718094, "learning_rate": 9.635293573931072e-05, "loss": 11.4544, "step": 737 }, { "epoch": 0.028703946792683992, "grad_norm": 0.0970296636223793, "learning_rate": 9.613200868664112e-05, "loss": 11.4352, "step": 738 }, { "epoch": 0.02874284102953045, "grad_norm": 0.13067781925201416, "learning_rate": 9.591110053985182e-05, "loss": 11.4347, "step": 739 }, { "epoch": 0.028781735266376906, "grad_norm": 0.17787784337997437, "learning_rate": 9.569021237869258e-05, "loss": 11.4483, "step": 740 }, { "epoch": 0.02882062950322336, "grad_norm": 0.15255478024482727, "learning_rate": 9.54693452828156e-05, "loss": 11.4333, "step": 741 }, { "epoch": 0.028859523740069816, "grad_norm": 0.09174659103155136, "learning_rate": 9.524850033177e-05, "loss": 11.4265, "step": 742 }, { "epoch": 0.02889841797691627, "grad_norm": 0.1348842978477478, "learning_rate": 9.502767860499672e-05, "loss": 11.4743, "step": 743 }, { "epoch": 0.028937312213762726, "grad_norm": 0.17099758982658386, "learning_rate": 9.48068811818233e-05, "loss": 11.4782, "step": 744 }, { "epoch": 0.02897620645060918, "grad_norm": 0.0950162261724472, "learning_rate": 9.458610914145826e-05, "loss": 11.4753, "step": 745 }, { "epoch": 0.029015100687455636, "grad_norm": 0.11666002869606018, "learning_rate": 9.436536356298624e-05, "loss": 11.4198, "step": 746 }, { "epoch": 0.029053994924302093, "grad_norm": 0.1315421611070633, "learning_rate": 9.414464552536242e-05, "loss": 11.4877, "step": 747 }, { "epoch": 0.029092889161148546, "grad_norm": 0.15513554215431213, "learning_rate": 9.39239561074075e-05, "loss": 11.4861, "step": 748 }, { "epoch": 0.029131783397995003, "grad_norm": 0.1369619071483612, "learning_rate": 9.370329638780213e-05, "loss": 11.5185, "step": 749 }, { "epoch": 0.029170677634841456, "grad_norm": 0.10951586067676544, "learning_rate": 9.348266744508191e-05, "loss": 11.4876, "step": 750 }, { "epoch": 0.029209571871687913, "grad_norm": 0.0937129557132721, "learning_rate": 9.326207035763202e-05, "loss": 11.4312, "step": 751 }, { "epoch": 0.029248466108534366, "grad_norm": 0.1251746565103531, "learning_rate": 9.304150620368188e-05, "loss": 11.4397, "step": 752 }, { "epoch": 0.029287360345380823, "grad_norm": 0.13234980404376984, "learning_rate": 9.282097606129998e-05, "loss": 11.4528, "step": 753 }, { "epoch": 0.02932625458222728, "grad_norm": 0.16654668748378754, "learning_rate": 9.260048100838848e-05, "loss": 11.4857, "step": 754 }, { "epoch": 0.029365148819073733, "grad_norm": 0.40939077734947205, "learning_rate": 9.238002212267821e-05, "loss": 11.4327, "step": 755 }, { "epoch": 0.02940404305592019, "grad_norm": 0.14264138042926788, "learning_rate": 9.215960048172299e-05, "loss": 11.4855, "step": 756 }, { "epoch": 0.029442937292766643, "grad_norm": 0.14503702521324158, "learning_rate": 9.193921716289482e-05, "loss": 11.4595, "step": 757 }, { "epoch": 0.0294818315296131, "grad_norm": 0.11121979355812073, "learning_rate": 9.171887324337826e-05, "loss": 11.4433, "step": 758 }, { "epoch": 0.029520725766459553, "grad_norm": 0.11435031145811081, "learning_rate": 9.149856980016529e-05, "loss": 11.4518, "step": 759 }, { "epoch": 0.02955962000330601, "grad_norm": 0.21833956241607666, "learning_rate": 9.127830791005015e-05, "loss": 11.452, "step": 760 }, { "epoch": 0.029598514240152467, "grad_norm": 0.12180177867412567, "learning_rate": 9.105808864962384e-05, "loss": 11.4448, "step": 761 }, { "epoch": 0.02963740847699892, "grad_norm": 0.13487689197063446, "learning_rate": 9.083791309526908e-05, "loss": 11.4377, "step": 762 }, { "epoch": 0.029676302713845377, "grad_norm": 0.17410726845264435, "learning_rate": 9.061778232315505e-05, "loss": 11.441, "step": 763 }, { "epoch": 0.02971519695069183, "grad_norm": 0.12577226758003235, "learning_rate": 9.039769740923183e-05, "loss": 11.4545, "step": 764 }, { "epoch": 0.029754091187538287, "grad_norm": 0.10234140604734421, "learning_rate": 9.017765942922554e-05, "loss": 11.4476, "step": 765 }, { "epoch": 0.02979298542438474, "grad_norm": 0.17257264256477356, "learning_rate": 8.995766945863277e-05, "loss": 11.4764, "step": 766 }, { "epoch": 0.029831879661231197, "grad_norm": 0.1410018652677536, "learning_rate": 8.973772857271558e-05, "loss": 11.4282, "step": 767 }, { "epoch": 0.029870773898077654, "grad_norm": 0.11423639208078384, "learning_rate": 8.951783784649602e-05, "loss": 11.4617, "step": 768 }, { "epoch": 0.029909668134924107, "grad_norm": 0.10588809102773666, "learning_rate": 8.929799835475093e-05, "loss": 11.4645, "step": 769 }, { "epoch": 0.029948562371770564, "grad_norm": 0.18268363177776337, "learning_rate": 8.907821117200694e-05, "loss": 11.4531, "step": 770 }, { "epoch": 0.029987456608617017, "grad_norm": 0.10537099093198776, "learning_rate": 8.88584773725347e-05, "loss": 11.4033, "step": 771 }, { "epoch": 0.030026350845463474, "grad_norm": 0.13354121148586273, "learning_rate": 8.863879803034421e-05, "loss": 11.4562, "step": 772 }, { "epoch": 0.030065245082309927, "grad_norm": 0.09870657324790955, "learning_rate": 8.841917421917912e-05, "loss": 11.4425, "step": 773 }, { "epoch": 0.030104139319156384, "grad_norm": 0.08798401057720184, "learning_rate": 8.819960701251175e-05, "loss": 11.4367, "step": 774 }, { "epoch": 0.03014303355600284, "grad_norm": 0.139079287648201, "learning_rate": 8.798009748353765e-05, "loss": 11.4877, "step": 775 }, { "epoch": 0.030181927792849294, "grad_norm": 0.1492471843957901, "learning_rate": 8.776064670517059e-05, "loss": 11.4751, "step": 776 }, { "epoch": 0.03022082202969575, "grad_norm": 0.15318681299686432, "learning_rate": 8.754125575003708e-05, "loss": 11.4705, "step": 777 }, { "epoch": 0.030259716266542204, "grad_norm": 0.14461193978786469, "learning_rate": 8.732192569047126e-05, "loss": 11.4745, "step": 778 }, { "epoch": 0.03029861050338866, "grad_norm": 0.14277328550815582, "learning_rate": 8.710265759850963e-05, "loss": 11.4334, "step": 779 }, { "epoch": 0.030337504740235114, "grad_norm": 0.09315398335456848, "learning_rate": 8.688345254588578e-05, "loss": 11.4297, "step": 780 }, { "epoch": 0.03037639897708157, "grad_norm": 0.08005926758050919, "learning_rate": 8.666431160402518e-05, "loss": 11.4431, "step": 781 }, { "epoch": 0.030415293213928028, "grad_norm": 0.10046279430389404, "learning_rate": 8.644523584404003e-05, "loss": 11.4787, "step": 782 }, { "epoch": 0.03045418745077448, "grad_norm": 0.14663808047771454, "learning_rate": 8.62262263367238e-05, "loss": 11.4878, "step": 783 }, { "epoch": 0.030493081687620938, "grad_norm": 0.17167599499225616, "learning_rate": 8.600728415254624e-05, "loss": 11.5243, "step": 784 }, { "epoch": 0.03053197592446739, "grad_norm": 0.11615641415119171, "learning_rate": 8.578841036164794e-05, "loss": 11.4355, "step": 785 }, { "epoch": 0.030570870161313848, "grad_norm": 0.08393886685371399, "learning_rate": 8.55696060338353e-05, "loss": 11.4553, "step": 786 }, { "epoch": 0.0306097643981603, "grad_norm": 0.16598129272460938, "learning_rate": 8.535087223857508e-05, "loss": 11.446, "step": 787 }, { "epoch": 0.030648658635006758, "grad_norm": 0.1524801105260849, "learning_rate": 8.513221004498946e-05, "loss": 11.4439, "step": 788 }, { "epoch": 0.030687552871853215, "grad_norm": 0.1081952229142189, "learning_rate": 8.491362052185053e-05, "loss": 11.4627, "step": 789 }, { "epoch": 0.030726447108699668, "grad_norm": 0.13826072216033936, "learning_rate": 8.469510473757513e-05, "loss": 11.4835, "step": 790 }, { "epoch": 0.030765341345546125, "grad_norm": 0.1568002849817276, "learning_rate": 8.447666376021985e-05, "loss": 11.4659, "step": 791 }, { "epoch": 0.030804235582392578, "grad_norm": 0.09875816106796265, "learning_rate": 8.425829865747549e-05, "loss": 11.4209, "step": 792 }, { "epoch": 0.030843129819239035, "grad_norm": 0.18653692305088043, "learning_rate": 8.404001049666211e-05, "loss": 11.4606, "step": 793 }, { "epoch": 0.030882024056085488, "grad_norm": 0.19002626836299896, "learning_rate": 8.382180034472353e-05, "loss": 11.4729, "step": 794 }, { "epoch": 0.030920918292931945, "grad_norm": 0.09468155354261398, "learning_rate": 8.360366926822247e-05, "loss": 11.4465, "step": 795 }, { "epoch": 0.0309598125297784, "grad_norm": 0.12331503629684448, "learning_rate": 8.338561833333506e-05, "loss": 11.4442, "step": 796 }, { "epoch": 0.030998706766624855, "grad_norm": 0.1488187164068222, "learning_rate": 8.316764860584567e-05, "loss": 11.4704, "step": 797 }, { "epoch": 0.03103760100347131, "grad_norm": 0.08257905393838882, "learning_rate": 8.294976115114184e-05, "loss": 11.4504, "step": 798 }, { "epoch": 0.031076495240317765, "grad_norm": 0.11658480763435364, "learning_rate": 8.273195703420884e-05, "loss": 11.4836, "step": 799 }, { "epoch": 0.03111538947716422, "grad_norm": 0.11987362056970596, "learning_rate": 8.25142373196248e-05, "loss": 11.4406, "step": 800 }, { "epoch": 0.031154283714010675, "grad_norm": 0.1509653776884079, "learning_rate": 8.229660307155518e-05, "loss": 11.497, "step": 801 }, { "epoch": 0.03119317795085713, "grad_norm": 0.11979538947343826, "learning_rate": 8.207905535374767e-05, "loss": 11.4391, "step": 802 }, { "epoch": 0.03123207218770359, "grad_norm": 0.18861103057861328, "learning_rate": 8.186159522952716e-05, "loss": 11.4934, "step": 803 }, { "epoch": 0.031270966424550045, "grad_norm": 0.06771805882453918, "learning_rate": 8.164422376179023e-05, "loss": 11.442, "step": 804 }, { "epoch": 0.031309860661396495, "grad_norm": 0.28224071860313416, "learning_rate": 8.142694201300027e-05, "loss": 11.4551, "step": 805 }, { "epoch": 0.03134875489824295, "grad_norm": 0.15878629684448242, "learning_rate": 8.120975104518203e-05, "loss": 11.4591, "step": 806 }, { "epoch": 0.03138764913508941, "grad_norm": 0.16484294831752777, "learning_rate": 8.099265191991665e-05, "loss": 11.5227, "step": 807 }, { "epoch": 0.031426543371935865, "grad_norm": 0.19311536848545074, "learning_rate": 8.077564569833632e-05, "loss": 11.4931, "step": 808 }, { "epoch": 0.03146543760878232, "grad_norm": 0.11146111786365509, "learning_rate": 8.055873344111906e-05, "loss": 11.4418, "step": 809 }, { "epoch": 0.03150433184562877, "grad_norm": 0.12124677747488022, "learning_rate": 8.03419162084838e-05, "loss": 11.4164, "step": 810 }, { "epoch": 0.03154322608247523, "grad_norm": 0.09567411988973618, "learning_rate": 8.012519506018476e-05, "loss": 11.4232, "step": 811 }, { "epoch": 0.031582120319321685, "grad_norm": 0.0929986760020256, "learning_rate": 7.990857105550675e-05, "loss": 11.4187, "step": 812 }, { "epoch": 0.03162101455616814, "grad_norm": 0.13243938982486725, "learning_rate": 7.969204525325962e-05, "loss": 11.4515, "step": 813 }, { "epoch": 0.03165990879301459, "grad_norm": 0.1833588033914566, "learning_rate": 7.947561871177327e-05, "loss": 11.4609, "step": 814 }, { "epoch": 0.03169880302986105, "grad_norm": 0.1955696940422058, "learning_rate": 7.92592924888925e-05, "loss": 11.5276, "step": 815 }, { "epoch": 0.031737697266707506, "grad_norm": 0.12613818049430847, "learning_rate": 7.904306764197168e-05, "loss": 11.4316, "step": 816 }, { "epoch": 0.03177659150355396, "grad_norm": 0.13009190559387207, "learning_rate": 7.882694522786974e-05, "loss": 11.4452, "step": 817 }, { "epoch": 0.03181548574040042, "grad_norm": 0.10820070654153824, "learning_rate": 7.861092630294484e-05, "loss": 11.4772, "step": 818 }, { "epoch": 0.03185437997724687, "grad_norm": 0.15169255435466766, "learning_rate": 7.839501192304947e-05, "loss": 11.4619, "step": 819 }, { "epoch": 0.031893274214093326, "grad_norm": 0.13594582676887512, "learning_rate": 7.817920314352503e-05, "loss": 11.4355, "step": 820 }, { "epoch": 0.03193216845093978, "grad_norm": 0.1923629492521286, "learning_rate": 7.796350101919671e-05, "loss": 11.4255, "step": 821 }, { "epoch": 0.03197106268778624, "grad_norm": 0.10615510493516922, "learning_rate": 7.774790660436858e-05, "loss": 11.4286, "step": 822 }, { "epoch": 0.032009956924632696, "grad_norm": 0.08784908056259155, "learning_rate": 7.753242095281802e-05, "loss": 11.4347, "step": 823 }, { "epoch": 0.032048851161479146, "grad_norm": 0.11678597331047058, "learning_rate": 7.731704511779099e-05, "loss": 11.4353, "step": 824 }, { "epoch": 0.0320877453983256, "grad_norm": 0.14838770031929016, "learning_rate": 7.710178015199655e-05, "loss": 11.465, "step": 825 }, { "epoch": 0.03212663963517206, "grad_norm": 0.09852113574743271, "learning_rate": 7.688662710760194e-05, "loss": 11.4075, "step": 826 }, { "epoch": 0.032165533872018516, "grad_norm": 0.09276288747787476, "learning_rate": 7.667158703622739e-05, "loss": 11.4557, "step": 827 }, { "epoch": 0.032204428108864966, "grad_norm": 0.10213444381952286, "learning_rate": 7.645666098894082e-05, "loss": 11.4035, "step": 828 }, { "epoch": 0.03224332234571142, "grad_norm": 0.1934029459953308, "learning_rate": 7.624185001625292e-05, "loss": 11.4614, "step": 829 }, { "epoch": 0.03228221658255788, "grad_norm": 0.09062288701534271, "learning_rate": 7.602715516811183e-05, "loss": 11.4192, "step": 830 }, { "epoch": 0.032321110819404336, "grad_norm": 0.12071729451417923, "learning_rate": 7.581257749389828e-05, "loss": 11.4517, "step": 831 }, { "epoch": 0.03236000505625079, "grad_norm": 0.18437331914901733, "learning_rate": 7.559811804242008e-05, "loss": 11.5739, "step": 832 }, { "epoch": 0.03239889929309724, "grad_norm": 0.1351187527179718, "learning_rate": 7.538377786190724e-05, "loss": 11.467, "step": 833 }, { "epoch": 0.0324377935299437, "grad_norm": 0.11380963027477264, "learning_rate": 7.516955800000696e-05, "loss": 11.4142, "step": 834 }, { "epoch": 0.032476687766790156, "grad_norm": 0.20516403019428253, "learning_rate": 7.49554595037781e-05, "loss": 11.4732, "step": 835 }, { "epoch": 0.03251558200363661, "grad_norm": 0.11636844277381897, "learning_rate": 7.474148341968652e-05, "loss": 11.4619, "step": 836 }, { "epoch": 0.03255447624048307, "grad_norm": 0.10400319844484329, "learning_rate": 7.45276307935996e-05, "loss": 11.4187, "step": 837 }, { "epoch": 0.03259337047732952, "grad_norm": 0.13016699254512787, "learning_rate": 7.431390267078142e-05, "loss": 11.4434, "step": 838 }, { "epoch": 0.032632264714175976, "grad_norm": 0.07082528620958328, "learning_rate": 7.410030009588744e-05, "loss": 11.4225, "step": 839 }, { "epoch": 0.03267115895102243, "grad_norm": 0.13742923736572266, "learning_rate": 7.388682411295946e-05, "loss": 11.4645, "step": 840 }, { "epoch": 0.03271005318786889, "grad_norm": 0.11265579611063004, "learning_rate": 7.367347576542059e-05, "loss": 11.4307, "step": 841 }, { "epoch": 0.03274894742471534, "grad_norm": 0.1302904635667801, "learning_rate": 7.346025609606996e-05, "loss": 11.4813, "step": 842 }, { "epoch": 0.032787841661561797, "grad_norm": 0.12135647982358932, "learning_rate": 7.324716614707793e-05, "loss": 11.4464, "step": 843 }, { "epoch": 0.03282673589840825, "grad_norm": 0.11820655316114426, "learning_rate": 7.30342069599807e-05, "loss": 11.4554, "step": 844 }, { "epoch": 0.03286563013525471, "grad_norm": 0.10697997361421585, "learning_rate": 7.282137957567528e-05, "loss": 11.4417, "step": 845 }, { "epoch": 0.03290452437210117, "grad_norm": 0.09324019402265549, "learning_rate": 7.260868503441466e-05, "loss": 11.4145, "step": 846 }, { "epoch": 0.03294341860894762, "grad_norm": 0.09825508296489716, "learning_rate": 7.23961243758023e-05, "loss": 11.4253, "step": 847 }, { "epoch": 0.03298231284579407, "grad_norm": 0.1215229257941246, "learning_rate": 7.218369863878744e-05, "loss": 11.4605, "step": 848 }, { "epoch": 0.03302120708264053, "grad_norm": 0.15956827998161316, "learning_rate": 7.197140886165969e-05, "loss": 11.4586, "step": 849 }, { "epoch": 0.03306010131948699, "grad_norm": 0.11994804441928864, "learning_rate": 7.175925608204428e-05, "loss": 11.4953, "step": 850 }, { "epoch": 0.033098995556333444, "grad_norm": 0.12891148030757904, "learning_rate": 7.154724133689677e-05, "loss": 11.4618, "step": 851 }, { "epoch": 0.033137889793179894, "grad_norm": 0.16294585168361664, "learning_rate": 7.133536566249794e-05, "loss": 11.4927, "step": 852 }, { "epoch": 0.03317678403002635, "grad_norm": 0.1092517077922821, "learning_rate": 7.112363009444903e-05, "loss": 11.4409, "step": 853 }, { "epoch": 0.03321567826687281, "grad_norm": 0.12596727907657623, "learning_rate": 7.091203566766622e-05, "loss": 11.4381, "step": 854 }, { "epoch": 0.033254572503719264, "grad_norm": 0.22151672840118408, "learning_rate": 7.070058341637605e-05, "loss": 11.4834, "step": 855 }, { "epoch": 0.033293466740565714, "grad_norm": 0.16800400614738464, "learning_rate": 7.048927437410999e-05, "loss": 11.4446, "step": 856 }, { "epoch": 0.03333236097741217, "grad_norm": 0.11195861548185349, "learning_rate": 7.027810957369957e-05, "loss": 11.4279, "step": 857 }, { "epoch": 0.03337125521425863, "grad_norm": 0.08265417069196701, "learning_rate": 7.006709004727139e-05, "loss": 11.4501, "step": 858 }, { "epoch": 0.033410149451105084, "grad_norm": 0.12703652679920197, "learning_rate": 6.985621682624183e-05, "loss": 11.48, "step": 859 }, { "epoch": 0.03344904368795154, "grad_norm": 0.14403915405273438, "learning_rate": 6.964549094131229e-05, "loss": 11.4881, "step": 860 }, { "epoch": 0.03348793792479799, "grad_norm": 0.07923027873039246, "learning_rate": 6.943491342246393e-05, "loss": 11.4331, "step": 861 }, { "epoch": 0.03352683216164445, "grad_norm": 0.16718092560768127, "learning_rate": 6.922448529895282e-05, "loss": 11.481, "step": 862 }, { "epoch": 0.033565726398490904, "grad_norm": 0.12445984780788422, "learning_rate": 6.901420759930473e-05, "loss": 11.4598, "step": 863 }, { "epoch": 0.03360462063533736, "grad_norm": 0.10626041144132614, "learning_rate": 6.880408135131022e-05, "loss": 11.4551, "step": 864 }, { "epoch": 0.03364351487218382, "grad_norm": 0.13476891815662384, "learning_rate": 6.85941075820197e-05, "loss": 11.4829, "step": 865 }, { "epoch": 0.03368240910903027, "grad_norm": 0.13352851569652557, "learning_rate": 6.838428731773806e-05, "loss": 11.485, "step": 866 }, { "epoch": 0.033721303345876724, "grad_norm": 0.11239789426326752, "learning_rate": 6.817462158402015e-05, "loss": 11.4483, "step": 867 }, { "epoch": 0.03376019758272318, "grad_norm": 0.08315054327249527, "learning_rate": 6.796511140566531e-05, "loss": 11.4452, "step": 868 }, { "epoch": 0.03379909181956964, "grad_norm": 0.22786268591880798, "learning_rate": 6.775575780671266e-05, "loss": 11.5424, "step": 869 }, { "epoch": 0.03383798605641609, "grad_norm": 0.17151309549808502, "learning_rate": 6.754656181043602e-05, "loss": 11.4625, "step": 870 }, { "epoch": 0.033876880293262544, "grad_norm": 0.13574554026126862, "learning_rate": 6.733752443933878e-05, "loss": 11.4441, "step": 871 }, { "epoch": 0.033915774530109, "grad_norm": 0.13274209201335907, "learning_rate": 6.712864671514911e-05, "loss": 11.4399, "step": 872 }, { "epoch": 0.03395466876695546, "grad_norm": 0.13931576907634735, "learning_rate": 6.691992965881475e-05, "loss": 11.4427, "step": 873 }, { "epoch": 0.033993563003801915, "grad_norm": 0.1982814520597458, "learning_rate": 6.671137429049827e-05, "loss": 11.4683, "step": 874 }, { "epoch": 0.034032457240648364, "grad_norm": 0.09911420941352844, "learning_rate": 6.650298162957183e-05, "loss": 11.4214, "step": 875 }, { "epoch": 0.03407135147749482, "grad_norm": 0.17978575825691223, "learning_rate": 6.629475269461234e-05, "loss": 11.4612, "step": 876 }, { "epoch": 0.03411024571434128, "grad_norm": 0.16942349076271057, "learning_rate": 6.608668850339652e-05, "loss": 11.4971, "step": 877 }, { "epoch": 0.034149139951187735, "grad_norm": 0.14842084050178528, "learning_rate": 6.587879007289576e-05, "loss": 11.4384, "step": 878 }, { "epoch": 0.03418803418803419, "grad_norm": 0.12057257443666458, "learning_rate": 6.567105841927132e-05, "loss": 11.4452, "step": 879 }, { "epoch": 0.03422692842488064, "grad_norm": 0.12537352740764618, "learning_rate": 6.546349455786926e-05, "loss": 11.4557, "step": 880 }, { "epoch": 0.0342658226617271, "grad_norm": 0.18904034793376923, "learning_rate": 6.525609950321552e-05, "loss": 11.4493, "step": 881 }, { "epoch": 0.034304716898573555, "grad_norm": 0.18003982305526733, "learning_rate": 6.50488742690109e-05, "loss": 11.4539, "step": 882 }, { "epoch": 0.03434361113542001, "grad_norm": 0.15979625284671783, "learning_rate": 6.484181986812625e-05, "loss": 11.553, "step": 883 }, { "epoch": 0.03438250537226646, "grad_norm": 0.10028652101755142, "learning_rate": 6.463493731259742e-05, "loss": 11.4449, "step": 884 }, { "epoch": 0.03442139960911292, "grad_norm": 0.22036820650100708, "learning_rate": 6.442822761362015e-05, "loss": 11.5304, "step": 885 }, { "epoch": 0.034460293845959375, "grad_norm": 0.1596294641494751, "learning_rate": 6.422169178154556e-05, "loss": 11.4617, "step": 886 }, { "epoch": 0.03449918808280583, "grad_norm": 0.23585407435894012, "learning_rate": 6.40153308258747e-05, "loss": 11.5596, "step": 887 }, { "epoch": 0.03453808231965229, "grad_norm": 0.134441077709198, "learning_rate": 6.3809145755254e-05, "loss": 11.4394, "step": 888 }, { "epoch": 0.03457697655649874, "grad_norm": 0.08511587977409363, "learning_rate": 6.360313757747022e-05, "loss": 11.4562, "step": 889 }, { "epoch": 0.034615870793345195, "grad_norm": 0.17673659324645996, "learning_rate": 6.33973072994454e-05, "loss": 11.4896, "step": 890 }, { "epoch": 0.03465476503019165, "grad_norm": 0.0889359638094902, "learning_rate": 6.319165592723218e-05, "loss": 11.4523, "step": 891 }, { "epoch": 0.03469365926703811, "grad_norm": 0.15608401596546173, "learning_rate": 6.298618446600856e-05, "loss": 11.4688, "step": 892 }, { "epoch": 0.034732553503884565, "grad_norm": 0.17957602441310883, "learning_rate": 6.278089392007343e-05, "loss": 11.4594, "step": 893 }, { "epoch": 0.034771447740731015, "grad_norm": 0.23120824992656708, "learning_rate": 6.257578529284113e-05, "loss": 11.5103, "step": 894 }, { "epoch": 0.03481034197757747, "grad_norm": 0.10850831866264343, "learning_rate": 6.237085958683704e-05, "loss": 11.467, "step": 895 }, { "epoch": 0.03484923621442393, "grad_norm": 0.06931223720312119, "learning_rate": 6.216611780369238e-05, "loss": 11.4337, "step": 896 }, { "epoch": 0.034888130451270385, "grad_norm": 0.1407434195280075, "learning_rate": 6.196156094413934e-05, "loss": 11.4687, "step": 897 }, { "epoch": 0.034927024688116835, "grad_norm": 0.17006301879882812, "learning_rate": 6.175719000800637e-05, "loss": 11.4503, "step": 898 }, { "epoch": 0.03496591892496329, "grad_norm": 0.10757336020469666, "learning_rate": 6.155300599421306e-05, "loss": 11.4638, "step": 899 }, { "epoch": 0.03500481316180975, "grad_norm": 0.10327316075563431, "learning_rate": 6.134900990076541e-05, "loss": 11.4722, "step": 900 }, { "epoch": 0.035043707398656206, "grad_norm": 0.10495606809854507, "learning_rate": 6.114520272475088e-05, "loss": 11.4596, "step": 901 }, { "epoch": 0.03508260163550266, "grad_norm": 0.17221322655677795, "learning_rate": 6.094158546233359e-05, "loss": 11.4826, "step": 902 }, { "epoch": 0.03512149587234911, "grad_norm": 0.16179487109184265, "learning_rate": 6.073815910874942e-05, "loss": 11.4734, "step": 903 }, { "epoch": 0.03516039010919557, "grad_norm": 0.1274636834859848, "learning_rate": 6.053492465830097e-05, "loss": 11.4439, "step": 904 }, { "epoch": 0.035199284346042026, "grad_norm": 0.11228005588054657, "learning_rate": 6.0331883104353115e-05, "loss": 11.4514, "step": 905 }, { "epoch": 0.03523817858288848, "grad_norm": 0.17040938138961792, "learning_rate": 6.012903543932766e-05, "loss": 11.4617, "step": 906 }, { "epoch": 0.03527707281973493, "grad_norm": 0.15520794689655304, "learning_rate": 5.9926382654698857e-05, "loss": 11.4857, "step": 907 }, { "epoch": 0.03531596705658139, "grad_norm": 0.12189356982707977, "learning_rate": 5.972392574098844e-05, "loss": 11.4562, "step": 908 }, { "epoch": 0.035354861293427846, "grad_norm": 0.0995606780052185, "learning_rate": 5.952166568776062e-05, "loss": 11.4251, "step": 909 }, { "epoch": 0.0353937555302743, "grad_norm": 0.1529882550239563, "learning_rate": 5.931960348361759e-05, "loss": 11.4374, "step": 910 }, { "epoch": 0.03543264976712076, "grad_norm": 0.12336838990449905, "learning_rate": 5.9117740116194375e-05, "loss": 11.4488, "step": 911 }, { "epoch": 0.03547154400396721, "grad_norm": 0.11592377722263336, "learning_rate": 5.8916076572154165e-05, "loss": 11.4422, "step": 912 }, { "epoch": 0.035510438240813666, "grad_norm": 0.15878193080425262, "learning_rate": 5.871461383718344e-05, "loss": 11.4852, "step": 913 }, { "epoch": 0.03554933247766012, "grad_norm": 0.14143121242523193, "learning_rate": 5.851335289598722e-05, "loss": 11.4936, "step": 914 }, { "epoch": 0.03558822671450658, "grad_norm": 0.20466415584087372, "learning_rate": 5.831229473228418e-05, "loss": 11.5021, "step": 915 }, { "epoch": 0.035627120951353036, "grad_norm": 0.1679922640323639, "learning_rate": 5.811144032880182e-05, "loss": 11.4565, "step": 916 }, { "epoch": 0.035666015188199486, "grad_norm": 0.11905571073293686, "learning_rate": 5.791079066727174e-05, "loss": 11.4342, "step": 917 }, { "epoch": 0.03570490942504594, "grad_norm": 0.08226253092288971, "learning_rate": 5.7710346728424836e-05, "loss": 11.4062, "step": 918 }, { "epoch": 0.0357438036618924, "grad_norm": 0.1482539027929306, "learning_rate": 5.751010949198643e-05, "loss": 11.4366, "step": 919 }, { "epoch": 0.035782697898738856, "grad_norm": 0.10693041235208511, "learning_rate": 5.7310079936671545e-05, "loss": 11.4555, "step": 920 }, { "epoch": 0.035821592135585306, "grad_norm": 0.16853336989879608, "learning_rate": 5.711025904018013e-05, "loss": 11.4451, "step": 921 }, { "epoch": 0.03586048637243176, "grad_norm": 0.16642871499061584, "learning_rate": 5.691064777919223e-05, "loss": 11.4352, "step": 922 }, { "epoch": 0.03589938060927822, "grad_norm": 0.11272871494293213, "learning_rate": 5.6711247129363156e-05, "loss": 11.4952, "step": 923 }, { "epoch": 0.035938274846124676, "grad_norm": 0.13608944416046143, "learning_rate": 5.651205806531903e-05, "loss": 11.4288, "step": 924 }, { "epoch": 0.03597716908297113, "grad_norm": 0.14277823269367218, "learning_rate": 5.631308156065152e-05, "loss": 11.4076, "step": 925 }, { "epoch": 0.03601606331981758, "grad_norm": 0.10945692658424377, "learning_rate": 5.611431858791348e-05, "loss": 11.4533, "step": 926 }, { "epoch": 0.03605495755666404, "grad_norm": 0.10961952060461044, "learning_rate": 5.59157701186142e-05, "loss": 11.473, "step": 927 }, { "epoch": 0.0360938517935105, "grad_norm": 0.08197829127311707, "learning_rate": 5.571743712321422e-05, "loss": 11.4289, "step": 928 }, { "epoch": 0.03613274603035695, "grad_norm": 0.11842351406812668, "learning_rate": 5.551932057112115e-05, "loss": 11.4473, "step": 929 }, { "epoch": 0.03617164026720341, "grad_norm": 0.14072228968143463, "learning_rate": 5.532142143068455e-05, "loss": 11.4301, "step": 930 }, { "epoch": 0.03621053450404986, "grad_norm": 0.1937941163778305, "learning_rate": 5.512374066919137e-05, "loss": 11.4606, "step": 931 }, { "epoch": 0.03624942874089632, "grad_norm": 0.13401161134243011, "learning_rate": 5.492627925286113e-05, "loss": 11.4766, "step": 932 }, { "epoch": 0.036288322977742773, "grad_norm": 0.11659156531095505, "learning_rate": 5.4729038146841294e-05, "loss": 11.4341, "step": 933 }, { "epoch": 0.03632721721458923, "grad_norm": 0.14816398918628693, "learning_rate": 5.453201831520245e-05, "loss": 11.4842, "step": 934 }, { "epoch": 0.03636611145143568, "grad_norm": 0.15731480717658997, "learning_rate": 5.4335220720933664e-05, "loss": 11.4512, "step": 935 }, { "epoch": 0.03640500568828214, "grad_norm": 0.10614141076803207, "learning_rate": 5.4138646325937813e-05, "loss": 11.4329, "step": 936 }, { "epoch": 0.036443899925128594, "grad_norm": 0.13888613879680634, "learning_rate": 5.3942296091026656e-05, "loss": 11.4717, "step": 937 }, { "epoch": 0.03648279416197505, "grad_norm": 0.14382483065128326, "learning_rate": 5.37461709759165e-05, "loss": 11.4275, "step": 938 }, { "epoch": 0.03652168839882151, "grad_norm": 0.11295495927333832, "learning_rate": 5.3550271939223195e-05, "loss": 11.4522, "step": 939 }, { "epoch": 0.03656058263566796, "grad_norm": 0.28914937376976013, "learning_rate": 5.335459993845764e-05, "loss": 11.5041, "step": 940 }, { "epoch": 0.036599476872514414, "grad_norm": 0.16712334752082825, "learning_rate": 5.3159155930021e-05, "loss": 11.4382, "step": 941 }, { "epoch": 0.03663837110936087, "grad_norm": 0.22522681951522827, "learning_rate": 5.2963940869200056e-05, "loss": 11.5194, "step": 942 }, { "epoch": 0.03667726534620733, "grad_norm": 0.10965728759765625, "learning_rate": 5.276895571016257e-05, "loss": 11.4299, "step": 943 }, { "epoch": 0.036716159583053784, "grad_norm": 0.11578682065010071, "learning_rate": 5.257420140595257e-05, "loss": 11.4538, "step": 944 }, { "epoch": 0.036755053819900234, "grad_norm": 0.10459594428539276, "learning_rate": 5.237967890848574e-05, "loss": 11.4419, "step": 945 }, { "epoch": 0.03679394805674669, "grad_norm": 0.1656852513551712, "learning_rate": 5.218538916854473e-05, "loss": 11.4236, "step": 946 }, { "epoch": 0.03683284229359315, "grad_norm": 0.1385713666677475, "learning_rate": 5.1991333135774525e-05, "loss": 11.4504, "step": 947 }, { "epoch": 0.036871736530439604, "grad_norm": 0.15063555538654327, "learning_rate": 5.179751175867784e-05, "loss": 11.4632, "step": 948 }, { "epoch": 0.036910630767286054, "grad_norm": 0.13096244633197784, "learning_rate": 5.1603925984610326e-05, "loss": 11.4364, "step": 949 }, { "epoch": 0.03694952500413251, "grad_norm": 0.11817897111177444, "learning_rate": 5.141057675977619e-05, "loss": 11.4361, "step": 950 }, { "epoch": 0.03698841924097897, "grad_norm": 0.10221054404973984, "learning_rate": 5.1217465029223375e-05, "loss": 11.4402, "step": 951 }, { "epoch": 0.037027313477825424, "grad_norm": 0.13744771480560303, "learning_rate": 5.102459173683903e-05, "loss": 11.5, "step": 952 }, { "epoch": 0.03706620771467188, "grad_norm": 0.19383054971694946, "learning_rate": 5.0831957825344865e-05, "loss": 11.5256, "step": 953 }, { "epoch": 0.03710510195151833, "grad_norm": 0.1514645218849182, "learning_rate": 5.063956423629255e-05, "loss": 11.4661, "step": 954 }, { "epoch": 0.03714399618836479, "grad_norm": 0.11683838069438934, "learning_rate": 5.044741191005908e-05, "loss": 11.4504, "step": 955 }, { "epoch": 0.037182890425211244, "grad_norm": 0.11084909737110138, "learning_rate": 5.025550178584226e-05, "loss": 11.4211, "step": 956 }, { "epoch": 0.0372217846620577, "grad_norm": 0.20357000827789307, "learning_rate": 5.0063834801656084e-05, "loss": 11.4345, "step": 957 }, { "epoch": 0.03726067889890416, "grad_norm": 0.1357165426015854, "learning_rate": 4.9872411894325965e-05, "loss": 11.4705, "step": 958 }, { "epoch": 0.03729957313575061, "grad_norm": 0.15270249545574188, "learning_rate": 4.9681233999484564e-05, "loss": 11.4765, "step": 959 }, { "epoch": 0.037338467372597064, "grad_norm": 0.097058966755867, "learning_rate": 4.9490302051566886e-05, "loss": 11.4642, "step": 960 }, { "epoch": 0.03737736160944352, "grad_norm": 0.11222375929355621, "learning_rate": 4.929961698380564e-05, "loss": 11.4392, "step": 961 }, { "epoch": 0.03741625584628998, "grad_norm": 0.11173541843891144, "learning_rate": 4.910917972822713e-05, "loss": 11.4275, "step": 962 }, { "epoch": 0.03745515008313643, "grad_norm": 0.21483619511127472, "learning_rate": 4.891899121564615e-05, "loss": 11.4803, "step": 963 }, { "epoch": 0.037494044319982885, "grad_norm": 0.10671492666006088, "learning_rate": 4.872905237566183e-05, "loss": 11.4381, "step": 964 }, { "epoch": 0.03753293855682934, "grad_norm": 0.12410018593072891, "learning_rate": 4.853936413665294e-05, "loss": 11.4423, "step": 965 }, { "epoch": 0.0375718327936758, "grad_norm": 0.13656941056251526, "learning_rate": 4.8349927425773345e-05, "loss": 11.4282, "step": 966 }, { "epoch": 0.037610727030522255, "grad_norm": 0.1547286957502365, "learning_rate": 4.8160743168947496e-05, "loss": 11.4462, "step": 967 }, { "epoch": 0.037649621267368705, "grad_norm": 0.10143474489450455, "learning_rate": 4.797181229086594e-05, "loss": 11.4454, "step": 968 }, { "epoch": 0.03768851550421516, "grad_norm": 0.13370105624198914, "learning_rate": 4.7783135714980744e-05, "loss": 11.4257, "step": 969 }, { "epoch": 0.03772740974106162, "grad_norm": 0.16253124177455902, "learning_rate": 4.7594714363500915e-05, "loss": 11.4393, "step": 970 }, { "epoch": 0.037766303977908075, "grad_norm": 0.14151079952716827, "learning_rate": 4.7406549157388156e-05, "loss": 11.4231, "step": 971 }, { "epoch": 0.03780519821475453, "grad_norm": 0.13136602938175201, "learning_rate": 4.721864101635211e-05, "loss": 11.4503, "step": 972 }, { "epoch": 0.03784409245160098, "grad_norm": 0.11752691119909286, "learning_rate": 4.70309908588458e-05, "loss": 11.431, "step": 973 }, { "epoch": 0.03788298668844744, "grad_norm": 0.13689513504505157, "learning_rate": 4.6843599602061583e-05, "loss": 11.4344, "step": 974 }, { "epoch": 0.037921880925293895, "grad_norm": 0.1035035029053688, "learning_rate": 4.665646816192606e-05, "loss": 11.4391, "step": 975 }, { "epoch": 0.03796077516214035, "grad_norm": 0.1925671100616455, "learning_rate": 4.646959745309609e-05, "loss": 11.459, "step": 976 }, { "epoch": 0.0379996693989868, "grad_norm": 0.17556695640087128, "learning_rate": 4.62829883889541e-05, "loss": 11.4547, "step": 977 }, { "epoch": 0.03803856363583326, "grad_norm": 0.15916649997234344, "learning_rate": 4.609664188160362e-05, "loss": 11.4699, "step": 978 }, { "epoch": 0.038077457872679715, "grad_norm": 0.15165077149868011, "learning_rate": 4.591055884186489e-05, "loss": 11.4279, "step": 979 }, { "epoch": 0.03811635210952617, "grad_norm": 0.13335153460502625, "learning_rate": 4.572474017927038e-05, "loss": 11.4491, "step": 980 }, { "epoch": 0.03815524634637263, "grad_norm": 0.12981079518795013, "learning_rate": 4.553918680206037e-05, "loss": 11.4268, "step": 981 }, { "epoch": 0.03819414058321908, "grad_norm": 0.17829671502113342, "learning_rate": 4.535389961717834e-05, "loss": 11.4539, "step": 982 }, { "epoch": 0.038233034820065535, "grad_norm": 0.17136840522289276, "learning_rate": 4.516887953026691e-05, "loss": 11.5136, "step": 983 }, { "epoch": 0.03827192905691199, "grad_norm": 0.13877429068088531, "learning_rate": 4.498412744566305e-05, "loss": 11.4558, "step": 984 }, { "epoch": 0.03831082329375845, "grad_norm": 0.16534513235092163, "learning_rate": 4.47996442663937e-05, "loss": 11.4867, "step": 985 }, { "epoch": 0.038349717530604906, "grad_norm": 0.1574074625968933, "learning_rate": 4.461543089417173e-05, "loss": 11.4415, "step": 986 }, { "epoch": 0.038388611767451356, "grad_norm": 0.2112518548965454, "learning_rate": 4.443148822939095e-05, "loss": 11.4709, "step": 987 }, { "epoch": 0.03842750600429781, "grad_norm": 0.18481174111366272, "learning_rate": 4.42478171711222e-05, "loss": 11.4723, "step": 988 }, { "epoch": 0.03846640024114427, "grad_norm": 0.1394161880016327, "learning_rate": 4.406441861710871e-05, "loss": 11.4503, "step": 989 }, { "epoch": 0.038505294477990726, "grad_norm": 0.11776059120893478, "learning_rate": 4.388129346376178e-05, "loss": 11.4752, "step": 990 }, { "epoch": 0.038544188714837176, "grad_norm": 0.1907920092344284, "learning_rate": 4.369844260615635e-05, "loss": 11.4634, "step": 991 }, { "epoch": 0.03858308295168363, "grad_norm": 0.16755948960781097, "learning_rate": 4.351586693802674e-05, "loss": 11.4734, "step": 992 }, { "epoch": 0.03862197718853009, "grad_norm": 0.10538428276777267, "learning_rate": 4.333356735176218e-05, "loss": 11.431, "step": 993 }, { "epoch": 0.038660871425376546, "grad_norm": 0.18529963493347168, "learning_rate": 4.315154473840235e-05, "loss": 11.5198, "step": 994 }, { "epoch": 0.038699765662223, "grad_norm": 0.1970740407705307, "learning_rate": 4.296979998763338e-05, "loss": 11.4301, "step": 995 }, { "epoch": 0.03873865989906945, "grad_norm": 0.14860780537128448, "learning_rate": 4.278833398778306e-05, "loss": 11.4689, "step": 996 }, { "epoch": 0.03877755413591591, "grad_norm": 0.16582021117210388, "learning_rate": 4.260714762581677e-05, "loss": 11.4262, "step": 997 }, { "epoch": 0.038816448372762366, "grad_norm": 0.11632680147886276, "learning_rate": 4.242624178733322e-05, "loss": 11.4374, "step": 998 }, { "epoch": 0.03885534260960882, "grad_norm": 0.20491375029087067, "learning_rate": 4.224561735655977e-05, "loss": 11.4342, "step": 999 }, { "epoch": 0.03889423684645528, "grad_norm": 0.10806465148925781, "learning_rate": 4.206527521634842e-05, "loss": 11.4343, "step": 1000 }, { "epoch": 0.03893313108330173, "grad_norm": 0.1917140781879425, "learning_rate": 4.1885216248171425e-05, "loss": 11.4662, "step": 1001 }, { "epoch": 0.038972025320148186, "grad_norm": 0.1264260858297348, "learning_rate": 4.17054413321169e-05, "loss": 11.4654, "step": 1002 }, { "epoch": 0.03901091955699464, "grad_norm": 0.10900475829839706, "learning_rate": 4.152595134688464e-05, "loss": 11.4422, "step": 1003 }, { "epoch": 0.0390498137938411, "grad_norm": 0.10250162333250046, "learning_rate": 4.13467471697817e-05, "loss": 11.4218, "step": 1004 }, { "epoch": 0.03908870803068755, "grad_norm": 0.11095335334539413, "learning_rate": 4.1167829676718225e-05, "loss": 11.4274, "step": 1005 }, { "epoch": 0.039127602267534006, "grad_norm": 0.12225431948900223, "learning_rate": 4.0989199742203e-05, "loss": 11.4403, "step": 1006 }, { "epoch": 0.03916649650438046, "grad_norm": 0.10829450935125351, "learning_rate": 4.081085823933949e-05, "loss": 11.4363, "step": 1007 }, { "epoch": 0.03920539074122692, "grad_norm": 0.15065525472164154, "learning_rate": 4.0632806039821145e-05, "loss": 11.4778, "step": 1008 }, { "epoch": 0.03924428497807338, "grad_norm": 0.13061833381652832, "learning_rate": 4.045504401392749e-05, "loss": 11.4449, "step": 1009 }, { "epoch": 0.039283179214919826, "grad_norm": 0.13270919024944305, "learning_rate": 4.02775730305198e-05, "loss": 11.4269, "step": 1010 }, { "epoch": 0.03932207345176628, "grad_norm": 0.11030741780996323, "learning_rate": 4.010039395703664e-05, "loss": 11.4468, "step": 1011 }, { "epoch": 0.03936096768861274, "grad_norm": 0.17667675018310547, "learning_rate": 3.99235076594899e-05, "loss": 11.4739, "step": 1012 }, { "epoch": 0.0393998619254592, "grad_norm": 0.17058613896369934, "learning_rate": 3.9746915002460405e-05, "loss": 11.4725, "step": 1013 }, { "epoch": 0.03943875616230565, "grad_norm": 0.1783858984708786, "learning_rate": 3.9570616849093745e-05, "loss": 11.4287, "step": 1014 }, { "epoch": 0.0394776503991521, "grad_norm": 0.13282090425491333, "learning_rate": 3.939461406109605e-05, "loss": 11.443, "step": 1015 }, { "epoch": 0.03951654463599856, "grad_norm": 0.1711062490940094, "learning_rate": 3.921890749872973e-05, "loss": 11.4178, "step": 1016 }, { "epoch": 0.03955543887284502, "grad_norm": 0.10078589618206024, "learning_rate": 3.90434980208094e-05, "loss": 11.438, "step": 1017 }, { "epoch": 0.039594333109691474, "grad_norm": 0.11982541531324387, "learning_rate": 3.8868386484697417e-05, "loss": 11.4521, "step": 1018 }, { "epoch": 0.03963322734653792, "grad_norm": 0.13653124868869781, "learning_rate": 3.869357374630011e-05, "loss": 11.4497, "step": 1019 }, { "epoch": 0.03967212158338438, "grad_norm": 0.15596039593219757, "learning_rate": 3.851906066006311e-05, "loss": 11.4674, "step": 1020 }, { "epoch": 0.03971101582023084, "grad_norm": 0.13571912050247192, "learning_rate": 3.834484807896753e-05, "loss": 11.4646, "step": 1021 }, { "epoch": 0.039749910057077294, "grad_norm": 0.14839304983615875, "learning_rate": 3.817093685452578e-05, "loss": 11.4354, "step": 1022 }, { "epoch": 0.03978880429392375, "grad_norm": 0.11423840373754501, "learning_rate": 3.799732783677711e-05, "loss": 11.4233, "step": 1023 }, { "epoch": 0.0398276985307702, "grad_norm": 0.12816986441612244, "learning_rate": 3.7824021874283745e-05, "loss": 11.4109, "step": 1024 }, { "epoch": 0.03986659276761666, "grad_norm": 0.1389741748571396, "learning_rate": 3.7651019814126654e-05, "loss": 11.4411, "step": 1025 }, { "epoch": 0.039905487004463114, "grad_norm": 0.2206103503704071, "learning_rate": 3.747832250190139e-05, "loss": 11.484, "step": 1026 }, { "epoch": 0.03994438124130957, "grad_norm": 0.1348106563091278, "learning_rate": 3.730593078171396e-05, "loss": 11.4859, "step": 1027 }, { "epoch": 0.03998327547815603, "grad_norm": 0.1366932988166809, "learning_rate": 3.713384549617669e-05, "loss": 11.4695, "step": 1028 }, { "epoch": 0.04002216971500248, "grad_norm": 0.13749776780605316, "learning_rate": 3.696206748640416e-05, "loss": 11.4675, "step": 1029 }, { "epoch": 0.040061063951848934, "grad_norm": 0.11645928025245667, "learning_rate": 3.679059759200901e-05, "loss": 11.4201, "step": 1030 }, { "epoch": 0.04009995818869539, "grad_norm": 0.20807428658008575, "learning_rate": 3.661943665109796e-05, "loss": 11.4443, "step": 1031 }, { "epoch": 0.04013885242554185, "grad_norm": 0.09833139926195145, "learning_rate": 3.6448585500267485e-05, "loss": 11.4419, "step": 1032 }, { "epoch": 0.0401777466623883, "grad_norm": 0.1817658692598343, "learning_rate": 3.627804497460001e-05, "loss": 11.4636, "step": 1033 }, { "epoch": 0.040216640899234754, "grad_norm": 0.13411729037761688, "learning_rate": 3.610781590765966e-05, "loss": 11.4606, "step": 1034 }, { "epoch": 0.04025553513608121, "grad_norm": 0.0920453742146492, "learning_rate": 3.593789913148818e-05, "loss": 11.4563, "step": 1035 }, { "epoch": 0.04029442937292767, "grad_norm": 0.1194140836596489, "learning_rate": 3.576829547660098e-05, "loss": 11.4411, "step": 1036 }, { "epoch": 0.040333323609774124, "grad_norm": 0.16156767308712006, "learning_rate": 3.559900577198292e-05, "loss": 11.4888, "step": 1037 }, { "epoch": 0.040372217846620574, "grad_norm": 0.25374388694763184, "learning_rate": 3.5430030845084406e-05, "loss": 11.5209, "step": 1038 }, { "epoch": 0.04041111208346703, "grad_norm": 0.12839582562446594, "learning_rate": 3.5261371521817244e-05, "loss": 11.4657, "step": 1039 }, { "epoch": 0.04045000632031349, "grad_norm": 0.1393585056066513, "learning_rate": 3.509302862655064e-05, "loss": 11.4392, "step": 1040 }, { "epoch": 0.040488900557159944, "grad_norm": 0.17082121968269348, "learning_rate": 3.4925002982107205e-05, "loss": 11.4479, "step": 1041 }, { "epoch": 0.0405277947940064, "grad_norm": 0.11696211993694305, "learning_rate": 3.475729540975885e-05, "loss": 11.4272, "step": 1042 }, { "epoch": 0.04056668903085285, "grad_norm": 0.21628519892692566, "learning_rate": 3.45899067292229e-05, "loss": 11.4565, "step": 1043 }, { "epoch": 0.04060558326769931, "grad_norm": 0.2241363525390625, "learning_rate": 3.442283775865783e-05, "loss": 11.436, "step": 1044 }, { "epoch": 0.040644477504545765, "grad_norm": 0.16380998492240906, "learning_rate": 3.4256089314659725e-05, "loss": 11.4788, "step": 1045 }, { "epoch": 0.04068337174139222, "grad_norm": 0.12565840780735016, "learning_rate": 3.408966221225773e-05, "loss": 11.4718, "step": 1046 }, { "epoch": 0.04072226597823867, "grad_norm": 0.1578199714422226, "learning_rate": 3.3923557264910514e-05, "loss": 11.4613, "step": 1047 }, { "epoch": 0.04076116021508513, "grad_norm": 0.11667878925800323, "learning_rate": 3.3757775284502045e-05, "loss": 11.4321, "step": 1048 }, { "epoch": 0.040800054451931585, "grad_norm": 0.1548505276441574, "learning_rate": 3.3592317081337755e-05, "loss": 11.4568, "step": 1049 }, { "epoch": 0.04083894868877804, "grad_norm": 0.14588265120983124, "learning_rate": 3.342718346414049e-05, "loss": 11.4962, "step": 1050 }, { "epoch": 0.0408778429256245, "grad_norm": 0.12974365055561066, "learning_rate": 3.3262375240046596e-05, "loss": 11.4519, "step": 1051 }, { "epoch": 0.04091673716247095, "grad_norm": 0.1877010613679886, "learning_rate": 3.309789321460202e-05, "loss": 11.4317, "step": 1052 }, { "epoch": 0.040955631399317405, "grad_norm": 0.1269165575504303, "learning_rate": 3.293373819175816e-05, "loss": 11.4635, "step": 1053 }, { "epoch": 0.04099452563616386, "grad_norm": 0.12434379011392593, "learning_rate": 3.276991097386831e-05, "loss": 11.429, "step": 1054 }, { "epoch": 0.04103341987301032, "grad_norm": 0.12698589265346527, "learning_rate": 3.260641236168346e-05, "loss": 11.4503, "step": 1055 }, { "epoch": 0.041072314109856775, "grad_norm": 0.18467244505882263, "learning_rate": 3.24432431543483e-05, "loss": 11.4486, "step": 1056 }, { "epoch": 0.041111208346703225, "grad_norm": 0.14477095007896423, "learning_rate": 3.228040414939773e-05, "loss": 11.4645, "step": 1057 }, { "epoch": 0.04115010258354968, "grad_norm": 0.1388678103685379, "learning_rate": 3.211789614275241e-05, "loss": 11.4697, "step": 1058 }, { "epoch": 0.04118899682039614, "grad_norm": 0.14132258296012878, "learning_rate": 3.1955719928715345e-05, "loss": 11.4368, "step": 1059 }, { "epoch": 0.041227891057242595, "grad_norm": 0.18010205030441284, "learning_rate": 3.1793876299967816e-05, "loss": 11.4437, "step": 1060 }, { "epoch": 0.041266785294089045, "grad_norm": 0.10433457046747208, "learning_rate": 3.163236604756535e-05, "loss": 11.4442, "step": 1061 }, { "epoch": 0.0413056795309355, "grad_norm": 0.16652584075927734, "learning_rate": 3.1471189960934146e-05, "loss": 11.4379, "step": 1062 }, { "epoch": 0.04134457376778196, "grad_norm": 0.12524031102657318, "learning_rate": 3.1310348827867006e-05, "loss": 11.4358, "step": 1063 }, { "epoch": 0.041383468004628415, "grad_norm": 0.12246005982160568, "learning_rate": 3.114984343451963e-05, "loss": 11.4099, "step": 1064 }, { "epoch": 0.04142236224147487, "grad_norm": 0.13616223633289337, "learning_rate": 3.098967456540652e-05, "loss": 11.4322, "step": 1065 }, { "epoch": 0.04146125647832132, "grad_norm": 0.145111083984375, "learning_rate": 3.082984300339756e-05, "loss": 11.5065, "step": 1066 }, { "epoch": 0.04150015071516778, "grad_norm": 0.13896699249744415, "learning_rate": 3.0670349529713816e-05, "loss": 11.4505, "step": 1067 }, { "epoch": 0.041539044952014235, "grad_norm": 0.11143442243337631, "learning_rate": 3.051119492392379e-05, "loss": 11.4411, "step": 1068 }, { "epoch": 0.04157793918886069, "grad_norm": 0.15392428636550903, "learning_rate": 3.0352379963939883e-05, "loss": 11.4361, "step": 1069 }, { "epoch": 0.04161683342570715, "grad_norm": 0.16626837849617004, "learning_rate": 3.0193905426014146e-05, "loss": 11.4083, "step": 1070 }, { "epoch": 0.0416557276625536, "grad_norm": 0.2432129830121994, "learning_rate": 3.003577208473488e-05, "loss": 11.5145, "step": 1071 }, { "epoch": 0.041694621899400056, "grad_norm": 0.10259351879358292, "learning_rate": 2.9877980713022648e-05, "loss": 11.4465, "step": 1072 }, { "epoch": 0.04173351613624651, "grad_norm": 0.22118544578552246, "learning_rate": 2.9720532082126518e-05, "loss": 11.4629, "step": 1073 }, { "epoch": 0.04177241037309297, "grad_norm": 0.21239879727363586, "learning_rate": 2.9563426961620367e-05, "loss": 11.4424, "step": 1074 }, { "epoch": 0.04177241037309297, "eval_loss": 11.48007583618164, "eval_runtime": 139.1239, "eval_samples_per_second": 77.816, "eval_steps_per_second": 38.908, "step": 1074 }, { "epoch": 0.04181130460993942, "grad_norm": 0.08997377753257751, "learning_rate": 2.940666611939902e-05, "loss": 11.4518, "step": 1075 }, { "epoch": 0.041850198846785876, "grad_norm": 0.09690409898757935, "learning_rate": 2.92502503216746e-05, "loss": 11.4226, "step": 1076 }, { "epoch": 0.04188909308363233, "grad_norm": 0.118629090487957, "learning_rate": 2.909418033297262e-05, "loss": 11.4329, "step": 1077 }, { "epoch": 0.04192798732047879, "grad_norm": 0.12322626262903214, "learning_rate": 2.893845691612851e-05, "loss": 11.4323, "step": 1078 }, { "epoch": 0.041966881557325246, "grad_norm": 0.12857896089553833, "learning_rate": 2.878308083228366e-05, "loss": 11.4302, "step": 1079 }, { "epoch": 0.042005775794171696, "grad_norm": 0.13390401005744934, "learning_rate": 2.8628052840881682e-05, "loss": 11.4698, "step": 1080 }, { "epoch": 0.04204467003101815, "grad_norm": 0.11897007375955582, "learning_rate": 2.8473373699664997e-05, "loss": 11.4159, "step": 1081 }, { "epoch": 0.04208356426786461, "grad_norm": 0.13574343919754028, "learning_rate": 2.8319044164670704e-05, "loss": 11.4714, "step": 1082 }, { "epoch": 0.042122458504711066, "grad_norm": 0.12145520746707916, "learning_rate": 2.8165064990227252e-05, "loss": 11.4508, "step": 1083 }, { "epoch": 0.04216135274155752, "grad_norm": 0.2545243203639984, "learning_rate": 2.8011436928950553e-05, "loss": 11.5192, "step": 1084 }, { "epoch": 0.04220024697840397, "grad_norm": 0.1207599937915802, "learning_rate": 2.7858160731740356e-05, "loss": 11.4501, "step": 1085 }, { "epoch": 0.04223914121525043, "grad_norm": 0.11171094328165054, "learning_rate": 2.77052371477766e-05, "loss": 11.4124, "step": 1086 }, { "epoch": 0.042278035452096886, "grad_norm": 0.18224753439426422, "learning_rate": 2.755266692451569e-05, "loss": 11.4513, "step": 1087 }, { "epoch": 0.04231692968894334, "grad_norm": 0.15159229934215546, "learning_rate": 2.7400450807686938e-05, "loss": 11.4748, "step": 1088 }, { "epoch": 0.04235582392578979, "grad_norm": 0.167379230260849, "learning_rate": 2.724858954128876e-05, "loss": 11.4262, "step": 1089 }, { "epoch": 0.04239471816263625, "grad_norm": 0.1294448971748352, "learning_rate": 2.7097083867585272e-05, "loss": 11.4893, "step": 1090 }, { "epoch": 0.042433612399482706, "grad_norm": 0.15913696587085724, "learning_rate": 2.694593452710249e-05, "loss": 11.4446, "step": 1091 }, { "epoch": 0.04247250663632916, "grad_norm": 0.1298837959766388, "learning_rate": 2.679514225862464e-05, "loss": 11.4545, "step": 1092 }, { "epoch": 0.04251140087317562, "grad_norm": 0.1965494006872177, "learning_rate": 2.664470779919087e-05, "loss": 11.4959, "step": 1093 }, { "epoch": 0.04255029511002207, "grad_norm": 0.12095270305871964, "learning_rate": 2.6494631884091235e-05, "loss": 11.4916, "step": 1094 }, { "epoch": 0.042589189346868526, "grad_norm": 0.13163423538208008, "learning_rate": 2.6344915246863412e-05, "loss": 11.4269, "step": 1095 }, { "epoch": 0.04262808358371498, "grad_norm": 0.20584173500537872, "learning_rate": 2.6195558619288995e-05, "loss": 11.4665, "step": 1096 }, { "epoch": 0.04266697782056144, "grad_norm": 0.09251202642917633, "learning_rate": 2.6046562731389912e-05, "loss": 11.4101, "step": 1097 }, { "epoch": 0.0427058720574079, "grad_norm": 0.1911085844039917, "learning_rate": 2.5897928311424902e-05, "loss": 11.4502, "step": 1098 }, { "epoch": 0.04274476629425435, "grad_norm": 0.11317852139472961, "learning_rate": 2.5749656085885896e-05, "loss": 11.4431, "step": 1099 }, { "epoch": 0.0427836605311008, "grad_norm": 0.1697409600019455, "learning_rate": 2.5601746779494563e-05, "loss": 11.4348, "step": 1100 }, { "epoch": 0.04282255476794726, "grad_norm": 0.25867024064064026, "learning_rate": 2.545420111519855e-05, "loss": 11.4292, "step": 1101 }, { "epoch": 0.04286144900479372, "grad_norm": 0.15417879819869995, "learning_rate": 2.5307019814168342e-05, "loss": 11.4815, "step": 1102 }, { "epoch": 0.04290034324164017, "grad_norm": 0.18279899656772614, "learning_rate": 2.5160203595793273e-05, "loss": 11.4492, "step": 1103 }, { "epoch": 0.042939237478486623, "grad_norm": 0.13540595769882202, "learning_rate": 2.5013753177678323e-05, "loss": 11.4151, "step": 1104 }, { "epoch": 0.04297813171533308, "grad_norm": 0.14286018908023834, "learning_rate": 2.4867669275640616e-05, "loss": 11.4201, "step": 1105 }, { "epoch": 0.04301702595217954, "grad_norm": 0.16803769767284393, "learning_rate": 2.4721952603705657e-05, "loss": 11.4276, "step": 1106 }, { "epoch": 0.043055920189025994, "grad_norm": 0.09364805370569229, "learning_rate": 2.457660387410411e-05, "loss": 11.409, "step": 1107 }, { "epoch": 0.043094814425872444, "grad_norm": 0.17238271236419678, "learning_rate": 2.44316237972682e-05, "loss": 11.4356, "step": 1108 }, { "epoch": 0.0431337086627189, "grad_norm": 0.2280428111553192, "learning_rate": 2.4287013081828257e-05, "loss": 11.4807, "step": 1109 }, { "epoch": 0.04317260289956536, "grad_norm": 0.14022088050842285, "learning_rate": 2.4142772434609273e-05, "loss": 11.4843, "step": 1110 }, { "epoch": 0.043211497136411814, "grad_norm": 0.1371413767337799, "learning_rate": 2.39989025606274e-05, "loss": 11.4181, "step": 1111 }, { "epoch": 0.04325039137325827, "grad_norm": 0.309488981962204, "learning_rate": 2.3855404163086558e-05, "loss": 11.4803, "step": 1112 }, { "epoch": 0.04328928561010472, "grad_norm": 0.28754037618637085, "learning_rate": 2.371227794337495e-05, "loss": 11.6085, "step": 1113 }, { "epoch": 0.04332817984695118, "grad_norm": 0.21863828599452972, "learning_rate": 2.356952460106169e-05, "loss": 11.4461, "step": 1114 }, { "epoch": 0.043367074083797634, "grad_norm": 0.28783923387527466, "learning_rate": 2.342714483389329e-05, "loss": 11.4747, "step": 1115 }, { "epoch": 0.04340596832064409, "grad_norm": 0.17284227907657623, "learning_rate": 2.328513933779034e-05, "loss": 11.4825, "step": 1116 }, { "epoch": 0.04344486255749054, "grad_norm": 0.14010822772979736, "learning_rate": 2.314350880684416e-05, "loss": 11.4157, "step": 1117 }, { "epoch": 0.043483756794337, "grad_norm": 0.12766648828983307, "learning_rate": 2.3002253933313177e-05, "loss": 11.4398, "step": 1118 }, { "epoch": 0.043522651031183454, "grad_norm": 0.16336886584758759, "learning_rate": 2.286137540761979e-05, "loss": 11.4232, "step": 1119 }, { "epoch": 0.04356154526802991, "grad_norm": 0.14002946019172668, "learning_rate": 2.272087391834684e-05, "loss": 11.4272, "step": 1120 }, { "epoch": 0.04360043950487637, "grad_norm": 0.1765531450510025, "learning_rate": 2.2580750152234354e-05, "loss": 11.5226, "step": 1121 }, { "epoch": 0.04363933374172282, "grad_norm": 0.13177621364593506, "learning_rate": 2.2441004794176067e-05, "loss": 11.418, "step": 1122 }, { "epoch": 0.043678227978569274, "grad_norm": 0.2038862705230713, "learning_rate": 2.2301638527216194e-05, "loss": 11.4719, "step": 1123 }, { "epoch": 0.04371712221541573, "grad_norm": 0.13038241863250732, "learning_rate": 2.2162652032546007e-05, "loss": 11.4537, "step": 1124 }, { "epoch": 0.04375601645226219, "grad_norm": 0.2617287337779999, "learning_rate": 2.2024045989500542e-05, "loss": 11.5186, "step": 1125 }, { "epoch": 0.043794910689108645, "grad_norm": 0.1686462163925171, "learning_rate": 2.1885821075555302e-05, "loss": 11.4127, "step": 1126 }, { "epoch": 0.043833804925955094, "grad_norm": 0.161709725856781, "learning_rate": 2.174797796632281e-05, "loss": 11.4865, "step": 1127 }, { "epoch": 0.04387269916280155, "grad_norm": 0.13687454164028168, "learning_rate": 2.1610517335549563e-05, "loss": 11.4135, "step": 1128 }, { "epoch": 0.04391159339964801, "grad_norm": 0.1115826964378357, "learning_rate": 2.147343985511253e-05, "loss": 11.4259, "step": 1129 }, { "epoch": 0.043950487636494465, "grad_norm": 0.11887865513563156, "learning_rate": 2.1336746195015846e-05, "loss": 11.4208, "step": 1130 }, { "epoch": 0.043989381873340914, "grad_norm": 0.1860654354095459, "learning_rate": 2.120043702338772e-05, "loss": 11.5272, "step": 1131 }, { "epoch": 0.04402827611018737, "grad_norm": 0.1269589364528656, "learning_rate": 2.1064513006477017e-05, "loss": 11.4515, "step": 1132 }, { "epoch": 0.04406717034703383, "grad_norm": 0.15289169549942017, "learning_rate": 2.092897480865008e-05, "loss": 11.4436, "step": 1133 }, { "epoch": 0.044106064583880285, "grad_norm": 0.15345978736877441, "learning_rate": 2.0793823092387432e-05, "loss": 11.4424, "step": 1134 }, { "epoch": 0.04414495882072674, "grad_norm": 0.12720987200737, "learning_rate": 2.065905851828056e-05, "loss": 11.4425, "step": 1135 }, { "epoch": 0.04418385305757319, "grad_norm": 0.13926883041858673, "learning_rate": 2.0524681745028708e-05, "loss": 11.4444, "step": 1136 }, { "epoch": 0.04422274729441965, "grad_norm": 0.1361120641231537, "learning_rate": 2.0390693429435627e-05, "loss": 11.4485, "step": 1137 }, { "epoch": 0.044261641531266105, "grad_norm": 0.17767587304115295, "learning_rate": 2.025709422640637e-05, "loss": 11.4634, "step": 1138 }, { "epoch": 0.04430053576811256, "grad_norm": 0.20594410598278046, "learning_rate": 2.0123884788944036e-05, "loss": 11.4492, "step": 1139 }, { "epoch": 0.04433943000495902, "grad_norm": 0.21684671938419342, "learning_rate": 1.9991065768146787e-05, "loss": 11.4779, "step": 1140 }, { "epoch": 0.04437832424180547, "grad_norm": 0.11622948944568634, "learning_rate": 1.985863781320435e-05, "loss": 11.4141, "step": 1141 }, { "epoch": 0.044417218478651925, "grad_norm": 0.21400313079357147, "learning_rate": 1.9726601571395075e-05, "loss": 11.4662, "step": 1142 }, { "epoch": 0.04445611271549838, "grad_norm": 0.19747650623321533, "learning_rate": 1.9594957688082793e-05, "loss": 11.4697, "step": 1143 }, { "epoch": 0.04449500695234484, "grad_norm": 0.16535556316375732, "learning_rate": 1.946370680671341e-05, "loss": 11.4289, "step": 1144 }, { "epoch": 0.04453390118919129, "grad_norm": 0.10678819566965103, "learning_rate": 1.933284956881204e-05, "loss": 11.4269, "step": 1145 }, { "epoch": 0.044572795426037745, "grad_norm": 0.1680564284324646, "learning_rate": 1.920238661397972e-05, "loss": 11.4831, "step": 1146 }, { "epoch": 0.0446116896628842, "grad_norm": 0.20117631554603577, "learning_rate": 1.9072318579890326e-05, "loss": 11.4743, "step": 1147 }, { "epoch": 0.04465058389973066, "grad_norm": 0.1598464548587799, "learning_rate": 1.894264610228744e-05, "loss": 11.4333, "step": 1148 }, { "epoch": 0.044689478136577115, "grad_norm": 0.13115167617797852, "learning_rate": 1.8813369814981275e-05, "loss": 11.4237, "step": 1149 }, { "epoch": 0.044728372373423565, "grad_norm": 0.13870635628700256, "learning_rate": 1.868449034984554e-05, "loss": 11.4444, "step": 1150 }, { "epoch": 0.04476726661027002, "grad_norm": 0.16505026817321777, "learning_rate": 1.85560083368143e-05, "loss": 11.4504, "step": 1151 }, { "epoch": 0.04480616084711648, "grad_norm": 0.12766501307487488, "learning_rate": 1.8427924403879115e-05, "loss": 11.4517, "step": 1152 }, { "epoch": 0.044845055083962936, "grad_norm": 0.18523377180099487, "learning_rate": 1.8300239177085676e-05, "loss": 11.4428, "step": 1153 }, { "epoch": 0.04488394932080939, "grad_norm": 0.21062421798706055, "learning_rate": 1.8172953280530914e-05, "loss": 11.4629, "step": 1154 }, { "epoch": 0.04492284355765584, "grad_norm": 0.16536962985992432, "learning_rate": 1.804606733636004e-05, "loss": 11.4956, "step": 1155 }, { "epoch": 0.0449617377945023, "grad_norm": 0.15244871377944946, "learning_rate": 1.791958196476321e-05, "loss": 11.4157, "step": 1156 }, { "epoch": 0.045000632031348756, "grad_norm": 0.1967291533946991, "learning_rate": 1.779349778397279e-05, "loss": 11.4396, "step": 1157 }, { "epoch": 0.04503952626819521, "grad_norm": 0.2290966659784317, "learning_rate": 1.766781541026018e-05, "loss": 11.4687, "step": 1158 }, { "epoch": 0.04507842050504166, "grad_norm": 0.1414095014333725, "learning_rate": 1.754253545793285e-05, "loss": 11.4353, "step": 1159 }, { "epoch": 0.04511731474188812, "grad_norm": 0.14760176837444305, "learning_rate": 1.741765853933125e-05, "loss": 11.4339, "step": 1160 }, { "epoch": 0.045156208978734576, "grad_norm": 0.12192599475383759, "learning_rate": 1.7293185264826018e-05, "loss": 11.4519, "step": 1161 }, { "epoch": 0.04519510321558103, "grad_norm": 0.11039043962955475, "learning_rate": 1.7169116242814796e-05, "loss": 11.4252, "step": 1162 }, { "epoch": 0.04523399745242749, "grad_norm": 0.15859828889369965, "learning_rate": 1.7045452079719282e-05, "loss": 11.4189, "step": 1163 }, { "epoch": 0.04527289168927394, "grad_norm": 0.16476251184940338, "learning_rate": 1.6922193379982453e-05, "loss": 11.4449, "step": 1164 }, { "epoch": 0.045311785926120396, "grad_norm": 0.24089524149894714, "learning_rate": 1.679934074606533e-05, "loss": 11.4579, "step": 1165 }, { "epoch": 0.04535068016296685, "grad_norm": 0.14605370163917542, "learning_rate": 1.6676894778444207e-05, "loss": 11.4767, "step": 1166 }, { "epoch": 0.04538957439981331, "grad_norm": 0.15800780057907104, "learning_rate": 1.6554856075607793e-05, "loss": 11.4737, "step": 1167 }, { "epoch": 0.045428468636659766, "grad_norm": 0.1213153675198555, "learning_rate": 1.6433225234054027e-05, "loss": 11.4345, "step": 1168 }, { "epoch": 0.045467362873506216, "grad_norm": 0.18790219724178314, "learning_rate": 1.63120028482874e-05, "loss": 11.4845, "step": 1169 }, { "epoch": 0.04550625711035267, "grad_norm": 0.12065570801496506, "learning_rate": 1.619118951081594e-05, "loss": 11.4331, "step": 1170 }, { "epoch": 0.04554515134719913, "grad_norm": 0.18314050137996674, "learning_rate": 1.607078581214836e-05, "loss": 11.4867, "step": 1171 }, { "epoch": 0.045584045584045586, "grad_norm": 0.12358561158180237, "learning_rate": 1.5950792340791043e-05, "loss": 11.4415, "step": 1172 }, { "epoch": 0.045622939820892036, "grad_norm": 0.16861197352409363, "learning_rate": 1.5831209683245462e-05, "loss": 11.4463, "step": 1173 }, { "epoch": 0.04566183405773849, "grad_norm": 0.117819644510746, "learning_rate": 1.5712038424004993e-05, "loss": 11.4314, "step": 1174 }, { "epoch": 0.04570072829458495, "grad_norm": 0.1934998482465744, "learning_rate": 1.5593279145552164e-05, "loss": 11.4269, "step": 1175 }, { "epoch": 0.045739622531431406, "grad_norm": 0.0948733314871788, "learning_rate": 1.547493242835596e-05, "loss": 11.4259, "step": 1176 }, { "epoch": 0.04577851676827786, "grad_norm": 0.11361514031887054, "learning_rate": 1.535699885086872e-05, "loss": 11.4504, "step": 1177 }, { "epoch": 0.04581741100512431, "grad_norm": 0.18907108902931213, "learning_rate": 1.5239478989523525e-05, "loss": 11.4305, "step": 1178 }, { "epoch": 0.04585630524197077, "grad_norm": 0.2364674061536789, "learning_rate": 1.5122373418731306e-05, "loss": 11.4336, "step": 1179 }, { "epoch": 0.04589519947881723, "grad_norm": 0.1388162076473236, "learning_rate": 1.500568271087801e-05, "loss": 11.4671, "step": 1180 }, { "epoch": 0.04593409371566368, "grad_norm": 0.17299380898475647, "learning_rate": 1.4889407436321822e-05, "loss": 11.4643, "step": 1181 }, { "epoch": 0.04597298795251014, "grad_norm": 0.10393727570772171, "learning_rate": 1.4773548163390406e-05, "loss": 11.4679, "step": 1182 }, { "epoch": 0.04601188218935659, "grad_norm": 0.19474563002586365, "learning_rate": 1.4658105458378113e-05, "loss": 11.4569, "step": 1183 }, { "epoch": 0.04605077642620305, "grad_norm": 0.15817129611968994, "learning_rate": 1.4543079885543098e-05, "loss": 11.4179, "step": 1184 }, { "epoch": 0.0460896706630495, "grad_norm": 0.15230096876621246, "learning_rate": 1.4428472007104832e-05, "loss": 11.4267, "step": 1185 }, { "epoch": 0.04612856489989596, "grad_norm": 0.17485696077346802, "learning_rate": 1.4314282383241096e-05, "loss": 11.4485, "step": 1186 }, { "epoch": 0.04616745913674241, "grad_norm": 0.12060049921274185, "learning_rate": 1.4200511572085274e-05, "loss": 11.4368, "step": 1187 }, { "epoch": 0.04620635337358887, "grad_norm": 0.150129035115242, "learning_rate": 1.4087160129723853e-05, "loss": 11.4684, "step": 1188 }, { "epoch": 0.046245247610435324, "grad_norm": 0.11091771721839905, "learning_rate": 1.3974228610193374e-05, "loss": 11.4294, "step": 1189 }, { "epoch": 0.04628414184728178, "grad_norm": 0.14679087698459625, "learning_rate": 1.3861717565477994e-05, "loss": 11.4625, "step": 1190 }, { "epoch": 0.04632303608412824, "grad_norm": 0.13646747171878815, "learning_rate": 1.3749627545506616e-05, "loss": 11.5092, "step": 1191 }, { "epoch": 0.04636193032097469, "grad_norm": 0.1430552899837494, "learning_rate": 1.363795909815032e-05, "loss": 11.457, "step": 1192 }, { "epoch": 0.046400824557821144, "grad_norm": 0.1363144963979721, "learning_rate": 1.3526712769219618e-05, "loss": 11.4246, "step": 1193 }, { "epoch": 0.0464397187946676, "grad_norm": 0.12670528888702393, "learning_rate": 1.3415889102461775e-05, "loss": 11.4318, "step": 1194 }, { "epoch": 0.04647861303151406, "grad_norm": 0.3385750651359558, "learning_rate": 1.3305488639558206e-05, "loss": 11.5832, "step": 1195 }, { "epoch": 0.046517507268360514, "grad_norm": 0.2261715829372406, "learning_rate": 1.3195511920121795e-05, "loss": 11.5506, "step": 1196 }, { "epoch": 0.046556401505206964, "grad_norm": 0.24841472506523132, "learning_rate": 1.3085959481694265e-05, "loss": 11.4581, "step": 1197 }, { "epoch": 0.04659529574205342, "grad_norm": 0.18506531417369843, "learning_rate": 1.2976831859743521e-05, "loss": 11.4293, "step": 1198 }, { "epoch": 0.04663418997889988, "grad_norm": 0.15063372254371643, "learning_rate": 1.286812958766106e-05, "loss": 11.4309, "step": 1199 }, { "epoch": 0.046673084215746334, "grad_norm": 0.24693375825881958, "learning_rate": 1.2759853196759453e-05, "loss": 11.5344, "step": 1200 }, { "epoch": 0.046711978452592784, "grad_norm": 0.12466446310281754, "learning_rate": 1.2652003216269526e-05, "loss": 11.4251, "step": 1201 }, { "epoch": 0.04675087268943924, "grad_norm": 0.29291486740112305, "learning_rate": 1.2544580173337983e-05, "loss": 11.6883, "step": 1202 }, { "epoch": 0.0467897669262857, "grad_norm": 0.1630331128835678, "learning_rate": 1.2437584593024753e-05, "loss": 11.4409, "step": 1203 }, { "epoch": 0.046828661163132154, "grad_norm": 0.18174372613430023, "learning_rate": 1.2331016998300394e-05, "loss": 11.4636, "step": 1204 }, { "epoch": 0.04686755539997861, "grad_norm": 0.1426887810230255, "learning_rate": 1.2224877910043587e-05, "loss": 11.4672, "step": 1205 }, { "epoch": 0.04690644963682506, "grad_norm": 0.10058625787496567, "learning_rate": 1.2119167847038548e-05, "loss": 11.4118, "step": 1206 }, { "epoch": 0.04694534387367152, "grad_norm": 0.15972256660461426, "learning_rate": 1.201388732597255e-05, "loss": 11.4379, "step": 1207 }, { "epoch": 0.046984238110517974, "grad_norm": 0.20637661218643188, "learning_rate": 1.190903686143332e-05, "loss": 11.4773, "step": 1208 }, { "epoch": 0.04702313234736443, "grad_norm": 0.2554698884487152, "learning_rate": 1.18046169659066e-05, "loss": 11.4568, "step": 1209 }, { "epoch": 0.04706202658421089, "grad_norm": 0.17103084921836853, "learning_rate": 1.170062814977354e-05, "loss": 11.4325, "step": 1210 }, { "epoch": 0.04710092082105734, "grad_norm": 0.15731778740882874, "learning_rate": 1.1597070921308363e-05, "loss": 11.4215, "step": 1211 }, { "epoch": 0.047139815057903794, "grad_norm": 0.1296573281288147, "learning_rate": 1.1493945786675753e-05, "loss": 11.4338, "step": 1212 }, { "epoch": 0.04717870929475025, "grad_norm": 0.1563749611377716, "learning_rate": 1.1391253249928369e-05, "loss": 11.4196, "step": 1213 }, { "epoch": 0.04721760353159671, "grad_norm": 0.13780969381332397, "learning_rate": 1.1288993813004467e-05, "loss": 11.4687, "step": 1214 }, { "epoch": 0.04725649776844316, "grad_norm": 0.22211474180221558, "learning_rate": 1.118716797572542e-05, "loss": 11.4907, "step": 1215 }, { "epoch": 0.047295392005289615, "grad_norm": 0.1490112692117691, "learning_rate": 1.1085776235793243e-05, "loss": 11.4091, "step": 1216 }, { "epoch": 0.04733428624213607, "grad_norm": 0.11238259822130203, "learning_rate": 1.098481908878819e-05, "loss": 11.4216, "step": 1217 }, { "epoch": 0.04737318047898253, "grad_norm": 0.22447730600833893, "learning_rate": 1.0884297028166302e-05, "loss": 11.5614, "step": 1218 }, { "epoch": 0.047412074715828985, "grad_norm": 0.1197625920176506, "learning_rate": 1.0784210545257034e-05, "loss": 11.4529, "step": 1219 }, { "epoch": 0.047450968952675435, "grad_norm": 0.20383505523204803, "learning_rate": 1.0684560129260822e-05, "loss": 11.4606, "step": 1220 }, { "epoch": 0.04748986318952189, "grad_norm": 0.1828719973564148, "learning_rate": 1.0585346267246743e-05, "loss": 11.4802, "step": 1221 }, { "epoch": 0.04752875742636835, "grad_norm": 0.2332971841096878, "learning_rate": 1.0486569444149995e-05, "loss": 11.4597, "step": 1222 }, { "epoch": 0.047567651663214805, "grad_norm": 0.12918928265571594, "learning_rate": 1.038823014276975e-05, "loss": 11.4668, "step": 1223 }, { "epoch": 0.04760654590006126, "grad_norm": 0.20176751911640167, "learning_rate": 1.0290328843766628e-05, "loss": 11.4392, "step": 1224 }, { "epoch": 0.04764544013690771, "grad_norm": 0.1748267114162445, "learning_rate": 1.019286602566033e-05, "loss": 11.4722, "step": 1225 }, { "epoch": 0.04768433437375417, "grad_norm": 0.13155628740787506, "learning_rate": 1.009584216482743e-05, "loss": 11.4369, "step": 1226 }, { "epoch": 0.047723228610600625, "grad_norm": 0.15083764493465424, "learning_rate": 9.999257735498957e-06, "loss": 11.4321, "step": 1227 }, { "epoch": 0.04776212284744708, "grad_norm": 0.09894982725381851, "learning_rate": 9.903113209758096e-06, "loss": 11.4426, "step": 1228 }, { "epoch": 0.04780101708429353, "grad_norm": 0.1379556804895401, "learning_rate": 9.807409057537876e-06, "loss": 11.4429, "step": 1229 }, { "epoch": 0.04783991132113999, "grad_norm": 0.16551163792610168, "learning_rate": 9.712145746618873e-06, "loss": 11.4475, "step": 1230 }, { "epoch": 0.047878805557986445, "grad_norm": 0.15437820553779602, "learning_rate": 9.61732374262696e-06, "loss": 11.4187, "step": 1231 }, { "epoch": 0.0479176997948329, "grad_norm": 0.12910741567611694, "learning_rate": 9.522943509030968e-06, "loss": 11.4256, "step": 1232 }, { "epoch": 0.04795659403167936, "grad_norm": 0.09460493922233582, "learning_rate": 9.429005507140487e-06, "loss": 11.439, "step": 1233 }, { "epoch": 0.04799548826852581, "grad_norm": 0.16753201186656952, "learning_rate": 9.33551019610348e-06, "loss": 11.4761, "step": 1234 }, { "epoch": 0.048034382505372265, "grad_norm": 0.14802870154380798, "learning_rate": 9.242458032904311e-06, "loss": 11.4195, "step": 1235 }, { "epoch": 0.04807327674221872, "grad_norm": 0.20524270832538605, "learning_rate": 9.14984947236115e-06, "loss": 11.4556, "step": 1236 }, { "epoch": 0.04811217097906518, "grad_norm": 0.18146079778671265, "learning_rate": 9.057684967124036e-06, "loss": 11.439, "step": 1237 }, { "epoch": 0.048151065215911636, "grad_norm": 0.3100352883338928, "learning_rate": 8.96596496767259e-06, "loss": 11.5205, "step": 1238 }, { "epoch": 0.048189959452758085, "grad_norm": 0.13172343373298645, "learning_rate": 8.874689922313717e-06, "loss": 11.4617, "step": 1239 }, { "epoch": 0.04822885368960454, "grad_norm": 0.21646977961063385, "learning_rate": 8.7838602771795e-06, "loss": 11.5159, "step": 1240 }, { "epoch": 0.048267747926451, "grad_norm": 0.12774217128753662, "learning_rate": 8.693476476225037e-06, "loss": 11.4551, "step": 1241 }, { "epoch": 0.048306642163297456, "grad_norm": 0.17098848521709442, "learning_rate": 8.603538961226232e-06, "loss": 11.442, "step": 1242 }, { "epoch": 0.048345536400143906, "grad_norm": 0.12627612054347992, "learning_rate": 8.51404817177761e-06, "loss": 11.489, "step": 1243 }, { "epoch": 0.04838443063699036, "grad_norm": 0.1685693860054016, "learning_rate": 8.425004545290227e-06, "loss": 11.4365, "step": 1244 }, { "epoch": 0.04842332487383682, "grad_norm": 0.15457966923713684, "learning_rate": 8.336408516989536e-06, "loss": 11.4073, "step": 1245 }, { "epoch": 0.048462219110683276, "grad_norm": 0.14986367523670197, "learning_rate": 8.24826051991312e-06, "loss": 11.4463, "step": 1246 }, { "epoch": 0.04850111334752973, "grad_norm": 0.17568694055080414, "learning_rate": 8.160560984908849e-06, "loss": 11.4313, "step": 1247 }, { "epoch": 0.04854000758437618, "grad_norm": 0.1453072726726532, "learning_rate": 8.073310340632457e-06, "loss": 11.427, "step": 1248 }, { "epoch": 0.04857890182122264, "grad_norm": 0.12231708317995071, "learning_rate": 7.986509013545673e-06, "loss": 11.4361, "step": 1249 }, { "epoch": 0.048617796058069096, "grad_norm": 0.18410533666610718, "learning_rate": 7.900157427914101e-06, "loss": 11.4503, "step": 1250 }, { "epoch": 0.04865669029491555, "grad_norm": 0.10761623829603195, "learning_rate": 7.81425600580502e-06, "loss": 11.4477, "step": 1251 }, { "epoch": 0.04869558453176201, "grad_norm": 0.15875908732414246, "learning_rate": 7.728805167085462e-06, "loss": 11.4199, "step": 1252 }, { "epoch": 0.04873447876860846, "grad_norm": 0.10637667030096054, "learning_rate": 7.643805329420117e-06, "loss": 11.4229, "step": 1253 }, { "epoch": 0.048773373005454916, "grad_norm": 0.1852317601442337, "learning_rate": 7.559256908269252e-06, "loss": 11.4606, "step": 1254 }, { "epoch": 0.04881226724230137, "grad_norm": 0.15936970710754395, "learning_rate": 7.475160316886698e-06, "loss": 11.4559, "step": 1255 }, { "epoch": 0.04885116147914783, "grad_norm": 0.15641069412231445, "learning_rate": 7.3915159663179075e-06, "loss": 11.4544, "step": 1256 }, { "epoch": 0.04889005571599428, "grad_norm": 0.12669280171394348, "learning_rate": 7.308324265397836e-06, "loss": 11.4623, "step": 1257 }, { "epoch": 0.048928949952840736, "grad_norm": 0.2127736657857895, "learning_rate": 7.225585620748954e-06, "loss": 11.4445, "step": 1258 }, { "epoch": 0.04896784418968719, "grad_norm": 0.1329868584871292, "learning_rate": 7.143300436779398e-06, "loss": 11.4263, "step": 1259 }, { "epoch": 0.04900673842653365, "grad_norm": 0.191726416349411, "learning_rate": 7.061469115680764e-06, "loss": 11.4487, "step": 1260 }, { "epoch": 0.049045632663380107, "grad_norm": 0.0934595912694931, "learning_rate": 6.980092057426346e-06, "loss": 11.4218, "step": 1261 }, { "epoch": 0.049084526900226556, "grad_norm": 0.17179986834526062, "learning_rate": 6.899169659769111e-06, "loss": 11.4969, "step": 1262 }, { "epoch": 0.04912342113707301, "grad_norm": 0.1705106943845749, "learning_rate": 6.818702318239689e-06, "loss": 11.4973, "step": 1263 }, { "epoch": 0.04916231537391947, "grad_norm": 0.11566983908414841, "learning_rate": 6.738690426144545e-06, "loss": 11.4269, "step": 1264 }, { "epoch": 0.04920120961076593, "grad_norm": 0.1510932445526123, "learning_rate": 6.659134374563969e-06, "loss": 11.454, "step": 1265 }, { "epoch": 0.04924010384761238, "grad_norm": 0.15906092524528503, "learning_rate": 6.580034552350267e-06, "loss": 11.4606, "step": 1266 }, { "epoch": 0.04927899808445883, "grad_norm": 0.11045938730239868, "learning_rate": 6.501391346125707e-06, "loss": 11.4598, "step": 1267 }, { "epoch": 0.04931789232130529, "grad_norm": 0.16918398439884186, "learning_rate": 6.423205140280797e-06, "loss": 11.4941, "step": 1268 }, { "epoch": 0.04935678655815175, "grad_norm": 0.18566885590553284, "learning_rate": 6.345476316972321e-06, "loss": 11.4397, "step": 1269 }, { "epoch": 0.049395680794998204, "grad_norm": 0.14385563135147095, "learning_rate": 6.268205256121396e-06, "loss": 11.4464, "step": 1270 }, { "epoch": 0.04943457503184465, "grad_norm": 0.2474178671836853, "learning_rate": 6.191392335411839e-06, "loss": 11.4548, "step": 1271 }, { "epoch": 0.04947346926869111, "grad_norm": 0.12321379780769348, "learning_rate": 6.115037930288059e-06, "loss": 11.4204, "step": 1272 }, { "epoch": 0.04951236350553757, "grad_norm": 0.21031615138053894, "learning_rate": 6.03914241395338e-06, "loss": 11.5541, "step": 1273 }, { "epoch": 0.049551257742384024, "grad_norm": 0.19632770121097565, "learning_rate": 5.963706157368199e-06, "loss": 11.4914, "step": 1274 }, { "epoch": 0.04959015197923048, "grad_norm": 0.13494326174259186, "learning_rate": 5.888729529248149e-06, "loss": 11.4478, "step": 1275 }, { "epoch": 0.04962904621607693, "grad_norm": 0.1579502820968628, "learning_rate": 5.814212896062277e-06, "loss": 11.4493, "step": 1276 }, { "epoch": 0.04966794045292339, "grad_norm": 0.13213543593883514, "learning_rate": 5.7401566220313005e-06, "loss": 11.4415, "step": 1277 }, { "epoch": 0.049706834689769844, "grad_norm": 0.15101048350334167, "learning_rate": 5.666561069125797e-06, "loss": 11.4401, "step": 1278 }, { "epoch": 0.0497457289266163, "grad_norm": 0.20494867861270905, "learning_rate": 5.593426597064444e-06, "loss": 11.4223, "step": 1279 }, { "epoch": 0.04978462316346276, "grad_norm": 0.13972818851470947, "learning_rate": 5.520753563312253e-06, "loss": 11.4167, "step": 1280 }, { "epoch": 0.04982351740030921, "grad_norm": 0.15213821828365326, "learning_rate": 5.448542323078843e-06, "loss": 11.466, "step": 1281 }, { "epoch": 0.049862411637155664, "grad_norm": 0.1816379427909851, "learning_rate": 5.376793229316645e-06, "loss": 11.5445, "step": 1282 }, { "epoch": 0.04990130587400212, "grad_norm": 0.22997316718101501, "learning_rate": 5.3055066327192925e-06, "loss": 11.4652, "step": 1283 }, { "epoch": 0.04994020011084858, "grad_norm": 0.1814534068107605, "learning_rate": 5.2346828817197655e-06, "loss": 11.4606, "step": 1284 }, { "epoch": 0.04997909434769503, "grad_norm": 0.3224797546863556, "learning_rate": 5.164322322488802e-06, "loss": 11.4257, "step": 1285 }, { "epoch": 0.050017988584541484, "grad_norm": 0.11046472191810608, "learning_rate": 5.094425298933136e-06, "loss": 11.4264, "step": 1286 }, { "epoch": 0.05005688282138794, "grad_norm": 0.18401353061199188, "learning_rate": 5.024992152693875e-06, "loss": 11.5579, "step": 1287 }, { "epoch": 0.0500957770582344, "grad_norm": 0.17558960616588593, "learning_rate": 4.956023223144768e-06, "loss": 11.4755, "step": 1288 }, { "epoch": 0.050134671295080854, "grad_norm": 0.11735843122005463, "learning_rate": 4.887518847390571e-06, "loss": 11.4153, "step": 1289 }, { "epoch": 0.050173565531927304, "grad_norm": 0.1737535148859024, "learning_rate": 4.819479360265444e-06, "loss": 11.4699, "step": 1290 }, { "epoch": 0.05021245976877376, "grad_norm": 0.20732508599758148, "learning_rate": 4.7519050943312325e-06, "loss": 11.4195, "step": 1291 }, { "epoch": 0.05025135400562022, "grad_norm": 0.1525745838880539, "learning_rate": 4.684796379875922e-06, "loss": 11.4366, "step": 1292 }, { "epoch": 0.050290248242466674, "grad_norm": 0.15391410887241364, "learning_rate": 4.618153544911929e-06, "loss": 11.5117, "step": 1293 }, { "epoch": 0.05032914247931313, "grad_norm": 0.10862606763839722, "learning_rate": 4.551976915174605e-06, "loss": 11.4329, "step": 1294 }, { "epoch": 0.05036803671615958, "grad_norm": 0.10060062259435654, "learning_rate": 4.48626681412061e-06, "loss": 11.4304, "step": 1295 }, { "epoch": 0.05040693095300604, "grad_norm": 0.12805354595184326, "learning_rate": 4.421023562926252e-06, "loss": 11.4189, "step": 1296 }, { "epoch": 0.050445825189852495, "grad_norm": 0.17536696791648865, "learning_rate": 4.356247480486031e-06, "loss": 11.4282, "step": 1297 }, { "epoch": 0.05048471942669895, "grad_norm": 0.17301538586616516, "learning_rate": 4.291938883411007e-06, "loss": 11.419, "step": 1298 }, { "epoch": 0.0505236136635454, "grad_norm": 0.17384307086467743, "learning_rate": 4.2280980860272874e-06, "loss": 11.4603, "step": 1299 }, { "epoch": 0.05056250790039186, "grad_norm": 0.14885058999061584, "learning_rate": 4.16472540037447e-06, "loss": 11.4561, "step": 1300 }, { "epoch": 0.050601402137238315, "grad_norm": 0.18874341249465942, "learning_rate": 4.101821136204142e-06, "loss": 11.437, "step": 1301 }, { "epoch": 0.05064029637408477, "grad_norm": 0.12578798830509186, "learning_rate": 4.039385600978318e-06, "loss": 11.4308, "step": 1302 }, { "epoch": 0.05067919061093123, "grad_norm": 0.14595210552215576, "learning_rate": 3.977419099868018e-06, "loss": 11.4412, "step": 1303 }, { "epoch": 0.05071808484777768, "grad_norm": 0.19140197336673737, "learning_rate": 3.915921935751687e-06, "loss": 11.4171, "step": 1304 }, { "epoch": 0.050756979084624135, "grad_norm": 0.13179464638233185, "learning_rate": 3.85489440921376e-06, "loss": 11.4658, "step": 1305 }, { "epoch": 0.05079587332147059, "grad_norm": 0.14757223427295685, "learning_rate": 3.794336818543209e-06, "loss": 11.4411, "step": 1306 }, { "epoch": 0.05083476755831705, "grad_norm": 0.16024403274059296, "learning_rate": 3.7342494597320755e-06, "loss": 11.4195, "step": 1307 }, { "epoch": 0.050873661795163505, "grad_norm": 0.18657468259334564, "learning_rate": 3.6746326264739504e-06, "loss": 11.4491, "step": 1308 }, { "epoch": 0.050912556032009955, "grad_norm": 0.23776055872440338, "learning_rate": 3.615486610162655e-06, "loss": 11.4958, "step": 1309 }, { "epoch": 0.05095145026885641, "grad_norm": 0.14701801538467407, "learning_rate": 3.5568116998907498e-06, "loss": 11.433, "step": 1310 }, { "epoch": 0.05099034450570287, "grad_norm": 0.1552416831254959, "learning_rate": 3.4986081824481152e-06, "loss": 11.4309, "step": 1311 }, { "epoch": 0.051029238742549325, "grad_norm": 0.17288999259471893, "learning_rate": 3.440876342320609e-06, "loss": 11.4981, "step": 1312 }, { "epoch": 0.051068132979395775, "grad_norm": 0.1504266858100891, "learning_rate": 3.3836164616885992e-06, "loss": 11.4175, "step": 1313 }, { "epoch": 0.05110702721624223, "grad_norm": 0.11923205107450485, "learning_rate": 3.3268288204256315e-06, "loss": 11.4175, "step": 1314 }, { "epoch": 0.05114592145308869, "grad_norm": 0.15316557884216309, "learning_rate": 3.270513696097055e-06, "loss": 11.4449, "step": 1315 }, { "epoch": 0.051184815689935145, "grad_norm": 0.15415562689304352, "learning_rate": 3.214671363958666e-06, "loss": 11.4771, "step": 1316 }, { "epoch": 0.0512237099267816, "grad_norm": 0.21763721108436584, "learning_rate": 3.159302096955319e-06, "loss": 11.4575, "step": 1317 }, { "epoch": 0.05126260416362805, "grad_norm": 0.1797555685043335, "learning_rate": 3.1044061657196867e-06, "loss": 11.4424, "step": 1318 }, { "epoch": 0.05130149840047451, "grad_norm": 0.1600632220506668, "learning_rate": 3.049983838570858e-06, "loss": 11.4647, "step": 1319 }, { "epoch": 0.051340392637320965, "grad_norm": 0.21375976502895355, "learning_rate": 2.9960353815130293e-06, "loss": 11.4618, "step": 1320 }, { "epoch": 0.05137928687416742, "grad_norm": 0.1874701827764511, "learning_rate": 2.9425610582342834e-06, "loss": 11.4144, "step": 1321 }, { "epoch": 0.05141818111101388, "grad_norm": 0.17211183905601501, "learning_rate": 2.8895611301051673e-06, "loss": 11.4916, "step": 1322 }, { "epoch": 0.05145707534786033, "grad_norm": 0.1562688797712326, "learning_rate": 2.837035856177539e-06, "loss": 11.4537, "step": 1323 }, { "epoch": 0.051495969584706786, "grad_norm": 0.1409672051668167, "learning_rate": 2.7849854931832562e-06, "loss": 11.4108, "step": 1324 }, { "epoch": 0.05153486382155324, "grad_norm": 0.16954468190670013, "learning_rate": 2.73341029553289e-06, "loss": 11.4287, "step": 1325 }, { "epoch": 0.0515737580583997, "grad_norm": 0.14109128713607788, "learning_rate": 2.682310515314512e-06, "loss": 11.498, "step": 1326 }, { "epoch": 0.05161265229524615, "grad_norm": 0.17244742810726166, "learning_rate": 2.6316864022924993e-06, "loss": 11.4485, "step": 1327 }, { "epoch": 0.051651546532092606, "grad_norm": 0.14715106785297394, "learning_rate": 2.5815382039062308e-06, "loss": 11.4552, "step": 1328 }, { "epoch": 0.05169044076893906, "grad_norm": 0.11490415036678314, "learning_rate": 2.5318661652689036e-06, "loss": 11.4516, "step": 1329 }, { "epoch": 0.05172933500578552, "grad_norm": 0.216889426112175, "learning_rate": 2.48267052916642e-06, "loss": 11.4809, "step": 1330 }, { "epoch": 0.051768229242631976, "grad_norm": 0.12310255318880081, "learning_rate": 2.4339515360561005e-06, "loss": 11.4264, "step": 1331 }, { "epoch": 0.051807123479478426, "grad_norm": 0.1695736199617386, "learning_rate": 2.3857094240654856e-06, "loss": 11.4448, "step": 1332 }, { "epoch": 0.05184601771632488, "grad_norm": 0.20610633492469788, "learning_rate": 2.3379444289913342e-06, "loss": 11.4313, "step": 1333 }, { "epoch": 0.05188491195317134, "grad_norm": 0.1628742516040802, "learning_rate": 2.2906567842982728e-06, "loss": 11.4884, "step": 1334 }, { "epoch": 0.051923806190017796, "grad_norm": 0.19907735288143158, "learning_rate": 2.2438467211177816e-06, "loss": 11.4273, "step": 1335 }, { "epoch": 0.05196270042686425, "grad_norm": 0.2636147141456604, "learning_rate": 2.1975144682470415e-06, "loss": 11.4372, "step": 1336 }, { "epoch": 0.0520015946637107, "grad_norm": 0.32304054498672485, "learning_rate": 2.151660252147769e-06, "loss": 11.5135, "step": 1337 }, { "epoch": 0.05204048890055716, "grad_norm": 0.16421319544315338, "learning_rate": 2.1062842969451713e-06, "loss": 11.4225, "step": 1338 }, { "epoch": 0.052079383137403616, "grad_norm": 0.21596230566501617, "learning_rate": 2.0613868244268143e-06, "loss": 11.4815, "step": 1339 }, { "epoch": 0.05211827737425007, "grad_norm": 0.16720180213451385, "learning_rate": 2.016968054041546e-06, "loss": 11.4213, "step": 1340 }, { "epoch": 0.05215717161109652, "grad_norm": 0.12049616873264313, "learning_rate": 1.973028202898419e-06, "loss": 11.4324, "step": 1341 }, { "epoch": 0.05219606584794298, "grad_norm": 0.11477518826723099, "learning_rate": 1.9295674857656486e-06, "loss": 11.4659, "step": 1342 }, { "epoch": 0.052234960084789436, "grad_norm": 0.16870322823524475, "learning_rate": 1.8865861150695442e-06, "loss": 11.4344, "step": 1343 }, { "epoch": 0.05227385432163589, "grad_norm": 0.09433110058307648, "learning_rate": 1.8440843008934561e-06, "loss": 11.424, "step": 1344 }, { "epoch": 0.05231274855848235, "grad_norm": 0.12650403380393982, "learning_rate": 1.8020622509768326e-06, "loss": 11.4307, "step": 1345 }, { "epoch": 0.0523516427953288, "grad_norm": 0.255119651556015, "learning_rate": 1.7605201707140418e-06, "loss": 11.4822, "step": 1346 }, { "epoch": 0.052390537032175256, "grad_norm": 0.16543054580688477, "learning_rate": 1.7194582631535617e-06, "loss": 11.4568, "step": 1347 }, { "epoch": 0.05242943126902171, "grad_norm": 0.20184849202632904, "learning_rate": 1.6788767289968254e-06, "loss": 11.4388, "step": 1348 }, { "epoch": 0.05246832550586817, "grad_norm": 0.17886343598365784, "learning_rate": 1.6387757665973559e-06, "loss": 11.4182, "step": 1349 }, { "epoch": 0.05250721974271463, "grad_norm": 0.1549103856086731, "learning_rate": 1.5991555719597207e-06, "loss": 11.4222, "step": 1350 }, { "epoch": 0.05254611397956108, "grad_norm": 0.17105679214000702, "learning_rate": 1.5600163387386124e-06, "loss": 11.45, "step": 1351 }, { "epoch": 0.05258500821640753, "grad_norm": 0.13427035510540009, "learning_rate": 1.5213582582378927e-06, "loss": 11.438, "step": 1352 }, { "epoch": 0.05262390245325399, "grad_norm": 0.16201744973659515, "learning_rate": 1.4831815194096266e-06, "loss": 11.4172, "step": 1353 }, { "epoch": 0.05266279669010045, "grad_norm": 0.1943223476409912, "learning_rate": 1.4454863088532388e-06, "loss": 11.4298, "step": 1354 }, { "epoch": 0.0527016909269469, "grad_norm": 0.18249273300170898, "learning_rate": 1.408272810814515e-06, "loss": 11.4374, "step": 1355 }, { "epoch": 0.05274058516379335, "grad_norm": 0.15066294372081757, "learning_rate": 1.3715412071847345e-06, "loss": 11.4284, "step": 1356 }, { "epoch": 0.05277947940063981, "grad_norm": 0.1240072026848793, "learning_rate": 1.3352916774998281e-06, "loss": 11.453, "step": 1357 }, { "epoch": 0.05281837363748627, "grad_norm": 0.21831557154655457, "learning_rate": 1.299524398939389e-06, "loss": 11.4345, "step": 1358 }, { "epoch": 0.052857267874332724, "grad_norm": 0.16932734847068787, "learning_rate": 1.2642395463259404e-06, "loss": 11.5236, "step": 1359 }, { "epoch": 0.052896162111179174, "grad_norm": 0.22352683544158936, "learning_rate": 1.2294372921239694e-06, "loss": 11.4624, "step": 1360 }, { "epoch": 0.05293505634802563, "grad_norm": 0.1469883918762207, "learning_rate": 1.19511780643915e-06, "loss": 11.4261, "step": 1361 }, { "epoch": 0.05297395058487209, "grad_norm": 0.17142179608345032, "learning_rate": 1.161281257017477e-06, "loss": 11.4227, "step": 1362 }, { "epoch": 0.053012844821718544, "grad_norm": 0.15057890117168427, "learning_rate": 1.1279278092444889e-06, "loss": 11.4453, "step": 1363 }, { "epoch": 0.053051739058564994, "grad_norm": 0.13775911927223206, "learning_rate": 1.0950576261444023e-06, "loss": 11.4371, "step": 1364 }, { "epoch": 0.05309063329541145, "grad_norm": 0.19454172253608704, "learning_rate": 1.062670868379334e-06, "loss": 11.471, "step": 1365 }, { "epoch": 0.05312952753225791, "grad_norm": 0.16760843992233276, "learning_rate": 1.0307676942485689e-06, "loss": 11.4212, "step": 1366 }, { "epoch": 0.053168421769104364, "grad_norm": 0.12229340523481369, "learning_rate": 9.993482596877157e-07, "loss": 11.4386, "step": 1367 }, { "epoch": 0.05320731600595082, "grad_norm": 0.1881038248538971, "learning_rate": 9.684127182679526e-07, "loss": 11.4969, "step": 1368 }, { "epoch": 0.05324621024279727, "grad_norm": 0.2813434898853302, "learning_rate": 9.379612211953492e-07, "loss": 11.5781, "step": 1369 }, { "epoch": 0.05328510447964373, "grad_norm": 0.1275080293416977, "learning_rate": 9.079939173100238e-07, "loss": 11.4293, "step": 1370 }, { "epoch": 0.053323998716490184, "grad_norm": 0.15989451110363007, "learning_rate": 8.785109530854874e-07, "loss": 11.4262, "step": 1371 }, { "epoch": 0.05336289295333664, "grad_norm": 0.18090415000915527, "learning_rate": 8.495124726279002e-07, "loss": 11.4283, "step": 1372 }, { "epoch": 0.0534017871901831, "grad_norm": 0.19085204601287842, "learning_rate": 8.209986176753948e-07, "loss": 11.4415, "step": 1373 }, { "epoch": 0.05344068142702955, "grad_norm": 0.19983118772506714, "learning_rate": 7.929695275973204e-07, "loss": 11.4394, "step": 1374 }, { "epoch": 0.053479575663876004, "grad_norm": 0.3193620443344116, "learning_rate": 7.654253393936439e-07, "loss": 11.4513, "step": 1375 }, { "epoch": 0.05351846990072246, "grad_norm": 0.10505767166614532, "learning_rate": 7.383661876942283e-07, "loss": 11.4344, "step": 1376 }, { "epoch": 0.05355736413756892, "grad_norm": 0.12139089405536652, "learning_rate": 7.117922047581549e-07, "loss": 11.4365, "step": 1377 }, { "epoch": 0.05359625837441537, "grad_norm": 0.22217227518558502, "learning_rate": 6.857035204731688e-07, "loss": 11.4517, "step": 1378 }, { "epoch": 0.053635152611261824, "grad_norm": 0.12010081112384796, "learning_rate": 6.601002623549346e-07, "loss": 11.4011, "step": 1379 }, { "epoch": 0.05367404684810828, "grad_norm": 0.1507730633020401, "learning_rate": 6.349825555464706e-07, "loss": 11.4185, "step": 1380 }, { "epoch": 0.05371294108495474, "grad_norm": 0.19225631654262543, "learning_rate": 6.103505228175377e-07, "loss": 11.4336, "step": 1381 }, { "epoch": 0.053751835321801195, "grad_norm": 0.1865839958190918, "learning_rate": 5.862042845640403e-07, "loss": 11.4278, "step": 1382 }, { "epoch": 0.053790729558647644, "grad_norm": 0.16153313219547272, "learning_rate": 5.625439588074044e-07, "loss": 11.4257, "step": 1383 }, { "epoch": 0.0538296237954941, "grad_norm": 0.14634691178798676, "learning_rate": 5.393696611940225e-07, "loss": 11.4201, "step": 1384 }, { "epoch": 0.05386851803234056, "grad_norm": 0.2850719690322876, "learning_rate": 5.166815049947204e-07, "loss": 11.4702, "step": 1385 }, { "epoch": 0.053907412269187015, "grad_norm": 0.15424256026744843, "learning_rate": 4.944796011041475e-07, "loss": 11.4015, "step": 1386 }, { "epoch": 0.05394630650603347, "grad_norm": 0.13952219486236572, "learning_rate": 4.727640580402537e-07, "loss": 11.411, "step": 1387 }, { "epoch": 0.05398520074287992, "grad_norm": 0.1746031492948532, "learning_rate": 4.5153498194380195e-07, "loss": 11.4428, "step": 1388 }, { "epoch": 0.05402409497972638, "grad_norm": 0.20047025382518768, "learning_rate": 4.307924765777682e-07, "loss": 11.4867, "step": 1389 }, { "epoch": 0.054062989216572835, "grad_norm": 0.1938544064760208, "learning_rate": 4.105366433269087e-07, "loss": 11.4689, "step": 1390 }, { "epoch": 0.05410188345341929, "grad_norm": 0.20457126200199127, "learning_rate": 3.9076758119722666e-07, "loss": 11.4573, "step": 1391 }, { "epoch": 0.05414077769026574, "grad_norm": 0.12991569936275482, "learning_rate": 3.714853868154955e-07, "loss": 11.4399, "step": 1392 }, { "epoch": 0.0541796719271122, "grad_norm": 0.2232292741537094, "learning_rate": 3.5269015442878083e-07, "loss": 11.4935, "step": 1393 }, { "epoch": 0.054218566163958655, "grad_norm": 0.15414465963840485, "learning_rate": 3.343819759040079e-07, "loss": 11.4398, "step": 1394 }, { "epoch": 0.05425746040080511, "grad_norm": 0.17479385435581207, "learning_rate": 3.165609407274617e-07, "loss": 11.4745, "step": 1395 }, { "epoch": 0.05429635463765157, "grad_norm": 0.15992434322834015, "learning_rate": 2.9922713600439854e-07, "loss": 11.4093, "step": 1396 }, { "epoch": 0.05433524887449802, "grad_norm": 0.1427340805530548, "learning_rate": 2.82380646458591e-07, "loss": 11.4472, "step": 1397 }, { "epoch": 0.054374143111344475, "grad_norm": 0.15816694498062134, "learning_rate": 2.6602155443195e-07, "loss": 11.4177, "step": 1398 }, { "epoch": 0.05441303734819093, "grad_norm": 0.14333859086036682, "learning_rate": 2.501499398840479e-07, "loss": 11.427, "step": 1399 }, { "epoch": 0.05445193158503739, "grad_norm": 0.12002553045749664, "learning_rate": 2.3476588039181845e-07, "loss": 11.4343, "step": 1400 }, { "epoch": 0.054490825821883845, "grad_norm": 0.15301531553268433, "learning_rate": 2.1986945114911283e-07, "loss": 11.4611, "step": 1401 }, { "epoch": 0.054529720058730295, "grad_norm": 0.20533597469329834, "learning_rate": 2.054607249663665e-07, "loss": 11.4376, "step": 1402 }, { "epoch": 0.05456861429557675, "grad_norm": 0.1993454545736313, "learning_rate": 1.915397722702217e-07, "loss": 11.4586, "step": 1403 }, { "epoch": 0.05460750853242321, "grad_norm": 0.12486769258975983, "learning_rate": 1.7810666110318342e-07, "loss": 11.4282, "step": 1404 }, { "epoch": 0.054646402769269666, "grad_norm": 0.13480772078037262, "learning_rate": 1.6516145712333064e-07, "loss": 11.4318, "step": 1405 }, { "epoch": 0.054685297006116115, "grad_norm": 0.16604341566562653, "learning_rate": 1.5270422360391668e-07, "loss": 11.4172, "step": 1406 }, { "epoch": 0.05472419124296257, "grad_norm": 0.17657136917114258, "learning_rate": 1.4073502143313598e-07, "loss": 11.4706, "step": 1407 }, { "epoch": 0.05476308547980903, "grad_norm": 0.19776944816112518, "learning_rate": 1.2925390911379121e-07, "loss": 11.4629, "step": 1408 }, { "epoch": 0.054801979716655486, "grad_norm": 0.18015769124031067, "learning_rate": 1.1826094276298216e-07, "loss": 11.4426, "step": 1409 }, { "epoch": 0.05484087395350194, "grad_norm": 0.15199075639247894, "learning_rate": 1.0775617611189503e-07, "loss": 11.4482, "step": 1410 }, { "epoch": 0.05487976819034839, "grad_norm": 0.14073902368545532, "learning_rate": 9.773966050549143e-08, "loss": 11.4773, "step": 1411 }, { "epoch": 0.05491866242719485, "grad_norm": 0.10297466069459915, "learning_rate": 8.821144490225308e-08, "loss": 11.463, "step": 1412 }, { "epoch": 0.054957556664041306, "grad_norm": 0.19450917840003967, "learning_rate": 7.917157587399304e-08, "loss": 11.4102, "step": 1413 }, { "epoch": 0.05499645090088776, "grad_norm": 0.16901682317256927, "learning_rate": 7.06200976055782e-08, "loss": 11.4469, "step": 1414 }, { "epoch": 0.05503534513773422, "grad_norm": 0.15647615492343903, "learning_rate": 6.255705189471828e-08, "loss": 11.4383, "step": 1415 }, { "epoch": 0.05507423937458067, "grad_norm": 0.11146261543035507, "learning_rate": 5.498247815179936e-08, "loss": 11.4292, "step": 1416 }, { "epoch": 0.055113133611427126, "grad_norm": 0.250475138425827, "learning_rate": 4.789641339963957e-08, "loss": 11.5614, "step": 1417 }, { "epoch": 0.05515202784827358, "grad_norm": 0.14612603187561035, "learning_rate": 4.129889227334483e-08, "loss": 11.4791, "step": 1418 }, { "epoch": 0.05519092208512004, "grad_norm": 0.1520339548587799, "learning_rate": 3.5189947020142224e-08, "loss": 11.41, "step": 1419 }, { "epoch": 0.05522981632196649, "grad_norm": 0.16020874679088593, "learning_rate": 2.9569607499180252e-08, "loss": 11.4201, "step": 1420 }, { "epoch": 0.055268710558812946, "grad_norm": 0.1421998292207718, "learning_rate": 2.4437901181439958e-08, "loss": 11.4428, "step": 1421 }, { "epoch": 0.0553076047956594, "grad_norm": 0.13697375357151031, "learning_rate": 1.9794853149557314e-08, "loss": 11.4406, "step": 1422 }, { "epoch": 0.05534649903250586, "grad_norm": 0.12090425193309784, "learning_rate": 1.564048609771218e-08, "loss": 11.4115, "step": 1423 }, { "epoch": 0.055385393269352316, "grad_norm": 0.21023690700531006, "learning_rate": 1.1974820331517312e-08, "loss": 11.4664, "step": 1424 }, { "epoch": 0.055424287506198766, "grad_norm": 0.1580401510000229, "learning_rate": 8.797873767951714e-09, "loss": 11.44, "step": 1425 }, { "epoch": 0.05546318174304522, "grad_norm": 0.147861510515213, "learning_rate": 6.109661935205236e-09, "loss": 11.4288, "step": 1426 }, { "epoch": 0.05550207597989168, "grad_norm": 0.14827421307563782, "learning_rate": 3.9101979726674505e-09, "loss": 11.4429, "step": 1427 }, { "epoch": 0.055540970216738136, "grad_norm": 0.170853391289711, "learning_rate": 2.1994926308277486e-09, "loss": 11.4324, "step": 1428 }, { "epoch": 0.05557986445358459, "grad_norm": 0.1428116112947464, "learning_rate": 9.775542712309182e-10, "loss": 11.423, "step": 1429 }, { "epoch": 0.05561875869043104, "grad_norm": 0.1336054503917694, "learning_rate": 2.443888664327432e-10, "loss": 11.4468, "step": 1430 }, { "epoch": 0.0556576529272775, "grad_norm": 0.17747704684734344, "learning_rate": 0.0, "loss": 11.4313, "step": 1431 } ], "logging_steps": 1, "max_steps": 1431, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 358, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 179098141851648.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }