diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24148 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9997083272568177, + "eval_steps": 200, + "global_step": 3428, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005833454863642993, + "grad_norm": 17.78308868408203, + "learning_rate": 0.0, + "loss": 5.8378, + "step": 1 + }, + { + "epoch": 0.0011666909727285986, + "grad_norm": 5.597744464874268, + "learning_rate": 7.525749891599529e-06, + "loss": 2.3217, + "step": 2 + }, + { + "epoch": 0.0017500364590928978, + "grad_norm": 9.02523136138916, + "learning_rate": 1.192803136799156e-05, + "loss": 2.1816, + "step": 3 + }, + { + "epoch": 0.002333381945457197, + "grad_norm": 3.38098406791687, + "learning_rate": 1.5051499783199057e-05, + "loss": 1.7371, + "step": 4 + }, + { + "epoch": 0.002916727431821496, + "grad_norm": 3.2741100788116455, + "learning_rate": 1.7474250108400467e-05, + "loss": 1.6348, + "step": 5 + }, + { + "epoch": 0.0035000729181857955, + "grad_norm": 3.280085563659668, + "learning_rate": 1.945378125959109e-05, + "loss": 1.6431, + "step": 6 + }, + { + "epoch": 0.004083418404550095, + "grad_norm": 2.7186739444732666, + "learning_rate": 2.1127451000356418e-05, + "loss": 1.452, + "step": 7 + }, + { + "epoch": 0.004666763890914394, + "grad_norm": 2.5042686462402344, + "learning_rate": 2.2577249674798584e-05, + "loss": 1.3986, + "step": 8 + }, + { + "epoch": 0.005250109377278694, + "grad_norm": 2.6429643630981445, + "learning_rate": 2.385606273598312e-05, + "loss": 1.5973, + "step": 9 + }, + { + "epoch": 0.005833454863642992, + "grad_norm": 2.739990711212158, + "learning_rate": 2.4999999999999998e-05, + "loss": 1.4543, + "step": 10 + }, + { + "epoch": 0.006416800350007292, + "grad_norm": 2.085124969482422, + "learning_rate": 2.6034817128955623e-05, + "loss": 1.1983, + "step": 11 + }, + { + "epoch": 0.007000145836371591, + "grad_norm": 2.3114278316497803, + "learning_rate": 2.6979531151190617e-05, + "loss": 1.5163, + "step": 12 + }, + { + "epoch": 0.0075834913227358905, + "grad_norm": 2.1083810329437256, + "learning_rate": 2.7848583807670913e-05, + "loss": 1.3752, + "step": 13 + }, + { + "epoch": 0.00816683680910019, + "grad_norm": 2.667152166366577, + "learning_rate": 2.8653200891955945e-05, + "loss": 1.3118, + "step": 14 + }, + { + "epoch": 0.008750182295464488, + "grad_norm": 2.096151113510132, + "learning_rate": 2.940228147639203e-05, + "loss": 1.1005, + "step": 15 + }, + { + "epoch": 0.009333527781828789, + "grad_norm": 2.579360246658325, + "learning_rate": 3.0102999566398115e-05, + "loss": 1.5375, + "step": 16 + }, + { + "epoch": 0.009916873268193087, + "grad_norm": 2.0573806762695312, + "learning_rate": 3.076122303445685e-05, + "loss": 1.3534, + "step": 17 + }, + { + "epoch": 0.010500218754557387, + "grad_norm": 1.6891591548919678, + "learning_rate": 3.1381812627582646e-05, + "loss": 1.1602, + "step": 18 + }, + { + "epoch": 0.011083564240921686, + "grad_norm": 1.654665470123291, + "learning_rate": 3.1968840023820715e-05, + "loss": 1.2598, + "step": 19 + }, + { + "epoch": 0.011666909727285985, + "grad_norm": 1.51304030418396, + "learning_rate": 3.2525749891599525e-05, + "loss": 1.3738, + "step": 20 + }, + { + "epoch": 0.012250255213650285, + "grad_norm": 1.7645764350891113, + "learning_rate": 3.305548236834798e-05, + "loss": 1.3982, + "step": 21 + }, + { + "epoch": 0.012833600700014583, + "grad_norm": 2.0786468982696533, + "learning_rate": 3.3560567020555153e-05, + "loss": 1.2424, + "step": 22 + }, + { + "epoch": 0.013416946186378884, + "grad_norm": 1.5325442552566528, + "learning_rate": 3.404319590043982e-05, + "loss": 1.2803, + "step": 23 + }, + { + "epoch": 0.014000291672743182, + "grad_norm": 1.5579572916030884, + "learning_rate": 3.450528104279015e-05, + "loss": 1.4129, + "step": 24 + }, + { + "epoch": 0.01458363715910748, + "grad_norm": 1.6682405471801758, + "learning_rate": 3.4948500216800935e-05, + "loss": 1.1793, + "step": 25 + }, + { + "epoch": 0.015166982645471781, + "grad_norm": 1.558914303779602, + "learning_rate": 3.537433369927044e-05, + "loss": 1.2917, + "step": 26 + }, + { + "epoch": 0.01575032813183608, + "grad_norm": 2.367410182952881, + "learning_rate": 3.578409410397468e-05, + "loss": 1.3104, + "step": 27 + }, + { + "epoch": 0.01633367361820038, + "grad_norm": 1.7541717290878296, + "learning_rate": 3.6178950783555475e-05, + "loss": 1.1249, + "step": 28 + }, + { + "epoch": 0.016917019104564678, + "grad_norm": 1.3140240907669067, + "learning_rate": 3.65599499474739e-05, + "loss": 1.0676, + "step": 29 + }, + { + "epoch": 0.017500364590928977, + "grad_norm": 2.0365586280822754, + "learning_rate": 3.6928031367991554e-05, + "loss": 1.2787, + "step": 30 + }, + { + "epoch": 0.018083710077293275, + "grad_norm": 1.875240683555603, + "learning_rate": 3.728404234585681e-05, + "loss": 1.2876, + "step": 31 + }, + { + "epoch": 0.018667055563657577, + "grad_norm": 1.6128060817718506, + "learning_rate": 3.762874945799765e-05, + "loss": 1.1359, + "step": 32 + }, + { + "epoch": 0.019250401050021876, + "grad_norm": 1.761427640914917, + "learning_rate": 3.796284849694718e-05, + "loss": 1.2898, + "step": 33 + }, + { + "epoch": 0.019833746536386174, + "grad_norm": 1.8005716800689697, + "learning_rate": 3.8286972926056376e-05, + "loss": 1.0982, + "step": 34 + }, + { + "epoch": 0.020417092022750473, + "grad_norm": 1.7455655336380005, + "learning_rate": 3.8601701108756885e-05, + "loss": 1.1419, + "step": 35 + }, + { + "epoch": 0.021000437509114775, + "grad_norm": 1.207555890083313, + "learning_rate": 3.890756251918218e-05, + "loss": 1.3689, + "step": 36 + }, + { + "epoch": 0.021583782995479073, + "grad_norm": 1.4690238237380981, + "learning_rate": 3.920504310167487e-05, + "loss": 1.0896, + "step": 37 + }, + { + "epoch": 0.022167128481843372, + "grad_norm": 1.31458580493927, + "learning_rate": 3.949458991542025e-05, + "loss": 1.2124, + "step": 38 + }, + { + "epoch": 0.02275047396820767, + "grad_norm": 1.7508232593536377, + "learning_rate": 3.977661517566247e-05, + "loss": 1.0062, + "step": 39 + }, + { + "epoch": 0.02333381945457197, + "grad_norm": 1.7391034364700317, + "learning_rate": 4.005149978319905e-05, + "loss": 1.1858, + "step": 40 + }, + { + "epoch": 0.02391716494093627, + "grad_norm": 1.479203462600708, + "learning_rate": 4.031959641799338e-05, + "loss": 1.1886, + "step": 41 + }, + { + "epoch": 0.02450051042730057, + "grad_norm": 1.7898560762405396, + "learning_rate": 4.058123225994751e-05, + "loss": 1.204, + "step": 42 + }, + { + "epoch": 0.025083855913664868, + "grad_norm": 1.422057867050171, + "learning_rate": 4.0836711389489654e-05, + "loss": 1.1607, + "step": 43 + }, + { + "epoch": 0.025667201400029167, + "grad_norm": 2.161949872970581, + "learning_rate": 4.108631691215468e-05, + "loss": 1.1471, + "step": 44 + }, + { + "epoch": 0.026250546886393465, + "grad_norm": 1.9058781862258911, + "learning_rate": 4.133031284438358e-05, + "loss": 1.0656, + "step": 45 + }, + { + "epoch": 0.026833892372757767, + "grad_norm": 1.6354972124099731, + "learning_rate": 4.156894579203935e-05, + "loss": 1.0579, + "step": 46 + }, + { + "epoch": 0.027417237859122066, + "grad_norm": 1.6435108184814453, + "learning_rate": 4.180244644839293e-05, + "loss": 1.0281, + "step": 47 + }, + { + "epoch": 0.028000583345486364, + "grad_norm": 1.7230783700942993, + "learning_rate": 4.203103093438968e-05, + "loss": 1.086, + "step": 48 + }, + { + "epoch": 0.028583928831850663, + "grad_norm": 1.4264886379241943, + "learning_rate": 4.2254902000712836e-05, + "loss": 1.1384, + "step": 49 + }, + { + "epoch": 0.02916727431821496, + "grad_norm": 1.6517153978347778, + "learning_rate": 4.247425010840046e-05, + "loss": 1.2282, + "step": 50 + }, + { + "epoch": 0.029750619804579263, + "grad_norm": 2.1615817546844482, + "learning_rate": 4.2689254402448405e-05, + "loss": 1.2475, + "step": 51 + }, + { + "epoch": 0.030333965290943562, + "grad_norm": 1.5791531801223755, + "learning_rate": 4.290008359086998e-05, + "loss": 1.4099, + "step": 52 + }, + { + "epoch": 0.03091731077730786, + "grad_norm": 1.5546684265136719, + "learning_rate": 4.310689674001973e-05, + "loss": 1.2336, + "step": 53 + }, + { + "epoch": 0.03150065626367216, + "grad_norm": 1.5784398317337036, + "learning_rate": 4.330984399557421e-05, + "loss": 1.3104, + "step": 54 + }, + { + "epoch": 0.03208400175003646, + "grad_norm": 1.2873289585113525, + "learning_rate": 4.350906723735609e-05, + "loss": 1.3342, + "step": 55 + }, + { + "epoch": 0.03266734723640076, + "grad_norm": 1.6933963298797607, + "learning_rate": 4.370470067515501e-05, + "loss": 1.1348, + "step": 56 + }, + { + "epoch": 0.033250692722765054, + "grad_norm": 1.5604673624038696, + "learning_rate": 4.3896871391812285e-05, + "loss": 1.3968, + "step": 57 + }, + { + "epoch": 0.033834038209129357, + "grad_norm": 1.5108568668365479, + "learning_rate": 4.408569983907343e-05, + "loss": 1.1997, + "step": 58 + }, + { + "epoch": 0.03441738369549366, + "grad_norm": 3.0541892051696777, + "learning_rate": 4.42713002910536e-05, + "loss": 1.3484, + "step": 59 + }, + { + "epoch": 0.035000729181857954, + "grad_norm": 1.2510807514190674, + "learning_rate": 4.445378125959108e-05, + "loss": 1.3166, + "step": 60 + }, + { + "epoch": 0.035584074668222256, + "grad_norm": 1.6028800010681152, + "learning_rate": 4.463324587526917e-05, + "loss": 1.145, + "step": 61 + }, + { + "epoch": 0.03616742015458655, + "grad_norm": 1.5321674346923828, + "learning_rate": 4.4809792237456346e-05, + "loss": 1.2243, + "step": 62 + }, + { + "epoch": 0.03675076564095085, + "grad_norm": 1.4574953317642212, + "learning_rate": 4.498351373633954e-05, + "loss": 0.9716, + "step": 63 + }, + { + "epoch": 0.037334111127315155, + "grad_norm": 1.2741219997406006, + "learning_rate": 4.515449934959717e-05, + "loss": 1.083, + "step": 64 + }, + { + "epoch": 0.03791745661367945, + "grad_norm": 1.4046497344970703, + "learning_rate": 4.532283391607138e-05, + "loss": 1.1772, + "step": 65 + }, + { + "epoch": 0.03850080210004375, + "grad_norm": 1.3555320501327515, + "learning_rate": 4.548859838854671e-05, + "loss": 0.9133, + "step": 66 + }, + { + "epoch": 0.03908414758640805, + "grad_norm": 1.6886674165725708, + "learning_rate": 4.565187006752065e-05, + "loss": 1.1938, + "step": 67 + }, + { + "epoch": 0.03966749307277235, + "grad_norm": 1.430363416671753, + "learning_rate": 4.581272281765591e-05, + "loss": 1.1537, + "step": 68 + }, + { + "epoch": 0.04025083855913665, + "grad_norm": 1.6385464668273926, + "learning_rate": 4.597122726843138e-05, + "loss": 1.1556, + "step": 69 + }, + { + "epoch": 0.040834184045500946, + "grad_norm": 1.4001507759094238, + "learning_rate": 4.612745100035642e-05, + "loss": 1.1733, + "step": 70 + }, + { + "epoch": 0.04141752953186525, + "grad_norm": 1.5451918840408325, + "learning_rate": 4.628145871797688e-05, + "loss": 1.197, + "step": 71 + }, + { + "epoch": 0.04200087501822955, + "grad_norm": 1.5024776458740234, + "learning_rate": 4.643331241078171e-05, + "loss": 1.3481, + "step": 72 + }, + { + "epoch": 0.042584220504593845, + "grad_norm": 1.5428893566131592, + "learning_rate": 4.658307150301139e-05, + "loss": 1.1476, + "step": 73 + }, + { + "epoch": 0.04316756599095815, + "grad_norm": 1.2959415912628174, + "learning_rate": 4.67307929932744e-05, + "loss": 1.0119, + "step": 74 + }, + { + "epoch": 0.04375091147732244, + "grad_norm": 1.198490023612976, + "learning_rate": 4.687653158479249e-05, + "loss": 1.24, + "step": 75 + }, + { + "epoch": 0.044334256963686744, + "grad_norm": 1.3311015367507935, + "learning_rate": 4.702033980701978e-05, + "loss": 1.1441, + "step": 76 + }, + { + "epoch": 0.044917602450051046, + "grad_norm": 1.4103642702102661, + "learning_rate": 4.716226812931204e-05, + "loss": 1.0517, + "step": 77 + }, + { + "epoch": 0.04550094793641534, + "grad_norm": 1.260272741317749, + "learning_rate": 4.7302365067262006e-05, + "loss": 1.2173, + "step": 78 + }, + { + "epoch": 0.04608429342277964, + "grad_norm": 1.4968616962432861, + "learning_rate": 4.744067728226103e-05, + "loss": 1.0362, + "step": 79 + }, + { + "epoch": 0.04666763890914394, + "grad_norm": 1.2590322494506836, + "learning_rate": 4.757724967479858e-05, + "loss": 1.3106, + "step": 80 + }, + { + "epoch": 0.04725098439550824, + "grad_norm": 1.2166228294372559, + "learning_rate": 4.771212547196624e-05, + "loss": 1.0736, + "step": 81 + }, + { + "epoch": 0.04783432988187254, + "grad_norm": 1.19132399559021, + "learning_rate": 4.7845346309592914e-05, + "loss": 0.9754, + "step": 82 + }, + { + "epoch": 0.04841767536823684, + "grad_norm": 1.3279188871383667, + "learning_rate": 4.7976952309401844e-05, + "loss": 1.1815, + "step": 83 + }, + { + "epoch": 0.04900102085460114, + "grad_norm": 1.5890907049179077, + "learning_rate": 4.810698215154703e-05, + "loss": 1.0381, + "step": 84 + }, + { + "epoch": 0.049584366340965434, + "grad_norm": 1.2219558954238892, + "learning_rate": 4.823547314285732e-05, + "loss": 0.8901, + "step": 85 + }, + { + "epoch": 0.050167711827329736, + "grad_norm": 1.9673529863357544, + "learning_rate": 4.836246128108918e-05, + "loss": 1.087, + "step": 86 + }, + { + "epoch": 0.05075105731369404, + "grad_norm": 1.6241381168365479, + "learning_rate": 4.8487981315465456e-05, + "loss": 1.328, + "step": 87 + }, + { + "epoch": 0.05133440280005833, + "grad_norm": 1.289197325706482, + "learning_rate": 4.8612066803754214e-05, + "loss": 1.3474, + "step": 88 + }, + { + "epoch": 0.051917748286422635, + "grad_norm": 1.4262926578521729, + "learning_rate": 4.873475016612281e-05, + "loss": 1.0049, + "step": 89 + }, + { + "epoch": 0.05250109377278693, + "grad_norm": 1.382620930671692, + "learning_rate": 4.885606273598312e-05, + "loss": 0.94, + "step": 90 + }, + { + "epoch": 0.05308443925915123, + "grad_norm": 1.5160958766937256, + "learning_rate": 4.897603480802733e-05, + "loss": 1.1705, + "step": 91 + }, + { + "epoch": 0.053667784745515534, + "grad_norm": 2.5832581520080566, + "learning_rate": 4.909469568363888e-05, + "loss": 1.2074, + "step": 92 + }, + { + "epoch": 0.05425113023187983, + "grad_norm": 1.540708065032959, + "learning_rate": 4.9212073713848375e-05, + "loss": 1.2703, + "step": 93 + }, + { + "epoch": 0.05483447571824413, + "grad_norm": 1.8622777462005615, + "learning_rate": 4.932819633999246e-05, + "loss": 1.1169, + "step": 94 + }, + { + "epoch": 0.055417821204608426, + "grad_norm": 1.4035429954528809, + "learning_rate": 4.9443090132221186e-05, + "loss": 1.17, + "step": 95 + }, + { + "epoch": 0.05600116669097273, + "grad_norm": 1.4052140712738037, + "learning_rate": 4.9556780825989205e-05, + "loss": 0.9749, + "step": 96 + }, + { + "epoch": 0.05658451217733703, + "grad_norm": 1.3691339492797852, + "learning_rate": 4.9669293356656114e-05, + "loss": 0.9748, + "step": 97 + }, + { + "epoch": 0.057167857663701326, + "grad_norm": 1.2556352615356445, + "learning_rate": 4.978065189231237e-05, + "loss": 0.9405, + "step": 98 + }, + { + "epoch": 0.05775120315006563, + "grad_norm": 1.2853363752365112, + "learning_rate": 4.989087986493874e-05, + "loss": 1.3587, + "step": 99 + }, + { + "epoch": 0.05833454863642992, + "grad_norm": 2.179720640182495, + "learning_rate": 4.9999999999999996e-05, + "loss": 1.181, + "step": 100 + }, + { + "epoch": 0.058917894122794225, + "grad_norm": 1.1849037408828735, + "learning_rate": 5e-05, + "loss": 1.0379, + "step": 101 + }, + { + "epoch": 0.05950123960915853, + "grad_norm": 1.335351586341858, + "learning_rate": 4.998497596153847e-05, + "loss": 1.2294, + "step": 102 + }, + { + "epoch": 0.06008458509552282, + "grad_norm": 1.5505980253219604, + "learning_rate": 4.9969951923076926e-05, + "loss": 0.9985, + "step": 103 + }, + { + "epoch": 0.060667930581887124, + "grad_norm": 1.2492345571517944, + "learning_rate": 4.9954927884615385e-05, + "loss": 1.009, + "step": 104 + }, + { + "epoch": 0.06125127606825142, + "grad_norm": 1.2737586498260498, + "learning_rate": 4.993990384615384e-05, + "loss": 1.075, + "step": 105 + }, + { + "epoch": 0.06183462155461572, + "grad_norm": 1.6333897113800049, + "learning_rate": 4.992487980769231e-05, + "loss": 1.2081, + "step": 106 + }, + { + "epoch": 0.06241796704098002, + "grad_norm": 1.5222417116165161, + "learning_rate": 4.9909855769230774e-05, + "loss": 1.2142, + "step": 107 + }, + { + "epoch": 0.06300131252734432, + "grad_norm": 1.683530569076538, + "learning_rate": 4.989483173076923e-05, + "loss": 0.9966, + "step": 108 + }, + { + "epoch": 0.06358465801370862, + "grad_norm": 1.8551238775253296, + "learning_rate": 4.98798076923077e-05, + "loss": 1.0807, + "step": 109 + }, + { + "epoch": 0.06416800350007291, + "grad_norm": 1.721418857574463, + "learning_rate": 4.9864783653846156e-05, + "loss": 0.907, + "step": 110 + }, + { + "epoch": 0.06475134898643722, + "grad_norm": 1.1599464416503906, + "learning_rate": 4.9849759615384615e-05, + "loss": 1.0121, + "step": 111 + }, + { + "epoch": 0.06533469447280152, + "grad_norm": 1.1434050798416138, + "learning_rate": 4.983473557692308e-05, + "loss": 1.0762, + "step": 112 + }, + { + "epoch": 0.06591803995916581, + "grad_norm": 1.3993638753890991, + "learning_rate": 4.981971153846154e-05, + "loss": 1.2264, + "step": 113 + }, + { + "epoch": 0.06650138544553011, + "grad_norm": 1.278794288635254, + "learning_rate": 4.9804687500000004e-05, + "loss": 1.1514, + "step": 114 + }, + { + "epoch": 0.06708473093189442, + "grad_norm": 4.100466728210449, + "learning_rate": 4.978966346153847e-05, + "loss": 1.1164, + "step": 115 + }, + { + "epoch": 0.06766807641825871, + "grad_norm": 1.2248213291168213, + "learning_rate": 4.977463942307693e-05, + "loss": 1.1503, + "step": 116 + }, + { + "epoch": 0.06825142190462301, + "grad_norm": 1.7549059391021729, + "learning_rate": 4.9759615384615386e-05, + "loss": 1.185, + "step": 117 + }, + { + "epoch": 0.06883476739098732, + "grad_norm": 1.747718334197998, + "learning_rate": 4.9744591346153844e-05, + "loss": 1.1494, + "step": 118 + }, + { + "epoch": 0.06941811287735161, + "grad_norm": 1.4090042114257812, + "learning_rate": 4.972956730769231e-05, + "loss": 1.0752, + "step": 119 + }, + { + "epoch": 0.07000145836371591, + "grad_norm": 1.3040906190872192, + "learning_rate": 4.9714543269230775e-05, + "loss": 0.9461, + "step": 120 + }, + { + "epoch": 0.07058480385008022, + "grad_norm": 1.625506043434143, + "learning_rate": 4.9699519230769233e-05, + "loss": 1.0583, + "step": 121 + }, + { + "epoch": 0.07116814933644451, + "grad_norm": 1.273437261581421, + "learning_rate": 4.968449519230769e-05, + "loss": 1.1448, + "step": 122 + }, + { + "epoch": 0.0717514948228088, + "grad_norm": 1.3089839220046997, + "learning_rate": 4.966947115384616e-05, + "loss": 1.0703, + "step": 123 + }, + { + "epoch": 0.0723348403091731, + "grad_norm": 1.1929200887680054, + "learning_rate": 4.9654447115384616e-05, + "loss": 0.9157, + "step": 124 + }, + { + "epoch": 0.07291818579553741, + "grad_norm": 1.1315945386886597, + "learning_rate": 4.963942307692308e-05, + "loss": 1.2748, + "step": 125 + }, + { + "epoch": 0.0735015312819017, + "grad_norm": 1.4779376983642578, + "learning_rate": 4.962439903846154e-05, + "loss": 1.0475, + "step": 126 + }, + { + "epoch": 0.074084876768266, + "grad_norm": 1.3037022352218628, + "learning_rate": 4.9609375000000005e-05, + "loss": 1.1542, + "step": 127 + }, + { + "epoch": 0.07466822225463031, + "grad_norm": 1.454216480255127, + "learning_rate": 4.959435096153846e-05, + "loss": 1.3286, + "step": 128 + }, + { + "epoch": 0.0752515677409946, + "grad_norm": 1.5513452291488647, + "learning_rate": 4.957932692307692e-05, + "loss": 1.2599, + "step": 129 + }, + { + "epoch": 0.0758349132273589, + "grad_norm": 1.7299277782440186, + "learning_rate": 4.956430288461539e-05, + "loss": 1.2156, + "step": 130 + }, + { + "epoch": 0.07641825871372321, + "grad_norm": 1.2468066215515137, + "learning_rate": 4.9549278846153846e-05, + "loss": 1.0193, + "step": 131 + }, + { + "epoch": 0.0770016042000875, + "grad_norm": 1.1203701496124268, + "learning_rate": 4.953425480769231e-05, + "loss": 1.0451, + "step": 132 + }, + { + "epoch": 0.0775849496864518, + "grad_norm": 1.4089607000350952, + "learning_rate": 4.9519230769230776e-05, + "loss": 1.0041, + "step": 133 + }, + { + "epoch": 0.0781682951728161, + "grad_norm": 1.40877103805542, + "learning_rate": 4.9504206730769235e-05, + "loss": 1.1952, + "step": 134 + }, + { + "epoch": 0.0787516406591804, + "grad_norm": 1.2676079273223877, + "learning_rate": 4.948918269230769e-05, + "loss": 0.9846, + "step": 135 + }, + { + "epoch": 0.0793349861455447, + "grad_norm": 1.3630707263946533, + "learning_rate": 4.947415865384616e-05, + "loss": 1.2925, + "step": 136 + }, + { + "epoch": 0.07991833163190899, + "grad_norm": 1.4193191528320312, + "learning_rate": 4.945913461538462e-05, + "loss": 0.9525, + "step": 137 + }, + { + "epoch": 0.0805016771182733, + "grad_norm": 1.4956103563308716, + "learning_rate": 4.944411057692308e-05, + "loss": 1.0733, + "step": 138 + }, + { + "epoch": 0.0810850226046376, + "grad_norm": 1.4280532598495483, + "learning_rate": 4.942908653846154e-05, + "loss": 0.9752, + "step": 139 + }, + { + "epoch": 0.08166836809100189, + "grad_norm": 1.4108835458755493, + "learning_rate": 4.94140625e-05, + "loss": 1.0615, + "step": 140 + }, + { + "epoch": 0.0822517135773662, + "grad_norm": 1.3212484121322632, + "learning_rate": 4.9399038461538464e-05, + "loss": 0.957, + "step": 141 + }, + { + "epoch": 0.0828350590637305, + "grad_norm": 1.3035906553268433, + "learning_rate": 4.938401442307692e-05, + "loss": 1.0273, + "step": 142 + }, + { + "epoch": 0.08341840455009479, + "grad_norm": 1.9610090255737305, + "learning_rate": 4.936899038461539e-05, + "loss": 1.2096, + "step": 143 + }, + { + "epoch": 0.0840017500364591, + "grad_norm": 1.5158385038375854, + "learning_rate": 4.935396634615385e-05, + "loss": 1.056, + "step": 144 + }, + { + "epoch": 0.0845850955228234, + "grad_norm": 1.9964501857757568, + "learning_rate": 4.933894230769231e-05, + "loss": 1.0332, + "step": 145 + }, + { + "epoch": 0.08516844100918769, + "grad_norm": 1.5921212434768677, + "learning_rate": 4.932391826923077e-05, + "loss": 1.0627, + "step": 146 + }, + { + "epoch": 0.08575178649555198, + "grad_norm": 1.6017435789108276, + "learning_rate": 4.930889423076923e-05, + "loss": 1.0619, + "step": 147 + }, + { + "epoch": 0.0863351319819163, + "grad_norm": 1.4326040744781494, + "learning_rate": 4.9293870192307694e-05, + "loss": 1.0111, + "step": 148 + }, + { + "epoch": 0.08691847746828059, + "grad_norm": 1.0582656860351562, + "learning_rate": 4.927884615384616e-05, + "loss": 0.978, + "step": 149 + }, + { + "epoch": 0.08750182295464488, + "grad_norm": 1.1291985511779785, + "learning_rate": 4.926382211538462e-05, + "loss": 1.2015, + "step": 150 + }, + { + "epoch": 0.08808516844100919, + "grad_norm": 1.6709433794021606, + "learning_rate": 4.924879807692308e-05, + "loss": 1.086, + "step": 151 + }, + { + "epoch": 0.08866851392737349, + "grad_norm": 1.3261191844940186, + "learning_rate": 4.923377403846154e-05, + "loss": 1.2622, + "step": 152 + }, + { + "epoch": 0.08925185941373778, + "grad_norm": 1.3250443935394287, + "learning_rate": 4.921875e-05, + "loss": 0.9489, + "step": 153 + }, + { + "epoch": 0.08983520490010209, + "grad_norm": 1.3647021055221558, + "learning_rate": 4.9203725961538466e-05, + "loss": 1.0411, + "step": 154 + }, + { + "epoch": 0.09041855038646639, + "grad_norm": 1.5882066488265991, + "learning_rate": 4.9188701923076924e-05, + "loss": 1.2391, + "step": 155 + }, + { + "epoch": 0.09100189587283068, + "grad_norm": 1.2136874198913574, + "learning_rate": 4.917367788461539e-05, + "loss": 1.0796, + "step": 156 + }, + { + "epoch": 0.09158524135919498, + "grad_norm": 1.3928202390670776, + "learning_rate": 4.915865384615385e-05, + "loss": 1.2077, + "step": 157 + }, + { + "epoch": 0.09216858684555929, + "grad_norm": 1.487825632095337, + "learning_rate": 4.9143629807692306e-05, + "loss": 1.1106, + "step": 158 + }, + { + "epoch": 0.09275193233192358, + "grad_norm": 1.7569891214370728, + "learning_rate": 4.912860576923077e-05, + "loss": 1.2142, + "step": 159 + }, + { + "epoch": 0.09333527781828788, + "grad_norm": 1.2210508584976196, + "learning_rate": 4.911358173076923e-05, + "loss": 1.1436, + "step": 160 + }, + { + "epoch": 0.09391862330465219, + "grad_norm": 1.2036561965942383, + "learning_rate": 4.9098557692307695e-05, + "loss": 1.227, + "step": 161 + }, + { + "epoch": 0.09450196879101648, + "grad_norm": 1.2647464275360107, + "learning_rate": 4.908353365384616e-05, + "loss": 0.9552, + "step": 162 + }, + { + "epoch": 0.09508531427738078, + "grad_norm": 1.2640854120254517, + "learning_rate": 4.906850961538462e-05, + "loss": 1.2703, + "step": 163 + }, + { + "epoch": 0.09566865976374508, + "grad_norm": 1.8966978788375854, + "learning_rate": 4.905348557692308e-05, + "loss": 1.2019, + "step": 164 + }, + { + "epoch": 0.09625200525010938, + "grad_norm": 1.3982689380645752, + "learning_rate": 4.9038461538461536e-05, + "loss": 1.0861, + "step": 165 + }, + { + "epoch": 0.09683535073647367, + "grad_norm": 1.3104490041732788, + "learning_rate": 4.90234375e-05, + "loss": 1.0978, + "step": 166 + }, + { + "epoch": 0.09741869622283797, + "grad_norm": 1.3252923488616943, + "learning_rate": 4.900841346153847e-05, + "loss": 0.984, + "step": 167 + }, + { + "epoch": 0.09800204170920228, + "grad_norm": 1.636171579360962, + "learning_rate": 4.8993389423076925e-05, + "loss": 1.0612, + "step": 168 + }, + { + "epoch": 0.09858538719556657, + "grad_norm": 1.3443262577056885, + "learning_rate": 4.897836538461539e-05, + "loss": 1.0664, + "step": 169 + }, + { + "epoch": 0.09916873268193087, + "grad_norm": 1.2440283298492432, + "learning_rate": 4.896334134615385e-05, + "loss": 1.083, + "step": 170 + }, + { + "epoch": 0.09975207816829518, + "grad_norm": 1.125069499015808, + "learning_rate": 4.894831730769231e-05, + "loss": 1.1206, + "step": 171 + }, + { + "epoch": 0.10033542365465947, + "grad_norm": 1.3264683485031128, + "learning_rate": 4.893329326923077e-05, + "loss": 0.9415, + "step": 172 + }, + { + "epoch": 0.10091876914102377, + "grad_norm": 1.1505907773971558, + "learning_rate": 4.891826923076923e-05, + "loss": 1.0793, + "step": 173 + }, + { + "epoch": 0.10150211462738808, + "grad_norm": 1.1637053489685059, + "learning_rate": 4.89032451923077e-05, + "loss": 0.9331, + "step": 174 + }, + { + "epoch": 0.10208546011375237, + "grad_norm": 1.6416714191436768, + "learning_rate": 4.888822115384616e-05, + "loss": 0.8152, + "step": 175 + }, + { + "epoch": 0.10266880560011667, + "grad_norm": 1.0731501579284668, + "learning_rate": 4.8873197115384614e-05, + "loss": 1.188, + "step": 176 + }, + { + "epoch": 0.10325215108648096, + "grad_norm": 1.5831621885299683, + "learning_rate": 4.885817307692308e-05, + "loss": 1.1591, + "step": 177 + }, + { + "epoch": 0.10383549657284527, + "grad_norm": 1.221384882926941, + "learning_rate": 4.884314903846154e-05, + "loss": 1.0392, + "step": 178 + }, + { + "epoch": 0.10441884205920957, + "grad_norm": 1.2436399459838867, + "learning_rate": 4.8828125e-05, + "loss": 1.0176, + "step": 179 + }, + { + "epoch": 0.10500218754557386, + "grad_norm": 1.6474889516830444, + "learning_rate": 4.881310096153847e-05, + "loss": 1.294, + "step": 180 + }, + { + "epoch": 0.10558553303193817, + "grad_norm": 1.7568507194519043, + "learning_rate": 4.8798076923076926e-05, + "loss": 1.1217, + "step": 181 + }, + { + "epoch": 0.10616887851830246, + "grad_norm": 1.6313364505767822, + "learning_rate": 4.8783052884615385e-05, + "loss": 1.0687, + "step": 182 + }, + { + "epoch": 0.10675222400466676, + "grad_norm": 1.3143326044082642, + "learning_rate": 4.8768028846153843e-05, + "loss": 1.001, + "step": 183 + }, + { + "epoch": 0.10733556949103107, + "grad_norm": 1.7886399030685425, + "learning_rate": 4.875300480769231e-05, + "loss": 1.2089, + "step": 184 + }, + { + "epoch": 0.10791891497739536, + "grad_norm": 1.494966983795166, + "learning_rate": 4.8737980769230774e-05, + "loss": 1.096, + "step": 185 + }, + { + "epoch": 0.10850226046375966, + "grad_norm": 1.311043620109558, + "learning_rate": 4.872295673076923e-05, + "loss": 1.0791, + "step": 186 + }, + { + "epoch": 0.10908560595012397, + "grad_norm": 1.4188566207885742, + "learning_rate": 4.87079326923077e-05, + "loss": 0.8453, + "step": 187 + }, + { + "epoch": 0.10966895143648826, + "grad_norm": 1.2535464763641357, + "learning_rate": 4.8692908653846156e-05, + "loss": 0.9653, + "step": 188 + }, + { + "epoch": 0.11025229692285256, + "grad_norm": 1.2611030340194702, + "learning_rate": 4.8677884615384615e-05, + "loss": 1.2116, + "step": 189 + }, + { + "epoch": 0.11083564240921685, + "grad_norm": 1.33787202835083, + "learning_rate": 4.866286057692308e-05, + "loss": 1.0238, + "step": 190 + }, + { + "epoch": 0.11141898789558116, + "grad_norm": 1.256029486656189, + "learning_rate": 4.864783653846154e-05, + "loss": 1.123, + "step": 191 + }, + { + "epoch": 0.11200233338194546, + "grad_norm": 1.2442421913146973, + "learning_rate": 4.8632812500000004e-05, + "loss": 1.0065, + "step": 192 + }, + { + "epoch": 0.11258567886830975, + "grad_norm": 1.540716290473938, + "learning_rate": 4.861778846153847e-05, + "loss": 1.1194, + "step": 193 + }, + { + "epoch": 0.11316902435467406, + "grad_norm": 1.2217652797698975, + "learning_rate": 4.860276442307692e-05, + "loss": 1.0814, + "step": 194 + }, + { + "epoch": 0.11375236984103836, + "grad_norm": 1.1543757915496826, + "learning_rate": 4.8587740384615386e-05, + "loss": 1.1429, + "step": 195 + }, + { + "epoch": 0.11433571532740265, + "grad_norm": 1.2116531133651733, + "learning_rate": 4.8572716346153845e-05, + "loss": 0.8905, + "step": 196 + }, + { + "epoch": 0.11491906081376696, + "grad_norm": 1.2289031744003296, + "learning_rate": 4.855769230769231e-05, + "loss": 0.9011, + "step": 197 + }, + { + "epoch": 0.11550240630013126, + "grad_norm": 1.6803086996078491, + "learning_rate": 4.8542668269230775e-05, + "loss": 1.0375, + "step": 198 + }, + { + "epoch": 0.11608575178649555, + "grad_norm": 1.5701960325241089, + "learning_rate": 4.8527644230769234e-05, + "loss": 1.0346, + "step": 199 + }, + { + "epoch": 0.11666909727285985, + "grad_norm": 1.2407475709915161, + "learning_rate": 4.851262019230769e-05, + "loss": 0.7745, + "step": 200 + }, + { + "epoch": 0.11666909727285985, + "eval_loss_squad": 1.1932364337146282, + "eval_perplexity": 7.5639171404401075, + "eval_perplexity_reconstruct": 1.8755490788116311, + "step": 200 + }, + { + "epoch": 0.11725244275922415, + "grad_norm": 1.604348063468933, + "learning_rate": 4.849759615384616e-05, + "loss": 1.1338, + "step": 201 + }, + { + "epoch": 0.11783578824558845, + "grad_norm": 1.2575210332870483, + "learning_rate": 4.8482572115384616e-05, + "loss": 1.1509, + "step": 202 + }, + { + "epoch": 0.11841913373195274, + "grad_norm": 1.662980318069458, + "learning_rate": 4.846754807692308e-05, + "loss": 1.1348, + "step": 203 + }, + { + "epoch": 0.11900247921831705, + "grad_norm": 1.3378013372421265, + "learning_rate": 4.845252403846154e-05, + "loss": 1.2898, + "step": 204 + }, + { + "epoch": 0.11958582470468135, + "grad_norm": 1.4811590909957886, + "learning_rate": 4.8437500000000005e-05, + "loss": 0.9847, + "step": 205 + }, + { + "epoch": 0.12016917019104564, + "grad_norm": 1.452512502670288, + "learning_rate": 4.8422475961538464e-05, + "loss": 1.0591, + "step": 206 + }, + { + "epoch": 0.12075251567740995, + "grad_norm": 1.1843714714050293, + "learning_rate": 4.840745192307692e-05, + "loss": 0.7955, + "step": 207 + }, + { + "epoch": 0.12133586116377425, + "grad_norm": 1.3748295307159424, + "learning_rate": 4.839242788461539e-05, + "loss": 1.0937, + "step": 208 + }, + { + "epoch": 0.12191920665013854, + "grad_norm": 1.1598683595657349, + "learning_rate": 4.8377403846153846e-05, + "loss": 1.0785, + "step": 209 + }, + { + "epoch": 0.12250255213650284, + "grad_norm": 1.3770662546157837, + "learning_rate": 4.836237980769231e-05, + "loss": 1.1478, + "step": 210 + }, + { + "epoch": 0.12308589762286715, + "grad_norm": 1.3409414291381836, + "learning_rate": 4.8347355769230776e-05, + "loss": 1.2035, + "step": 211 + }, + { + "epoch": 0.12366924310923144, + "grad_norm": 1.0668489933013916, + "learning_rate": 4.833233173076923e-05, + "loss": 1.148, + "step": 212 + }, + { + "epoch": 0.12425258859559574, + "grad_norm": 1.2233036756515503, + "learning_rate": 4.8317307692307693e-05, + "loss": 0.9074, + "step": 213 + }, + { + "epoch": 0.12483593408196005, + "grad_norm": 1.2284631729125977, + "learning_rate": 4.830228365384616e-05, + "loss": 1.006, + "step": 214 + }, + { + "epoch": 0.12541927956832433, + "grad_norm": 1.0120066404342651, + "learning_rate": 4.828725961538462e-05, + "loss": 1.1474, + "step": 215 + }, + { + "epoch": 0.12600262505468865, + "grad_norm": 1.47971773147583, + "learning_rate": 4.827223557692308e-05, + "loss": 0.7954, + "step": 216 + }, + { + "epoch": 0.12658597054105294, + "grad_norm": 1.3422911167144775, + "learning_rate": 4.825721153846154e-05, + "loss": 1.0716, + "step": 217 + }, + { + "epoch": 0.12716931602741724, + "grad_norm": 1.1010000705718994, + "learning_rate": 4.82421875e-05, + "loss": 1.1334, + "step": 218 + }, + { + "epoch": 0.12775266151378153, + "grad_norm": 1.4309356212615967, + "learning_rate": 4.8227163461538465e-05, + "loss": 1.137, + "step": 219 + }, + { + "epoch": 0.12833600700014583, + "grad_norm": 1.333970308303833, + "learning_rate": 4.821213942307692e-05, + "loss": 1.2948, + "step": 220 + }, + { + "epoch": 0.12891935248651012, + "grad_norm": 1.3190792798995972, + "learning_rate": 4.819711538461539e-05, + "loss": 1.061, + "step": 221 + }, + { + "epoch": 0.12950269797287445, + "grad_norm": 1.3843986988067627, + "learning_rate": 4.818209134615385e-05, + "loss": 1.1526, + "step": 222 + }, + { + "epoch": 0.13008604345923874, + "grad_norm": 1.6656113862991333, + "learning_rate": 4.816706730769231e-05, + "loss": 1.2739, + "step": 223 + }, + { + "epoch": 0.13066938894560304, + "grad_norm": 1.3917752504348755, + "learning_rate": 4.815204326923077e-05, + "loss": 1.1356, + "step": 224 + }, + { + "epoch": 0.13125273443196733, + "grad_norm": 1.3735393285751343, + "learning_rate": 4.813701923076923e-05, + "loss": 0.9553, + "step": 225 + }, + { + "epoch": 0.13183607991833163, + "grad_norm": 1.253967523574829, + "learning_rate": 4.8121995192307695e-05, + "loss": 1.1367, + "step": 226 + }, + { + "epoch": 0.13241942540469592, + "grad_norm": 1.2027031183242798, + "learning_rate": 4.810697115384616e-05, + "loss": 1.129, + "step": 227 + }, + { + "epoch": 0.13300277089106022, + "grad_norm": 1.2082641124725342, + "learning_rate": 4.809194711538462e-05, + "loss": 1.0744, + "step": 228 + }, + { + "epoch": 0.13358611637742454, + "grad_norm": 1.4022235870361328, + "learning_rate": 4.8076923076923084e-05, + "loss": 0.9777, + "step": 229 + }, + { + "epoch": 0.13416946186378884, + "grad_norm": 1.1199394464492798, + "learning_rate": 4.8061899038461535e-05, + "loss": 0.9213, + "step": 230 + }, + { + "epoch": 0.13475280735015313, + "grad_norm": 1.1612290143966675, + "learning_rate": 4.8046875e-05, + "loss": 1.1336, + "step": 231 + }, + { + "epoch": 0.13533615283651743, + "grad_norm": 1.43931245803833, + "learning_rate": 4.8031850961538466e-05, + "loss": 1.2295, + "step": 232 + }, + { + "epoch": 0.13591949832288172, + "grad_norm": 1.4430086612701416, + "learning_rate": 4.8016826923076924e-05, + "loss": 1.1819, + "step": 233 + }, + { + "epoch": 0.13650284380924602, + "grad_norm": 1.1566003561019897, + "learning_rate": 4.800180288461539e-05, + "loss": 0.9493, + "step": 234 + }, + { + "epoch": 0.13708618929561034, + "grad_norm": 1.1817492246627808, + "learning_rate": 4.798677884615385e-05, + "loss": 1.0723, + "step": 235 + }, + { + "epoch": 0.13766953478197463, + "grad_norm": 1.0918781757354736, + "learning_rate": 4.797175480769231e-05, + "loss": 1.1297, + "step": 236 + }, + { + "epoch": 0.13825288026833893, + "grad_norm": 1.2325400114059448, + "learning_rate": 4.795673076923077e-05, + "loss": 1.0465, + "step": 237 + }, + { + "epoch": 0.13883622575470322, + "grad_norm": 1.4473427534103394, + "learning_rate": 4.794170673076923e-05, + "loss": 1.1118, + "step": 238 + }, + { + "epoch": 0.13941957124106752, + "grad_norm": 1.2843005657196045, + "learning_rate": 4.7926682692307696e-05, + "loss": 0.7924, + "step": 239 + }, + { + "epoch": 0.14000291672743181, + "grad_norm": 1.225659728050232, + "learning_rate": 4.791165865384616e-05, + "loss": 0.9887, + "step": 240 + }, + { + "epoch": 0.1405862622137961, + "grad_norm": 1.3597815036773682, + "learning_rate": 4.789663461538462e-05, + "loss": 1.063, + "step": 241 + }, + { + "epoch": 0.14116960770016043, + "grad_norm": 1.2931313514709473, + "learning_rate": 4.788161057692308e-05, + "loss": 1.1008, + "step": 242 + }, + { + "epoch": 0.14175295318652473, + "grad_norm": 1.6043051481246948, + "learning_rate": 4.7866586538461537e-05, + "loss": 1.2153, + "step": 243 + }, + { + "epoch": 0.14233629867288902, + "grad_norm": 1.2612937688827515, + "learning_rate": 4.78515625e-05, + "loss": 1.05, + "step": 244 + }, + { + "epoch": 0.14291964415925332, + "grad_norm": 1.4583909511566162, + "learning_rate": 4.783653846153847e-05, + "loss": 1.0092, + "step": 245 + }, + { + "epoch": 0.1435029896456176, + "grad_norm": 1.4382933378219604, + "learning_rate": 4.7821514423076926e-05, + "loss": 0.8433, + "step": 246 + }, + { + "epoch": 0.1440863351319819, + "grad_norm": 1.1039222478866577, + "learning_rate": 4.780649038461539e-05, + "loss": 0.9546, + "step": 247 + }, + { + "epoch": 0.1446696806183462, + "grad_norm": 1.46884024143219, + "learning_rate": 4.779146634615384e-05, + "loss": 1.1191, + "step": 248 + }, + { + "epoch": 0.14525302610471053, + "grad_norm": 1.221130609512329, + "learning_rate": 4.777644230769231e-05, + "loss": 0.9995, + "step": 249 + }, + { + "epoch": 0.14583637159107482, + "grad_norm": 1.2932945489883423, + "learning_rate": 4.776141826923077e-05, + "loss": 1.0508, + "step": 250 + }, + { + "epoch": 0.14641971707743912, + "grad_norm": 1.174483060836792, + "learning_rate": 4.774639423076923e-05, + "loss": 1.1433, + "step": 251 + }, + { + "epoch": 0.1470030625638034, + "grad_norm": 1.366163969039917, + "learning_rate": 4.77313701923077e-05, + "loss": 1.1716, + "step": 252 + }, + { + "epoch": 0.1475864080501677, + "grad_norm": 1.9801900386810303, + "learning_rate": 4.7716346153846155e-05, + "loss": 0.9884, + "step": 253 + }, + { + "epoch": 0.148169753536532, + "grad_norm": 1.3597514629364014, + "learning_rate": 4.7701322115384614e-05, + "loss": 1.0706, + "step": 254 + }, + { + "epoch": 0.14875309902289632, + "grad_norm": 1.1929199695587158, + "learning_rate": 4.768629807692308e-05, + "loss": 0.996, + "step": 255 + }, + { + "epoch": 0.14933644450926062, + "grad_norm": 1.0896735191345215, + "learning_rate": 4.767127403846154e-05, + "loss": 1.2185, + "step": 256 + }, + { + "epoch": 0.1499197899956249, + "grad_norm": 1.0902819633483887, + "learning_rate": 4.765625e-05, + "loss": 1.3523, + "step": 257 + }, + { + "epoch": 0.1505031354819892, + "grad_norm": 1.1990022659301758, + "learning_rate": 4.764122596153847e-05, + "loss": 1.049, + "step": 258 + }, + { + "epoch": 0.1510864809683535, + "grad_norm": 1.2676620483398438, + "learning_rate": 4.762620192307693e-05, + "loss": 1.2471, + "step": 259 + }, + { + "epoch": 0.1516698264547178, + "grad_norm": 1.0942872762680054, + "learning_rate": 4.7611177884615385e-05, + "loss": 1.0276, + "step": 260 + }, + { + "epoch": 0.1522531719410821, + "grad_norm": 1.886791706085205, + "learning_rate": 4.7596153846153844e-05, + "loss": 0.9001, + "step": 261 + }, + { + "epoch": 0.15283651742744642, + "grad_norm": 1.3173785209655762, + "learning_rate": 4.758112980769231e-05, + "loss": 1.0866, + "step": 262 + }, + { + "epoch": 0.1534198629138107, + "grad_norm": 1.3886914253234863, + "learning_rate": 4.7566105769230774e-05, + "loss": 1.0645, + "step": 263 + }, + { + "epoch": 0.154003208400175, + "grad_norm": 1.5575705766677856, + "learning_rate": 4.755108173076923e-05, + "loss": 1.0723, + "step": 264 + }, + { + "epoch": 0.1545865538865393, + "grad_norm": 1.2715169191360474, + "learning_rate": 4.75360576923077e-05, + "loss": 0.7773, + "step": 265 + }, + { + "epoch": 0.1551698993729036, + "grad_norm": 1.5639898777008057, + "learning_rate": 4.752103365384616e-05, + "loss": 1.0814, + "step": 266 + }, + { + "epoch": 0.1557532448592679, + "grad_norm": 1.5546399354934692, + "learning_rate": 4.7506009615384615e-05, + "loss": 0.9091, + "step": 267 + }, + { + "epoch": 0.1563365903456322, + "grad_norm": 1.1600172519683838, + "learning_rate": 4.749098557692308e-05, + "loss": 1.2764, + "step": 268 + }, + { + "epoch": 0.1569199358319965, + "grad_norm": 1.4420632123947144, + "learning_rate": 4.747596153846154e-05, + "loss": 1.1393, + "step": 269 + }, + { + "epoch": 0.1575032813183608, + "grad_norm": 1.2289047241210938, + "learning_rate": 4.7460937500000004e-05, + "loss": 0.9656, + "step": 270 + }, + { + "epoch": 0.1580866268047251, + "grad_norm": 1.165598750114441, + "learning_rate": 4.744591346153846e-05, + "loss": 0.9472, + "step": 271 + }, + { + "epoch": 0.1586699722910894, + "grad_norm": 1.1388275623321533, + "learning_rate": 4.743088942307692e-05, + "loss": 1.0464, + "step": 272 + }, + { + "epoch": 0.1592533177774537, + "grad_norm": 1.1367133855819702, + "learning_rate": 4.7415865384615386e-05, + "loss": 0.9505, + "step": 273 + }, + { + "epoch": 0.15983666326381799, + "grad_norm": 1.086214542388916, + "learning_rate": 4.7400841346153845e-05, + "loss": 0.8583, + "step": 274 + }, + { + "epoch": 0.1604200087501823, + "grad_norm": 1.464219093322754, + "learning_rate": 4.738581730769231e-05, + "loss": 0.9827, + "step": 275 + }, + { + "epoch": 0.1610033542365466, + "grad_norm": 1.486207365989685, + "learning_rate": 4.7370793269230776e-05, + "loss": 1.0802, + "step": 276 + }, + { + "epoch": 0.1615866997229109, + "grad_norm": 1.265541434288025, + "learning_rate": 4.7355769230769234e-05, + "loss": 1.0812, + "step": 277 + }, + { + "epoch": 0.1621700452092752, + "grad_norm": 1.532962441444397, + "learning_rate": 4.734074519230769e-05, + "loss": 0.9444, + "step": 278 + }, + { + "epoch": 0.1627533906956395, + "grad_norm": 1.329508900642395, + "learning_rate": 4.732572115384616e-05, + "loss": 1.1593, + "step": 279 + }, + { + "epoch": 0.16333673618200378, + "grad_norm": 1.2211445569992065, + "learning_rate": 4.7310697115384616e-05, + "loss": 1.1621, + "step": 280 + }, + { + "epoch": 0.16392008166836808, + "grad_norm": 1.0366151332855225, + "learning_rate": 4.729567307692308e-05, + "loss": 1.0649, + "step": 281 + }, + { + "epoch": 0.1645034271547324, + "grad_norm": 1.4749221801757812, + "learning_rate": 4.728064903846154e-05, + "loss": 1.1227, + "step": 282 + }, + { + "epoch": 0.1650867726410967, + "grad_norm": 1.3342058658599854, + "learning_rate": 4.7265625000000005e-05, + "loss": 1.1781, + "step": 283 + }, + { + "epoch": 0.165670118127461, + "grad_norm": 1.0524542331695557, + "learning_rate": 4.7250600961538464e-05, + "loss": 1.0728, + "step": 284 + }, + { + "epoch": 0.1662534636138253, + "grad_norm": 1.2550618648529053, + "learning_rate": 4.723557692307692e-05, + "loss": 0.9786, + "step": 285 + }, + { + "epoch": 0.16683680910018958, + "grad_norm": 1.6661171913146973, + "learning_rate": 4.722055288461539e-05, + "loss": 1.1991, + "step": 286 + }, + { + "epoch": 0.16742015458655388, + "grad_norm": 1.480756163597107, + "learning_rate": 4.7205528846153846e-05, + "loss": 1.1048, + "step": 287 + }, + { + "epoch": 0.1680035000729182, + "grad_norm": 1.4135624170303345, + "learning_rate": 4.719050480769231e-05, + "loss": 1.1372, + "step": 288 + }, + { + "epoch": 0.1685868455592825, + "grad_norm": 1.5737360715866089, + "learning_rate": 4.717548076923077e-05, + "loss": 1.2462, + "step": 289 + }, + { + "epoch": 0.1691701910456468, + "grad_norm": 1.2390706539154053, + "learning_rate": 4.716045673076923e-05, + "loss": 0.9622, + "step": 290 + }, + { + "epoch": 0.16975353653201108, + "grad_norm": 1.193597435951233, + "learning_rate": 4.7145432692307694e-05, + "loss": 1.0312, + "step": 291 + }, + { + "epoch": 0.17033688201837538, + "grad_norm": 1.2671687602996826, + "learning_rate": 4.713040865384616e-05, + "loss": 1.0976, + "step": 292 + }, + { + "epoch": 0.17092022750473967, + "grad_norm": 1.220468521118164, + "learning_rate": 4.711538461538462e-05, + "loss": 0.8168, + "step": 293 + }, + { + "epoch": 0.17150357299110397, + "grad_norm": 1.7847779989242554, + "learning_rate": 4.710036057692308e-05, + "loss": 1.1623, + "step": 294 + }, + { + "epoch": 0.1720869184774683, + "grad_norm": 1.1684701442718506, + "learning_rate": 4.708533653846154e-05, + "loss": 1.0476, + "step": 295 + }, + { + "epoch": 0.1726702639638326, + "grad_norm": 1.2110151052474976, + "learning_rate": 4.70703125e-05, + "loss": 1.2198, + "step": 296 + }, + { + "epoch": 0.17325360945019688, + "grad_norm": 1.2417210340499878, + "learning_rate": 4.7055288461538465e-05, + "loss": 0.9928, + "step": 297 + }, + { + "epoch": 0.17383695493656118, + "grad_norm": 1.4387229681015015, + "learning_rate": 4.7040264423076924e-05, + "loss": 1.2669, + "step": 298 + }, + { + "epoch": 0.17442030042292547, + "grad_norm": 1.5898375511169434, + "learning_rate": 4.702524038461539e-05, + "loss": 1.055, + "step": 299 + }, + { + "epoch": 0.17500364590928977, + "grad_norm": 1.5764034986495972, + "learning_rate": 4.701021634615385e-05, + "loss": 1.0568, + "step": 300 + }, + { + "epoch": 0.17558699139565406, + "grad_norm": 1.1175717115402222, + "learning_rate": 4.699519230769231e-05, + "loss": 0.9626, + "step": 301 + }, + { + "epoch": 0.17617033688201839, + "grad_norm": 1.0562156438827515, + "learning_rate": 4.698016826923077e-05, + "loss": 1.0173, + "step": 302 + }, + { + "epoch": 0.17675368236838268, + "grad_norm": 1.1503074169158936, + "learning_rate": 4.696514423076923e-05, + "loss": 1.0513, + "step": 303 + }, + { + "epoch": 0.17733702785474698, + "grad_norm": 1.1285890340805054, + "learning_rate": 4.6950120192307695e-05, + "loss": 0.966, + "step": 304 + }, + { + "epoch": 0.17792037334111127, + "grad_norm": 1.0698230266571045, + "learning_rate": 4.693509615384616e-05, + "loss": 1.1029, + "step": 305 + }, + { + "epoch": 0.17850371882747557, + "grad_norm": 1.3292865753173828, + "learning_rate": 4.692007211538462e-05, + "loss": 1.19, + "step": 306 + }, + { + "epoch": 0.17908706431383986, + "grad_norm": 1.1927766799926758, + "learning_rate": 4.690504807692308e-05, + "loss": 1.1935, + "step": 307 + }, + { + "epoch": 0.17967040980020418, + "grad_norm": 2.0182723999023438, + "learning_rate": 4.6890024038461536e-05, + "loss": 1.2596, + "step": 308 + }, + { + "epoch": 0.18025375528656848, + "grad_norm": 1.8118665218353271, + "learning_rate": 4.6875e-05, + "loss": 1.0279, + "step": 309 + }, + { + "epoch": 0.18083710077293277, + "grad_norm": 1.3893784284591675, + "learning_rate": 4.6859975961538466e-05, + "loss": 1.0969, + "step": 310 + }, + { + "epoch": 0.18142044625929707, + "grad_norm": 4.505359172821045, + "learning_rate": 4.6844951923076925e-05, + "loss": 1.04, + "step": 311 + }, + { + "epoch": 0.18200379174566136, + "grad_norm": 1.258991003036499, + "learning_rate": 4.682992788461539e-05, + "loss": 0.9914, + "step": 312 + }, + { + "epoch": 0.18258713723202566, + "grad_norm": 1.0383793115615845, + "learning_rate": 4.681490384615385e-05, + "loss": 1.0399, + "step": 313 + }, + { + "epoch": 0.18317048271838995, + "grad_norm": 1.4799776077270508, + "learning_rate": 4.679987980769231e-05, + "loss": 0.9968, + "step": 314 + }, + { + "epoch": 0.18375382820475428, + "grad_norm": 1.3189499378204346, + "learning_rate": 4.678485576923077e-05, + "loss": 1.0315, + "step": 315 + }, + { + "epoch": 0.18433717369111857, + "grad_norm": 1.2589702606201172, + "learning_rate": 4.676983173076923e-05, + "loss": 0.8627, + "step": 316 + }, + { + "epoch": 0.18492051917748287, + "grad_norm": 1.2101918458938599, + "learning_rate": 4.6754807692307696e-05, + "loss": 1.2105, + "step": 317 + }, + { + "epoch": 0.18550386466384716, + "grad_norm": 1.3174864053726196, + "learning_rate": 4.673978365384616e-05, + "loss": 0.9905, + "step": 318 + }, + { + "epoch": 0.18608721015021146, + "grad_norm": 1.2675038576126099, + "learning_rate": 4.672475961538462e-05, + "loss": 1.0751, + "step": 319 + }, + { + "epoch": 0.18667055563657575, + "grad_norm": 1.8191653490066528, + "learning_rate": 4.670973557692308e-05, + "loss": 1.1342, + "step": 320 + }, + { + "epoch": 0.18725390112294005, + "grad_norm": 1.3822802305221558, + "learning_rate": 4.669471153846154e-05, + "loss": 0.9589, + "step": 321 + }, + { + "epoch": 0.18783724660930437, + "grad_norm": 1.386513590812683, + "learning_rate": 4.66796875e-05, + "loss": 1.1085, + "step": 322 + }, + { + "epoch": 0.18842059209566867, + "grad_norm": 1.1989296674728394, + "learning_rate": 4.666466346153847e-05, + "loss": 1.1038, + "step": 323 + }, + { + "epoch": 0.18900393758203296, + "grad_norm": 1.1725409030914307, + "learning_rate": 4.6649639423076926e-05, + "loss": 0.9671, + "step": 324 + }, + { + "epoch": 0.18958728306839726, + "grad_norm": 1.1871923208236694, + "learning_rate": 4.6634615384615384e-05, + "loss": 0.9863, + "step": 325 + }, + { + "epoch": 0.19017062855476155, + "grad_norm": 1.4059133529663086, + "learning_rate": 4.661959134615384e-05, + "loss": 0.858, + "step": 326 + }, + { + "epoch": 0.19075397404112585, + "grad_norm": 1.4571958780288696, + "learning_rate": 4.660456730769231e-05, + "loss": 1.1032, + "step": 327 + }, + { + "epoch": 0.19133731952749017, + "grad_norm": 1.4414441585540771, + "learning_rate": 4.6589543269230773e-05, + "loss": 1.1884, + "step": 328 + }, + { + "epoch": 0.19192066501385446, + "grad_norm": 1.9027329683303833, + "learning_rate": 4.657451923076923e-05, + "loss": 1.0415, + "step": 329 + }, + { + "epoch": 0.19250401050021876, + "grad_norm": 1.2300493717193604, + "learning_rate": 4.65594951923077e-05, + "loss": 0.8911, + "step": 330 + }, + { + "epoch": 0.19308735598658305, + "grad_norm": 3.4765067100524902, + "learning_rate": 4.6544471153846156e-05, + "loss": 0.9321, + "step": 331 + }, + { + "epoch": 0.19367070147294735, + "grad_norm": 1.4455662965774536, + "learning_rate": 4.6529447115384614e-05, + "loss": 0.9986, + "step": 332 + }, + { + "epoch": 0.19425404695931164, + "grad_norm": 1.371666669845581, + "learning_rate": 4.651442307692308e-05, + "loss": 1.1358, + "step": 333 + }, + { + "epoch": 0.19483739244567594, + "grad_norm": 1.2533602714538574, + "learning_rate": 4.649939903846154e-05, + "loss": 0.9561, + "step": 334 + }, + { + "epoch": 0.19542073793204026, + "grad_norm": 1.152901291847229, + "learning_rate": 4.6484375e-05, + "loss": 1.0428, + "step": 335 + }, + { + "epoch": 0.19600408341840456, + "grad_norm": 1.435378074645996, + "learning_rate": 4.646935096153847e-05, + "loss": 0.9433, + "step": 336 + }, + { + "epoch": 0.19658742890476885, + "grad_norm": 1.480175495147705, + "learning_rate": 4.645432692307693e-05, + "loss": 0.9606, + "step": 337 + }, + { + "epoch": 0.19717077439113315, + "grad_norm": 1.4078121185302734, + "learning_rate": 4.6439302884615386e-05, + "loss": 1.078, + "step": 338 + }, + { + "epoch": 0.19775411987749744, + "grad_norm": 1.3780559301376343, + "learning_rate": 4.6424278846153844e-05, + "loss": 1.1521, + "step": 339 + }, + { + "epoch": 0.19833746536386174, + "grad_norm": 1.5239243507385254, + "learning_rate": 4.640925480769231e-05, + "loss": 0.9855, + "step": 340 + }, + { + "epoch": 0.19892081085022606, + "grad_norm": 1.095953106880188, + "learning_rate": 4.6394230769230775e-05, + "loss": 1.1482, + "step": 341 + }, + { + "epoch": 0.19950415633659035, + "grad_norm": 1.4764552116394043, + "learning_rate": 4.637920673076923e-05, + "loss": 1.0308, + "step": 342 + }, + { + "epoch": 0.20008750182295465, + "grad_norm": 1.2678275108337402, + "learning_rate": 4.636418269230769e-05, + "loss": 1.0498, + "step": 343 + }, + { + "epoch": 0.20067084730931894, + "grad_norm": 1.3208849430084229, + "learning_rate": 4.634915865384616e-05, + "loss": 0.8625, + "step": 344 + }, + { + "epoch": 0.20125419279568324, + "grad_norm": 1.3627556562423706, + "learning_rate": 4.6334134615384615e-05, + "loss": 1.0739, + "step": 345 + }, + { + "epoch": 0.20183753828204753, + "grad_norm": 1.3360272645950317, + "learning_rate": 4.631911057692308e-05, + "loss": 0.9951, + "step": 346 + }, + { + "epoch": 0.20242088376841183, + "grad_norm": 1.2518396377563477, + "learning_rate": 4.630408653846154e-05, + "loss": 1.0037, + "step": 347 + }, + { + "epoch": 0.20300422925477615, + "grad_norm": 1.2477564811706543, + "learning_rate": 4.6289062500000005e-05, + "loss": 1.0325, + "step": 348 + }, + { + "epoch": 0.20358757474114045, + "grad_norm": 0.9390896558761597, + "learning_rate": 4.627403846153846e-05, + "loss": 0.9671, + "step": 349 + }, + { + "epoch": 0.20417092022750474, + "grad_norm": 1.2609106302261353, + "learning_rate": 4.625901442307692e-05, + "loss": 0.9002, + "step": 350 + }, + { + "epoch": 0.20475426571386904, + "grad_norm": 1.3142434358596802, + "learning_rate": 4.624399038461539e-05, + "loss": 1.0176, + "step": 351 + }, + { + "epoch": 0.20533761120023333, + "grad_norm": 2.33760142326355, + "learning_rate": 4.6228966346153845e-05, + "loss": 1.2718, + "step": 352 + }, + { + "epoch": 0.20592095668659763, + "grad_norm": 1.0962882041931152, + "learning_rate": 4.621394230769231e-05, + "loss": 1.1066, + "step": 353 + }, + { + "epoch": 0.20650430217296192, + "grad_norm": 1.1384212970733643, + "learning_rate": 4.6198918269230776e-05, + "loss": 0.986, + "step": 354 + }, + { + "epoch": 0.20708764765932625, + "grad_norm": 1.3480985164642334, + "learning_rate": 4.6183894230769234e-05, + "loss": 0.9396, + "step": 355 + }, + { + "epoch": 0.20767099314569054, + "grad_norm": 1.3062304258346558, + "learning_rate": 4.616887019230769e-05, + "loss": 1.1495, + "step": 356 + }, + { + "epoch": 0.20825433863205484, + "grad_norm": 1.1252844333648682, + "learning_rate": 4.615384615384616e-05, + "loss": 1.2231, + "step": 357 + }, + { + "epoch": 0.20883768411841913, + "grad_norm": 1.3172509670257568, + "learning_rate": 4.613882211538462e-05, + "loss": 1.181, + "step": 358 + }, + { + "epoch": 0.20942102960478343, + "grad_norm": 1.2648638486862183, + "learning_rate": 4.612379807692308e-05, + "loss": 1.2795, + "step": 359 + }, + { + "epoch": 0.21000437509114772, + "grad_norm": 1.266517162322998, + "learning_rate": 4.610877403846154e-05, + "loss": 1.1233, + "step": 360 + }, + { + "epoch": 0.21058772057751204, + "grad_norm": 1.5995312929153442, + "learning_rate": 4.609375e-05, + "loss": 0.9868, + "step": 361 + }, + { + "epoch": 0.21117106606387634, + "grad_norm": 1.338070034980774, + "learning_rate": 4.6078725961538464e-05, + "loss": 1.0835, + "step": 362 + }, + { + "epoch": 0.21175441155024063, + "grad_norm": 1.2133007049560547, + "learning_rate": 4.606370192307692e-05, + "loss": 1.0838, + "step": 363 + }, + { + "epoch": 0.21233775703660493, + "grad_norm": 1.2662042379379272, + "learning_rate": 4.604867788461539e-05, + "loss": 1.0508, + "step": 364 + }, + { + "epoch": 0.21292110252296922, + "grad_norm": 1.1545066833496094, + "learning_rate": 4.6033653846153846e-05, + "loss": 1.0582, + "step": 365 + }, + { + "epoch": 0.21350444800933352, + "grad_norm": 1.2459946870803833, + "learning_rate": 4.601862980769231e-05, + "loss": 1.0589, + "step": 366 + }, + { + "epoch": 0.21408779349569781, + "grad_norm": 1.1584820747375488, + "learning_rate": 4.600360576923077e-05, + "loss": 0.9987, + "step": 367 + }, + { + "epoch": 0.21467113898206214, + "grad_norm": 1.049363136291504, + "learning_rate": 4.598858173076923e-05, + "loss": 1.0067, + "step": 368 + }, + { + "epoch": 0.21525448446842643, + "grad_norm": 1.3197144269943237, + "learning_rate": 4.5973557692307694e-05, + "loss": 1.1545, + "step": 369 + }, + { + "epoch": 0.21583782995479073, + "grad_norm": 1.3010430335998535, + "learning_rate": 4.595853365384616e-05, + "loss": 1.0185, + "step": 370 + }, + { + "epoch": 0.21642117544115502, + "grad_norm": 1.2531121969223022, + "learning_rate": 4.594350961538462e-05, + "loss": 1.1825, + "step": 371 + }, + { + "epoch": 0.21700452092751932, + "grad_norm": 1.7184710502624512, + "learning_rate": 4.592848557692308e-05, + "loss": 1.0196, + "step": 372 + }, + { + "epoch": 0.2175878664138836, + "grad_norm": 1.173525094985962, + "learning_rate": 4.591346153846154e-05, + "loss": 1.2479, + "step": 373 + }, + { + "epoch": 0.21817121190024794, + "grad_norm": 1.450506567955017, + "learning_rate": 4.58984375e-05, + "loss": 1.1017, + "step": 374 + }, + { + "epoch": 0.21875455738661223, + "grad_norm": 1.1033124923706055, + "learning_rate": 4.5883413461538465e-05, + "loss": 0.8721, + "step": 375 + }, + { + "epoch": 0.21933790287297653, + "grad_norm": 1.3852168321609497, + "learning_rate": 4.5868389423076924e-05, + "loss": 1.0496, + "step": 376 + }, + { + "epoch": 0.21992124835934082, + "grad_norm": 1.2315133810043335, + "learning_rate": 4.585336538461539e-05, + "loss": 1.2742, + "step": 377 + }, + { + "epoch": 0.22050459384570512, + "grad_norm": 1.1665183305740356, + "learning_rate": 4.583834134615385e-05, + "loss": 0.8682, + "step": 378 + }, + { + "epoch": 0.2210879393320694, + "grad_norm": 1.1642358303070068, + "learning_rate": 4.5823317307692306e-05, + "loss": 1.0468, + "step": 379 + }, + { + "epoch": 0.2216712848184337, + "grad_norm": 1.1943094730377197, + "learning_rate": 4.580829326923077e-05, + "loss": 0.9771, + "step": 380 + }, + { + "epoch": 0.22225463030479803, + "grad_norm": 1.4384686946868896, + "learning_rate": 4.579326923076923e-05, + "loss": 1.1928, + "step": 381 + }, + { + "epoch": 0.22283797579116232, + "grad_norm": 2.045624256134033, + "learning_rate": 4.5778245192307695e-05, + "loss": 1.1249, + "step": 382 + }, + { + "epoch": 0.22342132127752662, + "grad_norm": 1.4147062301635742, + "learning_rate": 4.576322115384616e-05, + "loss": 1.0644, + "step": 383 + }, + { + "epoch": 0.22400466676389091, + "grad_norm": 1.4192973375320435, + "learning_rate": 4.574819711538462e-05, + "loss": 1.0954, + "step": 384 + }, + { + "epoch": 0.2245880122502552, + "grad_norm": 2.5359246730804443, + "learning_rate": 4.573317307692308e-05, + "loss": 0.9555, + "step": 385 + }, + { + "epoch": 0.2251713577366195, + "grad_norm": 1.3057085275650024, + "learning_rate": 4.5718149038461536e-05, + "loss": 1.2688, + "step": 386 + }, + { + "epoch": 0.2257547032229838, + "grad_norm": 1.5768152475357056, + "learning_rate": 4.5703125e-05, + "loss": 1.0067, + "step": 387 + }, + { + "epoch": 0.22633804870934812, + "grad_norm": 1.3447943925857544, + "learning_rate": 4.5688100961538467e-05, + "loss": 1.1379, + "step": 388 + }, + { + "epoch": 0.22692139419571242, + "grad_norm": 1.4754528999328613, + "learning_rate": 4.5673076923076925e-05, + "loss": 0.8801, + "step": 389 + }, + { + "epoch": 0.2275047396820767, + "grad_norm": 1.4515520334243774, + "learning_rate": 4.565805288461539e-05, + "loss": 1.0797, + "step": 390 + }, + { + "epoch": 0.228088085168441, + "grad_norm": 1.317018747329712, + "learning_rate": 4.564302884615385e-05, + "loss": 1.0002, + "step": 391 + }, + { + "epoch": 0.2286714306548053, + "grad_norm": 1.7866982221603394, + "learning_rate": 4.562800480769231e-05, + "loss": 1.0067, + "step": 392 + }, + { + "epoch": 0.2292547761411696, + "grad_norm": 1.3489786386489868, + "learning_rate": 4.561298076923077e-05, + "loss": 1.0879, + "step": 393 + }, + { + "epoch": 0.22983812162753392, + "grad_norm": 1.4815866947174072, + "learning_rate": 4.559795673076923e-05, + "loss": 1.1511, + "step": 394 + }, + { + "epoch": 0.23042146711389822, + "grad_norm": 1.099830985069275, + "learning_rate": 4.5582932692307696e-05, + "loss": 1.2723, + "step": 395 + }, + { + "epoch": 0.2310048126002625, + "grad_norm": 1.2879087924957275, + "learning_rate": 4.556790865384616e-05, + "loss": 1.0579, + "step": 396 + }, + { + "epoch": 0.2315881580866268, + "grad_norm": 1.2693564891815186, + "learning_rate": 4.5552884615384613e-05, + "loss": 1.2117, + "step": 397 + }, + { + "epoch": 0.2321715035729911, + "grad_norm": 1.5512031316757202, + "learning_rate": 4.553786057692308e-05, + "loss": 0.9932, + "step": 398 + }, + { + "epoch": 0.2327548490593554, + "grad_norm": 2.0312271118164062, + "learning_rate": 4.552283653846154e-05, + "loss": 0.8886, + "step": 399 + }, + { + "epoch": 0.2333381945457197, + "grad_norm": 1.1745553016662598, + "learning_rate": 4.55078125e-05, + "loss": 1.1362, + "step": 400 + }, + { + "epoch": 0.2333381945457197, + "eval_loss_squad": 1.0443613978661597, + "eval_perplexity": 7.789854227908129, + "eval_perplexity_reconstruct": 1.9117095009373144, + "step": 400 + }, + { + "epoch": 0.233921540032084, + "grad_norm": 1.2545403242111206, + "learning_rate": 4.549278846153847e-05, + "loss": 1.1109, + "step": 401 + }, + { + "epoch": 0.2345048855184483, + "grad_norm": 1.2915353775024414, + "learning_rate": 4.5477764423076926e-05, + "loss": 0.9874, + "step": 402 + }, + { + "epoch": 0.2350882310048126, + "grad_norm": 1.3028852939605713, + "learning_rate": 4.5462740384615385e-05, + "loss": 0.9613, + "step": 403 + }, + { + "epoch": 0.2356715764911769, + "grad_norm": 1.5871734619140625, + "learning_rate": 4.544771634615384e-05, + "loss": 1.0638, + "step": 404 + }, + { + "epoch": 0.2362549219775412, + "grad_norm": 1.4773112535476685, + "learning_rate": 4.543269230769231e-05, + "loss": 0.9079, + "step": 405 + }, + { + "epoch": 0.2368382674639055, + "grad_norm": 1.2807551622390747, + "learning_rate": 4.5417668269230774e-05, + "loss": 1.0703, + "step": 406 + }, + { + "epoch": 0.23742161295026978, + "grad_norm": 1.3610466718673706, + "learning_rate": 4.540264423076923e-05, + "loss": 0.962, + "step": 407 + }, + { + "epoch": 0.2380049584366341, + "grad_norm": 1.3228495121002197, + "learning_rate": 4.53876201923077e-05, + "loss": 1.0652, + "step": 408 + }, + { + "epoch": 0.2385883039229984, + "grad_norm": 1.153469443321228, + "learning_rate": 4.5372596153846156e-05, + "loss": 1.2528, + "step": 409 + }, + { + "epoch": 0.2391716494093627, + "grad_norm": 1.5087329149246216, + "learning_rate": 4.5357572115384615e-05, + "loss": 0.9128, + "step": 410 + }, + { + "epoch": 0.239754994895727, + "grad_norm": 1.523759365081787, + "learning_rate": 4.534254807692308e-05, + "loss": 1.2005, + "step": 411 + }, + { + "epoch": 0.2403383403820913, + "grad_norm": 1.2065186500549316, + "learning_rate": 4.532752403846154e-05, + "loss": 0.9159, + "step": 412 + }, + { + "epoch": 0.24092168586845558, + "grad_norm": 1.2266783714294434, + "learning_rate": 4.5312500000000004e-05, + "loss": 1.1309, + "step": 413 + }, + { + "epoch": 0.2415050313548199, + "grad_norm": 1.5053402185440063, + "learning_rate": 4.529747596153847e-05, + "loss": 1.1071, + "step": 414 + }, + { + "epoch": 0.2420883768411842, + "grad_norm": 1.978948712348938, + "learning_rate": 4.528245192307692e-05, + "loss": 1.1895, + "step": 415 + }, + { + "epoch": 0.2426717223275485, + "grad_norm": 1.3853555917739868, + "learning_rate": 4.5267427884615386e-05, + "loss": 1.0861, + "step": 416 + }, + { + "epoch": 0.2432550678139128, + "grad_norm": 1.2731225490570068, + "learning_rate": 4.5252403846153844e-05, + "loss": 0.8939, + "step": 417 + }, + { + "epoch": 0.24383841330027708, + "grad_norm": 1.1784796714782715, + "learning_rate": 4.523737980769231e-05, + "loss": 1.1628, + "step": 418 + }, + { + "epoch": 0.24442175878664138, + "grad_norm": 1.4403506517410278, + "learning_rate": 4.5222355769230775e-05, + "loss": 1.2741, + "step": 419 + }, + { + "epoch": 0.24500510427300567, + "grad_norm": 1.3002524375915527, + "learning_rate": 4.5207331730769233e-05, + "loss": 1.0343, + "step": 420 + }, + { + "epoch": 0.24558844975937, + "grad_norm": 1.323982834815979, + "learning_rate": 4.519230769230769e-05, + "loss": 1.0944, + "step": 421 + }, + { + "epoch": 0.2461717952457343, + "grad_norm": 1.2002545595169067, + "learning_rate": 4.517728365384616e-05, + "loss": 1.0209, + "step": 422 + }, + { + "epoch": 0.2467551407320986, + "grad_norm": 1.1465034484863281, + "learning_rate": 4.5162259615384616e-05, + "loss": 1.0743, + "step": 423 + }, + { + "epoch": 0.24733848621846288, + "grad_norm": 1.2983150482177734, + "learning_rate": 4.514723557692308e-05, + "loss": 1.1408, + "step": 424 + }, + { + "epoch": 0.24792183170482718, + "grad_norm": 1.2283337116241455, + "learning_rate": 4.513221153846154e-05, + "loss": 0.8972, + "step": 425 + }, + { + "epoch": 0.24850517719119147, + "grad_norm": 1.3164033889770508, + "learning_rate": 4.5117187500000005e-05, + "loss": 1.2549, + "step": 426 + }, + { + "epoch": 0.2490885226775558, + "grad_norm": 1.2797776460647583, + "learning_rate": 4.510216346153846e-05, + "loss": 0.9427, + "step": 427 + }, + { + "epoch": 0.2496718681639201, + "grad_norm": 1.0905731916427612, + "learning_rate": 4.508713942307692e-05, + "loss": 1.127, + "step": 428 + }, + { + "epoch": 0.25025521365028436, + "grad_norm": 1.4118478298187256, + "learning_rate": 4.507211538461539e-05, + "loss": 0.9869, + "step": 429 + }, + { + "epoch": 0.25083855913664865, + "grad_norm": 1.125809907913208, + "learning_rate": 4.5057091346153846e-05, + "loss": 1.0308, + "step": 430 + }, + { + "epoch": 0.251421904623013, + "grad_norm": 1.1028636693954468, + "learning_rate": 4.504206730769231e-05, + "loss": 1.0587, + "step": 431 + }, + { + "epoch": 0.2520052501093773, + "grad_norm": 1.59440279006958, + "learning_rate": 4.5027043269230776e-05, + "loss": 1.0767, + "step": 432 + }, + { + "epoch": 0.2525885955957416, + "grad_norm": 1.067054033279419, + "learning_rate": 4.501201923076923e-05, + "loss": 1.1787, + "step": 433 + }, + { + "epoch": 0.2531719410821059, + "grad_norm": 1.1260448694229126, + "learning_rate": 4.499699519230769e-05, + "loss": 1.0111, + "step": 434 + }, + { + "epoch": 0.2537552865684702, + "grad_norm": 1.079969048500061, + "learning_rate": 4.498197115384616e-05, + "loss": 1.0204, + "step": 435 + }, + { + "epoch": 0.2543386320548345, + "grad_norm": 1.1547199487686157, + "learning_rate": 4.496694711538462e-05, + "loss": 0.9295, + "step": 436 + }, + { + "epoch": 0.2549219775411988, + "grad_norm": 1.3175557851791382, + "learning_rate": 4.495192307692308e-05, + "loss": 0.9133, + "step": 437 + }, + { + "epoch": 0.25550532302756307, + "grad_norm": 1.323306918144226, + "learning_rate": 4.493689903846154e-05, + "loss": 0.9118, + "step": 438 + }, + { + "epoch": 0.25608866851392736, + "grad_norm": 1.2758287191390991, + "learning_rate": 4.4921875e-05, + "loss": 1.2725, + "step": 439 + }, + { + "epoch": 0.25667201400029166, + "grad_norm": 1.7028545141220093, + "learning_rate": 4.4906850961538465e-05, + "loss": 1.088, + "step": 440 + }, + { + "epoch": 0.25725535948665595, + "grad_norm": 1.8174322843551636, + "learning_rate": 4.489182692307692e-05, + "loss": 1.1273, + "step": 441 + }, + { + "epoch": 0.25783870497302025, + "grad_norm": 1.1969882249832153, + "learning_rate": 4.487680288461539e-05, + "loss": 1.2244, + "step": 442 + }, + { + "epoch": 0.25842205045938454, + "grad_norm": 0.9966511726379395, + "learning_rate": 4.486177884615385e-05, + "loss": 1.1073, + "step": 443 + }, + { + "epoch": 0.2590053959457489, + "grad_norm": 1.1165422201156616, + "learning_rate": 4.484675480769231e-05, + "loss": 0.9374, + "step": 444 + }, + { + "epoch": 0.2595887414321132, + "grad_norm": 1.374568223953247, + "learning_rate": 4.483173076923077e-05, + "loss": 1.0658, + "step": 445 + }, + { + "epoch": 0.2601720869184775, + "grad_norm": 1.2743726968765259, + "learning_rate": 4.481670673076923e-05, + "loss": 1.1376, + "step": 446 + }, + { + "epoch": 0.2607554324048418, + "grad_norm": 1.2559353113174438, + "learning_rate": 4.4801682692307694e-05, + "loss": 0.9594, + "step": 447 + }, + { + "epoch": 0.2613387778912061, + "grad_norm": 1.220955491065979, + "learning_rate": 4.478665865384616e-05, + "loss": 1.0246, + "step": 448 + }, + { + "epoch": 0.26192212337757037, + "grad_norm": 1.0389074087142944, + "learning_rate": 4.477163461538462e-05, + "loss": 0.973, + "step": 449 + }, + { + "epoch": 0.26250546886393467, + "grad_norm": 1.1613105535507202, + "learning_rate": 4.4756610576923083e-05, + "loss": 1.1086, + "step": 450 + }, + { + "epoch": 0.26308881435029896, + "grad_norm": 1.147077202796936, + "learning_rate": 4.4741586538461535e-05, + "loss": 1.2115, + "step": 451 + }, + { + "epoch": 0.26367215983666326, + "grad_norm": 1.1235055923461914, + "learning_rate": 4.47265625e-05, + "loss": 1.1007, + "step": 452 + }, + { + "epoch": 0.26425550532302755, + "grad_norm": 2.1269114017486572, + "learning_rate": 4.4711538461538466e-05, + "loss": 1.1473, + "step": 453 + }, + { + "epoch": 0.26483885080939185, + "grad_norm": 1.0030604600906372, + "learning_rate": 4.4696514423076924e-05, + "loss": 0.9971, + "step": 454 + }, + { + "epoch": 0.26542219629575614, + "grad_norm": 1.013385534286499, + "learning_rate": 4.468149038461539e-05, + "loss": 0.9949, + "step": 455 + }, + { + "epoch": 0.26600554178212044, + "grad_norm": 1.333153486251831, + "learning_rate": 4.466646634615385e-05, + "loss": 0.9749, + "step": 456 + }, + { + "epoch": 0.2665888872684848, + "grad_norm": 1.3025758266448975, + "learning_rate": 4.4651442307692306e-05, + "loss": 0.961, + "step": 457 + }, + { + "epoch": 0.2671722327548491, + "grad_norm": 1.1991068124771118, + "learning_rate": 4.463641826923077e-05, + "loss": 0.9674, + "step": 458 + }, + { + "epoch": 0.2677555782412134, + "grad_norm": 1.5575584173202515, + "learning_rate": 4.462139423076923e-05, + "loss": 0.9976, + "step": 459 + }, + { + "epoch": 0.26833892372757767, + "grad_norm": 1.234240174293518, + "learning_rate": 4.4606370192307696e-05, + "loss": 0.969, + "step": 460 + }, + { + "epoch": 0.26892226921394197, + "grad_norm": 1.2866865396499634, + "learning_rate": 4.459134615384616e-05, + "loss": 0.7539, + "step": 461 + }, + { + "epoch": 0.26950561470030626, + "grad_norm": 1.3814231157302856, + "learning_rate": 4.457632211538462e-05, + "loss": 1.014, + "step": 462 + }, + { + "epoch": 0.27008896018667056, + "grad_norm": 1.2410728931427002, + "learning_rate": 4.456129807692308e-05, + "loss": 1.1885, + "step": 463 + }, + { + "epoch": 0.27067230567303485, + "grad_norm": 1.1972429752349854, + "learning_rate": 4.4546274038461536e-05, + "loss": 1.1027, + "step": 464 + }, + { + "epoch": 0.27125565115939915, + "grad_norm": 1.2238330841064453, + "learning_rate": 4.453125e-05, + "loss": 0.9025, + "step": 465 + }, + { + "epoch": 0.27183899664576344, + "grad_norm": 1.2489745616912842, + "learning_rate": 4.451622596153847e-05, + "loss": 1.0432, + "step": 466 + }, + { + "epoch": 0.27242234213212774, + "grad_norm": 1.2359799146652222, + "learning_rate": 4.4501201923076925e-05, + "loss": 1.0075, + "step": 467 + }, + { + "epoch": 0.27300568761849203, + "grad_norm": 1.4064029455184937, + "learning_rate": 4.448617788461539e-05, + "loss": 1.1092, + "step": 468 + }, + { + "epoch": 0.2735890331048563, + "grad_norm": 1.2376477718353271, + "learning_rate": 4.447115384615384e-05, + "loss": 1.0766, + "step": 469 + }, + { + "epoch": 0.2741723785912207, + "grad_norm": 1.1279197931289673, + "learning_rate": 4.445612980769231e-05, + "loss": 1.1218, + "step": 470 + }, + { + "epoch": 0.274755724077585, + "grad_norm": 1.5163652896881104, + "learning_rate": 4.444110576923077e-05, + "loss": 0.9557, + "step": 471 + }, + { + "epoch": 0.27533906956394927, + "grad_norm": 1.198194146156311, + "learning_rate": 4.442608173076923e-05, + "loss": 0.8365, + "step": 472 + }, + { + "epoch": 0.27592241505031356, + "grad_norm": 1.205476999282837, + "learning_rate": 4.44110576923077e-05, + "loss": 0.9057, + "step": 473 + }, + { + "epoch": 0.27650576053667786, + "grad_norm": 1.0138362646102905, + "learning_rate": 4.4396033653846155e-05, + "loss": 1.0689, + "step": 474 + }, + { + "epoch": 0.27708910602304215, + "grad_norm": 1.353697657585144, + "learning_rate": 4.4381009615384614e-05, + "loss": 0.9925, + "step": 475 + }, + { + "epoch": 0.27767245150940645, + "grad_norm": 1.3365930318832397, + "learning_rate": 4.436598557692308e-05, + "loss": 0.8991, + "step": 476 + }, + { + "epoch": 0.27825579699577074, + "grad_norm": 1.3026283979415894, + "learning_rate": 4.435096153846154e-05, + "loss": 1.1419, + "step": 477 + }, + { + "epoch": 0.27883914248213504, + "grad_norm": 1.311883807182312, + "learning_rate": 4.43359375e-05, + "loss": 0.9451, + "step": 478 + }, + { + "epoch": 0.27942248796849933, + "grad_norm": 0.950835108757019, + "learning_rate": 4.432091346153847e-05, + "loss": 0.8928, + "step": 479 + }, + { + "epoch": 0.28000583345486363, + "grad_norm": 1.2371882200241089, + "learning_rate": 4.4305889423076927e-05, + "loss": 1.076, + "step": 480 + }, + { + "epoch": 0.2805891789412279, + "grad_norm": 1.6330941915512085, + "learning_rate": 4.4290865384615385e-05, + "loss": 1.2121, + "step": 481 + }, + { + "epoch": 0.2811725244275922, + "grad_norm": 1.3535494804382324, + "learning_rate": 4.4275841346153844e-05, + "loss": 1.1427, + "step": 482 + }, + { + "epoch": 0.2817558699139565, + "grad_norm": 1.4552642107009888, + "learning_rate": 4.426081730769231e-05, + "loss": 1.0305, + "step": 483 + }, + { + "epoch": 0.28233921540032086, + "grad_norm": 0.9442708492279053, + "learning_rate": 4.4245793269230774e-05, + "loss": 1.0597, + "step": 484 + }, + { + "epoch": 0.28292256088668516, + "grad_norm": 1.0428327322006226, + "learning_rate": 4.423076923076923e-05, + "loss": 0.9091, + "step": 485 + }, + { + "epoch": 0.28350590637304945, + "grad_norm": 1.2180997133255005, + "learning_rate": 4.42157451923077e-05, + "loss": 0.9809, + "step": 486 + }, + { + "epoch": 0.28408925185941375, + "grad_norm": 1.153455138206482, + "learning_rate": 4.4200721153846156e-05, + "loss": 1.0152, + "step": 487 + }, + { + "epoch": 0.28467259734577804, + "grad_norm": 1.2968738079071045, + "learning_rate": 4.4185697115384615e-05, + "loss": 1.0907, + "step": 488 + }, + { + "epoch": 0.28525594283214234, + "grad_norm": 1.2618736028671265, + "learning_rate": 4.417067307692308e-05, + "loss": 0.9007, + "step": 489 + }, + { + "epoch": 0.28583928831850663, + "grad_norm": 1.4131522178649902, + "learning_rate": 4.415564903846154e-05, + "loss": 1.0028, + "step": 490 + }, + { + "epoch": 0.28642263380487093, + "grad_norm": 1.2588627338409424, + "learning_rate": 4.4140625000000004e-05, + "loss": 1.1391, + "step": 491 + }, + { + "epoch": 0.2870059792912352, + "grad_norm": 1.4097051620483398, + "learning_rate": 4.412560096153846e-05, + "loss": 1.1168, + "step": 492 + }, + { + "epoch": 0.2875893247775995, + "grad_norm": 0.9398725032806396, + "learning_rate": 4.411057692307692e-05, + "loss": 0.961, + "step": 493 + }, + { + "epoch": 0.2881726702639638, + "grad_norm": 1.3721821308135986, + "learning_rate": 4.4095552884615386e-05, + "loss": 0.8813, + "step": 494 + }, + { + "epoch": 0.2887560157503281, + "grad_norm": 1.4227555990219116, + "learning_rate": 4.4080528846153845e-05, + "loss": 1.2802, + "step": 495 + }, + { + "epoch": 0.2893393612366924, + "grad_norm": 1.5191651582717896, + "learning_rate": 4.406550480769231e-05, + "loss": 1.1544, + "step": 496 + }, + { + "epoch": 0.28992270672305676, + "grad_norm": 1.350059986114502, + "learning_rate": 4.4050480769230775e-05, + "loss": 1.1791, + "step": 497 + }, + { + "epoch": 0.29050605220942105, + "grad_norm": 1.2704778909683228, + "learning_rate": 4.4035456730769234e-05, + "loss": 1.0238, + "step": 498 + }, + { + "epoch": 0.29108939769578535, + "grad_norm": 1.3052599430084229, + "learning_rate": 4.402043269230769e-05, + "loss": 0.8804, + "step": 499 + }, + { + "epoch": 0.29167274318214964, + "grad_norm": 1.4801050424575806, + "learning_rate": 4.400540865384616e-05, + "loss": 1.0356, + "step": 500 + }, + { + "epoch": 0.29225608866851394, + "grad_norm": 1.1770102977752686, + "learning_rate": 4.3990384615384616e-05, + "loss": 0.9618, + "step": 501 + }, + { + "epoch": 0.29283943415487823, + "grad_norm": 1.216208577156067, + "learning_rate": 4.397536057692308e-05, + "loss": 1.1352, + "step": 502 + }, + { + "epoch": 0.2934227796412425, + "grad_norm": 1.1072677373886108, + "learning_rate": 4.396033653846154e-05, + "loss": 1.2635, + "step": 503 + }, + { + "epoch": 0.2940061251276068, + "grad_norm": 1.2480254173278809, + "learning_rate": 4.3945312500000005e-05, + "loss": 1.0547, + "step": 504 + }, + { + "epoch": 0.2945894706139711, + "grad_norm": 1.227055549621582, + "learning_rate": 4.3930288461538464e-05, + "loss": 1.1527, + "step": 505 + }, + { + "epoch": 0.2951728161003354, + "grad_norm": 1.1896963119506836, + "learning_rate": 4.391526442307692e-05, + "loss": 1.0816, + "step": 506 + }, + { + "epoch": 0.2957561615866997, + "grad_norm": 1.480958342552185, + "learning_rate": 4.390024038461539e-05, + "loss": 1.0701, + "step": 507 + }, + { + "epoch": 0.296339507073064, + "grad_norm": 1.4093650579452515, + "learning_rate": 4.3885216346153846e-05, + "loss": 1.1059, + "step": 508 + }, + { + "epoch": 0.2969228525594283, + "grad_norm": 1.1637531518936157, + "learning_rate": 4.387019230769231e-05, + "loss": 1.1347, + "step": 509 + }, + { + "epoch": 0.29750619804579265, + "grad_norm": 1.185320496559143, + "learning_rate": 4.385516826923077e-05, + "loss": 1.0149, + "step": 510 + }, + { + "epoch": 0.29808954353215694, + "grad_norm": 1.3317254781723022, + "learning_rate": 4.384014423076923e-05, + "loss": 0.9834, + "step": 511 + }, + { + "epoch": 0.29867288901852124, + "grad_norm": 1.0526453256607056, + "learning_rate": 4.3825120192307693e-05, + "loss": 0.9779, + "step": 512 + }, + { + "epoch": 0.29925623450488553, + "grad_norm": 1.2846801280975342, + "learning_rate": 4.381009615384616e-05, + "loss": 1.1655, + "step": 513 + }, + { + "epoch": 0.2998395799912498, + "grad_norm": 1.4375282526016235, + "learning_rate": 4.379507211538462e-05, + "loss": 1.1173, + "step": 514 + }, + { + "epoch": 0.3004229254776141, + "grad_norm": 1.0759106874465942, + "learning_rate": 4.378004807692308e-05, + "loss": 0.909, + "step": 515 + }, + { + "epoch": 0.3010062709639784, + "grad_norm": 1.262014389038086, + "learning_rate": 4.376502403846154e-05, + "loss": 1.1499, + "step": 516 + }, + { + "epoch": 0.3015896164503427, + "grad_norm": 1.5526944398880005, + "learning_rate": 4.375e-05, + "loss": 0.975, + "step": 517 + }, + { + "epoch": 0.302172961936707, + "grad_norm": 1.2012171745300293, + "learning_rate": 4.3734975961538465e-05, + "loss": 0.791, + "step": 518 + }, + { + "epoch": 0.3027563074230713, + "grad_norm": 1.248779058456421, + "learning_rate": 4.371995192307692e-05, + "loss": 0.9285, + "step": 519 + }, + { + "epoch": 0.3033396529094356, + "grad_norm": 1.387321949005127, + "learning_rate": 4.370492788461539e-05, + "loss": 1.0779, + "step": 520 + }, + { + "epoch": 0.3039229983957999, + "grad_norm": 1.4328728914260864, + "learning_rate": 4.368990384615385e-05, + "loss": 1.1295, + "step": 521 + }, + { + "epoch": 0.3045063438821642, + "grad_norm": 1.231877326965332, + "learning_rate": 4.367487980769231e-05, + "loss": 0.9938, + "step": 522 + }, + { + "epoch": 0.30508968936852854, + "grad_norm": 1.1031498908996582, + "learning_rate": 4.365985576923077e-05, + "loss": 1.0653, + "step": 523 + }, + { + "epoch": 0.30567303485489283, + "grad_norm": 1.112518310546875, + "learning_rate": 4.364483173076923e-05, + "loss": 1.033, + "step": 524 + }, + { + "epoch": 0.30625638034125713, + "grad_norm": 1.2784297466278076, + "learning_rate": 4.3629807692307695e-05, + "loss": 1.026, + "step": 525 + }, + { + "epoch": 0.3068397258276214, + "grad_norm": 1.298776626586914, + "learning_rate": 4.361478365384616e-05, + "loss": 0.8997, + "step": 526 + }, + { + "epoch": 0.3074230713139857, + "grad_norm": 1.134751319885254, + "learning_rate": 4.359975961538462e-05, + "loss": 1.2168, + "step": 527 + }, + { + "epoch": 0.30800641680035, + "grad_norm": 1.0475013256072998, + "learning_rate": 4.358473557692308e-05, + "loss": 1.1632, + "step": 528 + }, + { + "epoch": 0.3085897622867143, + "grad_norm": 1.1225156784057617, + "learning_rate": 4.3569711538461535e-05, + "loss": 1.0383, + "step": 529 + }, + { + "epoch": 0.3091731077730786, + "grad_norm": 1.3873623609542847, + "learning_rate": 4.35546875e-05, + "loss": 1.0286, + "step": 530 + }, + { + "epoch": 0.3097564532594429, + "grad_norm": 1.9602346420288086, + "learning_rate": 4.3539663461538466e-05, + "loss": 1.1737, + "step": 531 + }, + { + "epoch": 0.3103397987458072, + "grad_norm": 1.5184245109558105, + "learning_rate": 4.3524639423076925e-05, + "loss": 1.0894, + "step": 532 + }, + { + "epoch": 0.3109231442321715, + "grad_norm": 1.3252931833267212, + "learning_rate": 4.350961538461539e-05, + "loss": 1.0596, + "step": 533 + }, + { + "epoch": 0.3115064897185358, + "grad_norm": 1.6967353820800781, + "learning_rate": 4.349459134615385e-05, + "loss": 1.0257, + "step": 534 + }, + { + "epoch": 0.3120898352049001, + "grad_norm": 1.2516505718231201, + "learning_rate": 4.347956730769231e-05, + "loss": 1.3018, + "step": 535 + }, + { + "epoch": 0.3126731806912644, + "grad_norm": 2.050544500350952, + "learning_rate": 4.346454326923077e-05, + "loss": 1.155, + "step": 536 + }, + { + "epoch": 0.3132565261776287, + "grad_norm": 1.6617178916931152, + "learning_rate": 4.344951923076923e-05, + "loss": 0.9574, + "step": 537 + }, + { + "epoch": 0.313839871663993, + "grad_norm": 1.0113744735717773, + "learning_rate": 4.3434495192307696e-05, + "loss": 1.0248, + "step": 538 + }, + { + "epoch": 0.3144232171503573, + "grad_norm": 1.1463007926940918, + "learning_rate": 4.341947115384616e-05, + "loss": 1.0335, + "step": 539 + }, + { + "epoch": 0.3150065626367216, + "grad_norm": 1.0310202836990356, + "learning_rate": 4.340444711538462e-05, + "loss": 0.8483, + "step": 540 + }, + { + "epoch": 0.3155899081230859, + "grad_norm": 1.2262356281280518, + "learning_rate": 4.338942307692308e-05, + "loss": 1.1332, + "step": 541 + }, + { + "epoch": 0.3161732536094502, + "grad_norm": 1.4348859786987305, + "learning_rate": 4.337439903846154e-05, + "loss": 0.9394, + "step": 542 + }, + { + "epoch": 0.3167565990958145, + "grad_norm": 1.4490423202514648, + "learning_rate": 4.3359375e-05, + "loss": 1.1187, + "step": 543 + }, + { + "epoch": 0.3173399445821788, + "grad_norm": 1.3444559574127197, + "learning_rate": 4.334435096153847e-05, + "loss": 1.0358, + "step": 544 + }, + { + "epoch": 0.3179232900685431, + "grad_norm": 1.1789164543151855, + "learning_rate": 4.3329326923076926e-05, + "loss": 0.9231, + "step": 545 + }, + { + "epoch": 0.3185066355549074, + "grad_norm": 1.231382966041565, + "learning_rate": 4.3314302884615384e-05, + "loss": 1.0525, + "step": 546 + }, + { + "epoch": 0.3190899810412717, + "grad_norm": 1.0804648399353027, + "learning_rate": 4.329927884615384e-05, + "loss": 0.9816, + "step": 547 + }, + { + "epoch": 0.31967332652763597, + "grad_norm": 1.2451800107955933, + "learning_rate": 4.328425480769231e-05, + "loss": 0.8803, + "step": 548 + }, + { + "epoch": 0.32025667201400027, + "grad_norm": 1.2045650482177734, + "learning_rate": 4.326923076923077e-05, + "loss": 1.1134, + "step": 549 + }, + { + "epoch": 0.3208400175003646, + "grad_norm": 1.3157379627227783, + "learning_rate": 4.325420673076923e-05, + "loss": 1.2101, + "step": 550 + }, + { + "epoch": 0.3214233629867289, + "grad_norm": 1.2755893468856812, + "learning_rate": 4.32391826923077e-05, + "loss": 0.9826, + "step": 551 + }, + { + "epoch": 0.3220067084730932, + "grad_norm": 1.2792096138000488, + "learning_rate": 4.3224158653846156e-05, + "loss": 0.8629, + "step": 552 + }, + { + "epoch": 0.3225900539594575, + "grad_norm": 1.1155221462249756, + "learning_rate": 4.3209134615384614e-05, + "loss": 1.1298, + "step": 553 + }, + { + "epoch": 0.3231733994458218, + "grad_norm": 1.3425167798995972, + "learning_rate": 4.319411057692308e-05, + "loss": 1.0914, + "step": 554 + }, + { + "epoch": 0.3237567449321861, + "grad_norm": 2.2145228385925293, + "learning_rate": 4.317908653846154e-05, + "loss": 0.9944, + "step": 555 + }, + { + "epoch": 0.3243400904185504, + "grad_norm": 1.3437769412994385, + "learning_rate": 4.31640625e-05, + "loss": 1.0431, + "step": 556 + }, + { + "epoch": 0.3249234359049147, + "grad_norm": 1.253940463066101, + "learning_rate": 4.314903846153847e-05, + "loss": 0.8568, + "step": 557 + }, + { + "epoch": 0.325506781391279, + "grad_norm": 1.6543011665344238, + "learning_rate": 4.313401442307693e-05, + "loss": 0.9353, + "step": 558 + }, + { + "epoch": 0.32609012687764327, + "grad_norm": 1.3182803392410278, + "learning_rate": 4.3118990384615385e-05, + "loss": 0.8821, + "step": 559 + }, + { + "epoch": 0.32667347236400757, + "grad_norm": 1.3994224071502686, + "learning_rate": 4.3103966346153844e-05, + "loss": 0.9704, + "step": 560 + }, + { + "epoch": 0.32725681785037186, + "grad_norm": 1.1596894264221191, + "learning_rate": 4.308894230769231e-05, + "loss": 1.1954, + "step": 561 + }, + { + "epoch": 0.32784016333673616, + "grad_norm": 1.1696090698242188, + "learning_rate": 4.3073918269230774e-05, + "loss": 0.994, + "step": 562 + }, + { + "epoch": 0.3284235088231005, + "grad_norm": 1.0677425861358643, + "learning_rate": 4.305889423076923e-05, + "loss": 0.9962, + "step": 563 + }, + { + "epoch": 0.3290068543094648, + "grad_norm": 1.27321457862854, + "learning_rate": 4.304387019230769e-05, + "loss": 1.1947, + "step": 564 + }, + { + "epoch": 0.3295901997958291, + "grad_norm": 1.2170658111572266, + "learning_rate": 4.302884615384616e-05, + "loss": 1.2042, + "step": 565 + }, + { + "epoch": 0.3301735452821934, + "grad_norm": 1.4280612468719482, + "learning_rate": 4.3013822115384615e-05, + "loss": 1.3016, + "step": 566 + }, + { + "epoch": 0.3307568907685577, + "grad_norm": 1.2479727268218994, + "learning_rate": 4.299879807692308e-05, + "loss": 1.172, + "step": 567 + }, + { + "epoch": 0.331340236254922, + "grad_norm": 1.192344069480896, + "learning_rate": 4.298377403846154e-05, + "loss": 1.0476, + "step": 568 + }, + { + "epoch": 0.3319235817412863, + "grad_norm": 1.4639259576797485, + "learning_rate": 4.2968750000000004e-05, + "loss": 0.8669, + "step": 569 + }, + { + "epoch": 0.3325069272276506, + "grad_norm": 1.4050641059875488, + "learning_rate": 4.295372596153846e-05, + "loss": 1.1723, + "step": 570 + }, + { + "epoch": 0.33309027271401487, + "grad_norm": 1.117199420928955, + "learning_rate": 4.293870192307692e-05, + "loss": 1.0075, + "step": 571 + }, + { + "epoch": 0.33367361820037916, + "grad_norm": 1.2617186307907104, + "learning_rate": 4.2923677884615387e-05, + "loss": 1.1577, + "step": 572 + }, + { + "epoch": 0.33425696368674346, + "grad_norm": 1.269479513168335, + "learning_rate": 4.2908653846153845e-05, + "loss": 1.1862, + "step": 573 + }, + { + "epoch": 0.33484030917310775, + "grad_norm": 1.3484537601470947, + "learning_rate": 4.289362980769231e-05, + "loss": 1.0458, + "step": 574 + }, + { + "epoch": 0.33542365465947205, + "grad_norm": 1.2798513174057007, + "learning_rate": 4.2878605769230776e-05, + "loss": 1.1412, + "step": 575 + }, + { + "epoch": 0.3360070001458364, + "grad_norm": 1.1217023134231567, + "learning_rate": 4.2863581730769234e-05, + "loss": 0.8785, + "step": 576 + }, + { + "epoch": 0.3365903456322007, + "grad_norm": 1.2047514915466309, + "learning_rate": 4.284855769230769e-05, + "loss": 1.0627, + "step": 577 + }, + { + "epoch": 0.337173691118565, + "grad_norm": 1.3945984840393066, + "learning_rate": 4.283353365384616e-05, + "loss": 1.122, + "step": 578 + }, + { + "epoch": 0.3377570366049293, + "grad_norm": 1.466818928718567, + "learning_rate": 4.2818509615384616e-05, + "loss": 1.0109, + "step": 579 + }, + { + "epoch": 0.3383403820912936, + "grad_norm": 1.4969720840454102, + "learning_rate": 4.280348557692308e-05, + "loss": 1.0855, + "step": 580 + }, + { + "epoch": 0.3389237275776579, + "grad_norm": 1.2596534490585327, + "learning_rate": 4.278846153846154e-05, + "loss": 1.0654, + "step": 581 + }, + { + "epoch": 0.33950707306402217, + "grad_norm": 1.1512525081634521, + "learning_rate": 4.27734375e-05, + "loss": 1.1286, + "step": 582 + }, + { + "epoch": 0.34009041855038646, + "grad_norm": 1.2515615224838257, + "learning_rate": 4.2758413461538464e-05, + "loss": 1.2436, + "step": 583 + }, + { + "epoch": 0.34067376403675076, + "grad_norm": 1.2093133926391602, + "learning_rate": 4.274338942307692e-05, + "loss": 1.0487, + "step": 584 + }, + { + "epoch": 0.34125710952311505, + "grad_norm": 1.1588521003723145, + "learning_rate": 4.272836538461539e-05, + "loss": 1.1469, + "step": 585 + }, + { + "epoch": 0.34184045500947935, + "grad_norm": 1.1132664680480957, + "learning_rate": 4.2713341346153846e-05, + "loss": 1.0547, + "step": 586 + }, + { + "epoch": 0.34242380049584364, + "grad_norm": 1.2172892093658447, + "learning_rate": 4.269831730769231e-05, + "loss": 1.2139, + "step": 587 + }, + { + "epoch": 0.34300714598220794, + "grad_norm": 1.605918526649475, + "learning_rate": 4.268329326923077e-05, + "loss": 0.9379, + "step": 588 + }, + { + "epoch": 0.34359049146857223, + "grad_norm": 1.560250163078308, + "learning_rate": 4.266826923076923e-05, + "loss": 0.992, + "step": 589 + }, + { + "epoch": 0.3441738369549366, + "grad_norm": 1.7285302877426147, + "learning_rate": 4.2653245192307694e-05, + "loss": 0.9653, + "step": 590 + }, + { + "epoch": 0.3447571824413009, + "grad_norm": 1.458788514137268, + "learning_rate": 4.263822115384616e-05, + "loss": 1.1627, + "step": 591 + }, + { + "epoch": 0.3453405279276652, + "grad_norm": 1.444248080253601, + "learning_rate": 4.262319711538462e-05, + "loss": 1.1464, + "step": 592 + }, + { + "epoch": 0.34592387341402947, + "grad_norm": 1.3933836221694946, + "learning_rate": 4.260817307692308e-05, + "loss": 1.0868, + "step": 593 + }, + { + "epoch": 0.34650721890039377, + "grad_norm": 1.6488311290740967, + "learning_rate": 4.259314903846154e-05, + "loss": 1.0009, + "step": 594 + }, + { + "epoch": 0.34709056438675806, + "grad_norm": 1.1838133335113525, + "learning_rate": 4.2578125e-05, + "loss": 0.884, + "step": 595 + }, + { + "epoch": 0.34767390987312236, + "grad_norm": 1.3585830926895142, + "learning_rate": 4.2563100961538465e-05, + "loss": 1.1594, + "step": 596 + }, + { + "epoch": 0.34825725535948665, + "grad_norm": 1.4692409038543701, + "learning_rate": 4.2548076923076924e-05, + "loss": 1.0424, + "step": 597 + }, + { + "epoch": 0.34884060084585095, + "grad_norm": 1.2625113725662231, + "learning_rate": 4.253305288461539e-05, + "loss": 1.171, + "step": 598 + }, + { + "epoch": 0.34942394633221524, + "grad_norm": 1.2111209630966187, + "learning_rate": 4.251802884615385e-05, + "loss": 0.9783, + "step": 599 + }, + { + "epoch": 0.35000729181857954, + "grad_norm": 1.224150538444519, + "learning_rate": 4.2503004807692306e-05, + "loss": 1.1189, + "step": 600 + }, + { + "epoch": 0.35000729181857954, + "eval_loss_squad": 1.0769534187577665, + "eval_perplexity": 7.957609155257721, + "eval_perplexity_reconstruct": 1.950415498039559, + "step": 600 + }, + { + "epoch": 0.35059063730494383, + "grad_norm": 2.5112569332122803, + "learning_rate": 4.248798076923077e-05, + "loss": 1.1328, + "step": 601 + }, + { + "epoch": 0.3511739827913081, + "grad_norm": 1.6178394556045532, + "learning_rate": 4.247295673076923e-05, + "loss": 1.1492, + "step": 602 + }, + { + "epoch": 0.3517573282776725, + "grad_norm": 0.9533804059028625, + "learning_rate": 4.2457932692307695e-05, + "loss": 1.1439, + "step": 603 + }, + { + "epoch": 0.35234067376403677, + "grad_norm": 1.2348041534423828, + "learning_rate": 4.244290865384616e-05, + "loss": 0.8496, + "step": 604 + }, + { + "epoch": 0.35292401925040107, + "grad_norm": 1.199015498161316, + "learning_rate": 4.242788461538462e-05, + "loss": 1.0841, + "step": 605 + }, + { + "epoch": 0.35350736473676536, + "grad_norm": 1.264140248298645, + "learning_rate": 4.241286057692308e-05, + "loss": 0.9882, + "step": 606 + }, + { + "epoch": 0.35409071022312966, + "grad_norm": 1.1264489889144897, + "learning_rate": 4.2397836538461536e-05, + "loss": 0.8837, + "step": 607 + }, + { + "epoch": 0.35467405570949395, + "grad_norm": 1.0938150882720947, + "learning_rate": 4.23828125e-05, + "loss": 0.9441, + "step": 608 + }, + { + "epoch": 0.35525740119585825, + "grad_norm": 1.190896987915039, + "learning_rate": 4.2367788461538466e-05, + "loss": 1.0634, + "step": 609 + }, + { + "epoch": 0.35584074668222254, + "grad_norm": 1.4452359676361084, + "learning_rate": 4.2352764423076925e-05, + "loss": 0.786, + "step": 610 + }, + { + "epoch": 0.35642409216858684, + "grad_norm": 1.373305082321167, + "learning_rate": 4.233774038461539e-05, + "loss": 0.9567, + "step": 611 + }, + { + "epoch": 0.35700743765495113, + "grad_norm": 1.3388804197311401, + "learning_rate": 4.232271634615385e-05, + "loss": 0.9216, + "step": 612 + }, + { + "epoch": 0.3575907831413154, + "grad_norm": 2.152236223220825, + "learning_rate": 4.230769230769231e-05, + "loss": 1.1881, + "step": 613 + }, + { + "epoch": 0.3581741286276797, + "grad_norm": 1.3865232467651367, + "learning_rate": 4.229266826923077e-05, + "loss": 1.0252, + "step": 614 + }, + { + "epoch": 0.358757474114044, + "grad_norm": 1.3287843465805054, + "learning_rate": 4.227764423076923e-05, + "loss": 1.1208, + "step": 615 + }, + { + "epoch": 0.35934081960040837, + "grad_norm": 1.2160487174987793, + "learning_rate": 4.2262620192307696e-05, + "loss": 0.7792, + "step": 616 + }, + { + "epoch": 0.35992416508677266, + "grad_norm": 1.3507564067840576, + "learning_rate": 4.224759615384616e-05, + "loss": 0.8473, + "step": 617 + }, + { + "epoch": 0.36050751057313696, + "grad_norm": 1.738625168800354, + "learning_rate": 4.223257211538461e-05, + "loss": 1.016, + "step": 618 + }, + { + "epoch": 0.36109085605950125, + "grad_norm": 1.3850781917572021, + "learning_rate": 4.221754807692308e-05, + "loss": 1.1247, + "step": 619 + }, + { + "epoch": 0.36167420154586555, + "grad_norm": 1.1078225374221802, + "learning_rate": 4.220252403846154e-05, + "loss": 1.0403, + "step": 620 + }, + { + "epoch": 0.36225754703222984, + "grad_norm": 1.5114189386367798, + "learning_rate": 4.21875e-05, + "loss": 1.136, + "step": 621 + }, + { + "epoch": 0.36284089251859414, + "grad_norm": 1.1798583269119263, + "learning_rate": 4.217247596153847e-05, + "loss": 1.0254, + "step": 622 + }, + { + "epoch": 0.36342423800495843, + "grad_norm": 1.6477906703948975, + "learning_rate": 4.2157451923076926e-05, + "loss": 1.0695, + "step": 623 + }, + { + "epoch": 0.36400758349132273, + "grad_norm": 1.1232982873916626, + "learning_rate": 4.2142427884615385e-05, + "loss": 0.7686, + "step": 624 + }, + { + "epoch": 0.364590928977687, + "grad_norm": 1.483515739440918, + "learning_rate": 4.212740384615384e-05, + "loss": 1.0878, + "step": 625 + }, + { + "epoch": 0.3651742744640513, + "grad_norm": 1.361569881439209, + "learning_rate": 4.211237980769231e-05, + "loss": 1.0778, + "step": 626 + }, + { + "epoch": 0.3657576199504156, + "grad_norm": 1.3304964303970337, + "learning_rate": 4.2097355769230774e-05, + "loss": 1.2297, + "step": 627 + }, + { + "epoch": 0.3663409654367799, + "grad_norm": 1.2258186340332031, + "learning_rate": 4.208233173076923e-05, + "loss": 1.0869, + "step": 628 + }, + { + "epoch": 0.36692431092314426, + "grad_norm": 2.0285935401916504, + "learning_rate": 4.20673076923077e-05, + "loss": 1.0043, + "step": 629 + }, + { + "epoch": 0.36750765640950855, + "grad_norm": 1.278552770614624, + "learning_rate": 4.2052283653846156e-05, + "loss": 0.9999, + "step": 630 + }, + { + "epoch": 0.36809100189587285, + "grad_norm": 1.9170531034469604, + "learning_rate": 4.2037259615384614e-05, + "loss": 0.8887, + "step": 631 + }, + { + "epoch": 0.36867434738223714, + "grad_norm": 1.070654034614563, + "learning_rate": 4.202223557692308e-05, + "loss": 0.8389, + "step": 632 + }, + { + "epoch": 0.36925769286860144, + "grad_norm": 1.3350582122802734, + "learning_rate": 4.200721153846154e-05, + "loss": 1.1057, + "step": 633 + }, + { + "epoch": 0.36984103835496573, + "grad_norm": 1.2793062925338745, + "learning_rate": 4.1992187500000003e-05, + "loss": 1.0407, + "step": 634 + }, + { + "epoch": 0.37042438384133003, + "grad_norm": 1.2050164937973022, + "learning_rate": 4.197716346153847e-05, + "loss": 1.0973, + "step": 635 + }, + { + "epoch": 0.3710077293276943, + "grad_norm": 1.1398264169692993, + "learning_rate": 4.196213942307692e-05, + "loss": 1.162, + "step": 636 + }, + { + "epoch": 0.3715910748140586, + "grad_norm": 1.2741928100585938, + "learning_rate": 4.1947115384615386e-05, + "loss": 0.9104, + "step": 637 + }, + { + "epoch": 0.3721744203004229, + "grad_norm": 1.6818182468414307, + "learning_rate": 4.1932091346153844e-05, + "loss": 0.9304, + "step": 638 + }, + { + "epoch": 0.3727577657867872, + "grad_norm": 1.0803583860397339, + "learning_rate": 4.191706730769231e-05, + "loss": 1.1518, + "step": 639 + }, + { + "epoch": 0.3733411112731515, + "grad_norm": 1.178480625152588, + "learning_rate": 4.1902043269230775e-05, + "loss": 0.9778, + "step": 640 + }, + { + "epoch": 0.3739244567595158, + "grad_norm": 1.3216358423233032, + "learning_rate": 4.188701923076923e-05, + "loss": 1.0803, + "step": 641 + }, + { + "epoch": 0.3745078022458801, + "grad_norm": 1.191552758216858, + "learning_rate": 4.187199519230769e-05, + "loss": 1.0794, + "step": 642 + }, + { + "epoch": 0.37509114773224445, + "grad_norm": 1.240075945854187, + "learning_rate": 4.185697115384616e-05, + "loss": 1.2029, + "step": 643 + }, + { + "epoch": 0.37567449321860874, + "grad_norm": 1.2938767671585083, + "learning_rate": 4.1841947115384616e-05, + "loss": 1.1531, + "step": 644 + }, + { + "epoch": 0.37625783870497304, + "grad_norm": 1.4226038455963135, + "learning_rate": 4.182692307692308e-05, + "loss": 1.0929, + "step": 645 + }, + { + "epoch": 0.37684118419133733, + "grad_norm": 1.0986605882644653, + "learning_rate": 4.181189903846154e-05, + "loss": 0.8105, + "step": 646 + }, + { + "epoch": 0.3774245296777016, + "grad_norm": 1.0812294483184814, + "learning_rate": 4.1796875000000005e-05, + "loss": 1.0476, + "step": 647 + }, + { + "epoch": 0.3780078751640659, + "grad_norm": 1.7585773468017578, + "learning_rate": 4.178185096153846e-05, + "loss": 1.2113, + "step": 648 + }, + { + "epoch": 0.3785912206504302, + "grad_norm": 1.1438621282577515, + "learning_rate": 4.176682692307692e-05, + "loss": 0.8594, + "step": 649 + }, + { + "epoch": 0.3791745661367945, + "grad_norm": 1.1411939859390259, + "learning_rate": 4.175180288461539e-05, + "loss": 0.8793, + "step": 650 + }, + { + "epoch": 0.3797579116231588, + "grad_norm": 1.1287474632263184, + "learning_rate": 4.1736778846153845e-05, + "loss": 0.8364, + "step": 651 + }, + { + "epoch": 0.3803412571095231, + "grad_norm": 1.285798192024231, + "learning_rate": 4.172175480769231e-05, + "loss": 1.1573, + "step": 652 + }, + { + "epoch": 0.3809246025958874, + "grad_norm": 1.1466621160507202, + "learning_rate": 4.1706730769230776e-05, + "loss": 0.9106, + "step": 653 + }, + { + "epoch": 0.3815079480822517, + "grad_norm": 1.2659651041030884, + "learning_rate": 4.1691706730769234e-05, + "loss": 0.9637, + "step": 654 + }, + { + "epoch": 0.382091293568616, + "grad_norm": 1.6592333316802979, + "learning_rate": 4.167668269230769e-05, + "loss": 0.9402, + "step": 655 + }, + { + "epoch": 0.38267463905498034, + "grad_norm": 0.9642985463142395, + "learning_rate": 4.166165865384616e-05, + "loss": 0.9339, + "step": 656 + }, + { + "epoch": 0.38325798454134463, + "grad_norm": 1.3237916231155396, + "learning_rate": 4.164663461538462e-05, + "loss": 1.1165, + "step": 657 + }, + { + "epoch": 0.3838413300277089, + "grad_norm": 1.2888929843902588, + "learning_rate": 4.163161057692308e-05, + "loss": 1.1668, + "step": 658 + }, + { + "epoch": 0.3844246755140732, + "grad_norm": 1.3504743576049805, + "learning_rate": 4.161658653846154e-05, + "loss": 1.2046, + "step": 659 + }, + { + "epoch": 0.3850080210004375, + "grad_norm": 1.2326107025146484, + "learning_rate": 4.16015625e-05, + "loss": 0.8843, + "step": 660 + }, + { + "epoch": 0.3855913664868018, + "grad_norm": 1.083722472190857, + "learning_rate": 4.1586538461538464e-05, + "loss": 0.8806, + "step": 661 + }, + { + "epoch": 0.3861747119731661, + "grad_norm": 1.2427271604537964, + "learning_rate": 4.157151442307692e-05, + "loss": 1.1388, + "step": 662 + }, + { + "epoch": 0.3867580574595304, + "grad_norm": 1.2181854248046875, + "learning_rate": 4.155649038461539e-05, + "loss": 0.979, + "step": 663 + }, + { + "epoch": 0.3873414029458947, + "grad_norm": 1.6642446517944336, + "learning_rate": 4.1541466346153847e-05, + "loss": 1.0273, + "step": 664 + }, + { + "epoch": 0.387924748432259, + "grad_norm": 3.092848539352417, + "learning_rate": 4.152644230769231e-05, + "loss": 0.9759, + "step": 665 + }, + { + "epoch": 0.3885080939186233, + "grad_norm": 1.4167457818984985, + "learning_rate": 4.151141826923077e-05, + "loss": 1.1041, + "step": 666 + }, + { + "epoch": 0.3890914394049876, + "grad_norm": 1.4154654741287231, + "learning_rate": 4.149639423076923e-05, + "loss": 1.0213, + "step": 667 + }, + { + "epoch": 0.3896747848913519, + "grad_norm": 1.112176775932312, + "learning_rate": 4.1481370192307694e-05, + "loss": 0.9382, + "step": 668 + }, + { + "epoch": 0.39025813037771623, + "grad_norm": 1.0001271963119507, + "learning_rate": 4.146634615384616e-05, + "loss": 1.0377, + "step": 669 + }, + { + "epoch": 0.3908414758640805, + "grad_norm": 1.0494935512542725, + "learning_rate": 4.145132211538462e-05, + "loss": 1.0897, + "step": 670 + }, + { + "epoch": 0.3914248213504448, + "grad_norm": 1.2128801345825195, + "learning_rate": 4.143629807692308e-05, + "loss": 0.9406, + "step": 671 + }, + { + "epoch": 0.3920081668368091, + "grad_norm": 1.1072816848754883, + "learning_rate": 4.142127403846154e-05, + "loss": 0.9358, + "step": 672 + }, + { + "epoch": 0.3925915123231734, + "grad_norm": 1.251513123512268, + "learning_rate": 4.140625e-05, + "loss": 1.0557, + "step": 673 + }, + { + "epoch": 0.3931748578095377, + "grad_norm": 1.124248743057251, + "learning_rate": 4.1391225961538465e-05, + "loss": 1.1116, + "step": 674 + }, + { + "epoch": 0.393758203295902, + "grad_norm": 1.1827398538589478, + "learning_rate": 4.1376201923076924e-05, + "loss": 1.1149, + "step": 675 + }, + { + "epoch": 0.3943415487822663, + "grad_norm": 1.1932939291000366, + "learning_rate": 4.136117788461539e-05, + "loss": 0.865, + "step": 676 + }, + { + "epoch": 0.3949248942686306, + "grad_norm": 1.1423388719558716, + "learning_rate": 4.134615384615385e-05, + "loss": 1.0161, + "step": 677 + }, + { + "epoch": 0.3955082397549949, + "grad_norm": 1.1362109184265137, + "learning_rate": 4.1331129807692306e-05, + "loss": 1.1078, + "step": 678 + }, + { + "epoch": 0.3960915852413592, + "grad_norm": 1.173803448677063, + "learning_rate": 4.131610576923077e-05, + "loss": 1.0828, + "step": 679 + }, + { + "epoch": 0.3966749307277235, + "grad_norm": 1.1851235628128052, + "learning_rate": 4.130108173076923e-05, + "loss": 1.0101, + "step": 680 + }, + { + "epoch": 0.39725827621408777, + "grad_norm": 1.1966065168380737, + "learning_rate": 4.1286057692307695e-05, + "loss": 1.0556, + "step": 681 + }, + { + "epoch": 0.3978416217004521, + "grad_norm": 1.042441725730896, + "learning_rate": 4.127103365384616e-05, + "loss": 0.9449, + "step": 682 + }, + { + "epoch": 0.3984249671868164, + "grad_norm": 1.3966326713562012, + "learning_rate": 4.125600961538462e-05, + "loss": 1.2354, + "step": 683 + }, + { + "epoch": 0.3990083126731807, + "grad_norm": 1.7100861072540283, + "learning_rate": 4.124098557692308e-05, + "loss": 1.0369, + "step": 684 + }, + { + "epoch": 0.399591658159545, + "grad_norm": 1.1547572612762451, + "learning_rate": 4.1225961538461536e-05, + "loss": 1.2298, + "step": 685 + }, + { + "epoch": 0.4001750036459093, + "grad_norm": 1.2655994892120361, + "learning_rate": 4.12109375e-05, + "loss": 1.0823, + "step": 686 + }, + { + "epoch": 0.4007583491322736, + "grad_norm": 1.1846497058868408, + "learning_rate": 4.119591346153847e-05, + "loss": 1.127, + "step": 687 + }, + { + "epoch": 0.4013416946186379, + "grad_norm": 1.3547903299331665, + "learning_rate": 4.1180889423076925e-05, + "loss": 0.9981, + "step": 688 + }, + { + "epoch": 0.4019250401050022, + "grad_norm": 1.16106379032135, + "learning_rate": 4.116586538461539e-05, + "loss": 1.0675, + "step": 689 + }, + { + "epoch": 0.4025083855913665, + "grad_norm": 1.3457812070846558, + "learning_rate": 4.115084134615385e-05, + "loss": 1.0359, + "step": 690 + }, + { + "epoch": 0.4030917310777308, + "grad_norm": 1.440121054649353, + "learning_rate": 4.113581730769231e-05, + "loss": 1.0681, + "step": 691 + }, + { + "epoch": 0.40367507656409507, + "grad_norm": 1.382415533065796, + "learning_rate": 4.112079326923077e-05, + "loss": 0.9819, + "step": 692 + }, + { + "epoch": 0.40425842205045937, + "grad_norm": 1.391350507736206, + "learning_rate": 4.110576923076923e-05, + "loss": 1.0845, + "step": 693 + }, + { + "epoch": 0.40484176753682366, + "grad_norm": 1.2903047800064087, + "learning_rate": 4.1090745192307696e-05, + "loss": 1.0631, + "step": 694 + }, + { + "epoch": 0.405425113023188, + "grad_norm": 1.1688337326049805, + "learning_rate": 4.107572115384616e-05, + "loss": 0.9691, + "step": 695 + }, + { + "epoch": 0.4060084585095523, + "grad_norm": 1.327972173690796, + "learning_rate": 4.1060697115384613e-05, + "loss": 1.0152, + "step": 696 + }, + { + "epoch": 0.4065918039959166, + "grad_norm": 1.1001508235931396, + "learning_rate": 4.104567307692308e-05, + "loss": 0.9932, + "step": 697 + }, + { + "epoch": 0.4071751494822809, + "grad_norm": 1.1654032468795776, + "learning_rate": 4.103064903846154e-05, + "loss": 1.0077, + "step": 698 + }, + { + "epoch": 0.4077584949686452, + "grad_norm": 1.3325926065444946, + "learning_rate": 4.1015625e-05, + "loss": 0.9754, + "step": 699 + }, + { + "epoch": 0.4083418404550095, + "grad_norm": 1.0101975202560425, + "learning_rate": 4.100060096153847e-05, + "loss": 0.835, + "step": 700 + }, + { + "epoch": 0.4089251859413738, + "grad_norm": 1.0377428531646729, + "learning_rate": 4.0985576923076926e-05, + "loss": 0.9193, + "step": 701 + }, + { + "epoch": 0.4095085314277381, + "grad_norm": 1.4344565868377686, + "learning_rate": 4.0970552884615385e-05, + "loss": 0.981, + "step": 702 + }, + { + "epoch": 0.41009187691410237, + "grad_norm": 1.0151726007461548, + "learning_rate": 4.095552884615384e-05, + "loss": 0.8954, + "step": 703 + }, + { + "epoch": 0.41067522240046667, + "grad_norm": 1.2651805877685547, + "learning_rate": 4.094050480769231e-05, + "loss": 0.8135, + "step": 704 + }, + { + "epoch": 0.41125856788683096, + "grad_norm": 1.1307834386825562, + "learning_rate": 4.0925480769230774e-05, + "loss": 1.0629, + "step": 705 + }, + { + "epoch": 0.41184191337319526, + "grad_norm": 1.5362266302108765, + "learning_rate": 4.091045673076923e-05, + "loss": 1.1536, + "step": 706 + }, + { + "epoch": 0.41242525885955955, + "grad_norm": 1.1772881746292114, + "learning_rate": 4.08954326923077e-05, + "loss": 1.3347, + "step": 707 + }, + { + "epoch": 0.41300860434592385, + "grad_norm": 1.2242248058319092, + "learning_rate": 4.0880408653846156e-05, + "loss": 0.9295, + "step": 708 + }, + { + "epoch": 0.4135919498322882, + "grad_norm": 1.2769187688827515, + "learning_rate": 4.0865384615384615e-05, + "loss": 1.0842, + "step": 709 + }, + { + "epoch": 0.4141752953186525, + "grad_norm": 0.9419771432876587, + "learning_rate": 4.085036057692308e-05, + "loss": 0.9203, + "step": 710 + }, + { + "epoch": 0.4147586408050168, + "grad_norm": 1.5197808742523193, + "learning_rate": 4.083533653846154e-05, + "loss": 0.8543, + "step": 711 + }, + { + "epoch": 0.4153419862913811, + "grad_norm": 1.1694600582122803, + "learning_rate": 4.0820312500000004e-05, + "loss": 1.262, + "step": 712 + }, + { + "epoch": 0.4159253317777454, + "grad_norm": 1.1101324558258057, + "learning_rate": 4.080528846153847e-05, + "loss": 0.8561, + "step": 713 + }, + { + "epoch": 0.4165086772641097, + "grad_norm": 1.1828705072402954, + "learning_rate": 4.079026442307692e-05, + "loss": 0.8932, + "step": 714 + }, + { + "epoch": 0.41709202275047397, + "grad_norm": 1.1211762428283691, + "learning_rate": 4.0775240384615386e-05, + "loss": 0.9816, + "step": 715 + }, + { + "epoch": 0.41767536823683826, + "grad_norm": 1.2857762575149536, + "learning_rate": 4.0760216346153845e-05, + "loss": 1.0257, + "step": 716 + }, + { + "epoch": 0.41825871372320256, + "grad_norm": 1.0974875688552856, + "learning_rate": 4.074519230769231e-05, + "loss": 0.6593, + "step": 717 + }, + { + "epoch": 0.41884205920956685, + "grad_norm": 1.2576038837432861, + "learning_rate": 4.0730168269230775e-05, + "loss": 0.9487, + "step": 718 + }, + { + "epoch": 0.41942540469593115, + "grad_norm": 1.1279481649398804, + "learning_rate": 4.0715144230769234e-05, + "loss": 1.0483, + "step": 719 + }, + { + "epoch": 0.42000875018229544, + "grad_norm": 1.191821575164795, + "learning_rate": 4.070012019230769e-05, + "loss": 1.0075, + "step": 720 + }, + { + "epoch": 0.42059209566865974, + "grad_norm": 0.9579320549964905, + "learning_rate": 4.068509615384616e-05, + "loss": 1.1948, + "step": 721 + }, + { + "epoch": 0.4211754411550241, + "grad_norm": 1.1844723224639893, + "learning_rate": 4.0670072115384616e-05, + "loss": 1.0395, + "step": 722 + }, + { + "epoch": 0.4217587866413884, + "grad_norm": 1.192607045173645, + "learning_rate": 4.065504807692308e-05, + "loss": 0.8447, + "step": 723 + }, + { + "epoch": 0.4223421321277527, + "grad_norm": 1.256598949432373, + "learning_rate": 4.064002403846154e-05, + "loss": 0.8741, + "step": 724 + }, + { + "epoch": 0.422925477614117, + "grad_norm": 1.1558480262756348, + "learning_rate": 4.0625000000000005e-05, + "loss": 1.1007, + "step": 725 + }, + { + "epoch": 0.42350882310048127, + "grad_norm": 1.2486516237258911, + "learning_rate": 4.0609975961538463e-05, + "loss": 1.0104, + "step": 726 + }, + { + "epoch": 0.42409216858684556, + "grad_norm": 1.1530859470367432, + "learning_rate": 4.059495192307692e-05, + "loss": 0.9664, + "step": 727 + }, + { + "epoch": 0.42467551407320986, + "grad_norm": 1.1082444190979004, + "learning_rate": 4.057992788461539e-05, + "loss": 1.2137, + "step": 728 + }, + { + "epoch": 0.42525885955957415, + "grad_norm": 1.1947239637374878, + "learning_rate": 4.0564903846153846e-05, + "loss": 1.1242, + "step": 729 + }, + { + "epoch": 0.42584220504593845, + "grad_norm": 1.185736060142517, + "learning_rate": 4.054987980769231e-05, + "loss": 1.0753, + "step": 730 + }, + { + "epoch": 0.42642555053230274, + "grad_norm": 1.292222499847412, + "learning_rate": 4.0534855769230776e-05, + "loss": 1.2377, + "step": 731 + }, + { + "epoch": 0.42700889601866704, + "grad_norm": 1.3122601509094238, + "learning_rate": 4.051983173076923e-05, + "loss": 1.0001, + "step": 732 + }, + { + "epoch": 0.42759224150503133, + "grad_norm": 1.284077525138855, + "learning_rate": 4.050480769230769e-05, + "loss": 0.8811, + "step": 733 + }, + { + "epoch": 0.42817558699139563, + "grad_norm": 1.266852855682373, + "learning_rate": 4.048978365384616e-05, + "loss": 0.8545, + "step": 734 + }, + { + "epoch": 0.42875893247776, + "grad_norm": 1.1647552251815796, + "learning_rate": 4.047475961538462e-05, + "loss": 0.9484, + "step": 735 + }, + { + "epoch": 0.4293422779641243, + "grad_norm": 1.3035751581192017, + "learning_rate": 4.045973557692308e-05, + "loss": 1.0405, + "step": 736 + }, + { + "epoch": 0.42992562345048857, + "grad_norm": 1.1369545459747314, + "learning_rate": 4.044471153846154e-05, + "loss": 0.8982, + "step": 737 + }, + { + "epoch": 0.43050896893685286, + "grad_norm": 1.1648366451263428, + "learning_rate": 4.04296875e-05, + "loss": 1.0459, + "step": 738 + }, + { + "epoch": 0.43109231442321716, + "grad_norm": 1.3287396430969238, + "learning_rate": 4.0414663461538465e-05, + "loss": 1.0519, + "step": 739 + }, + { + "epoch": 0.43167565990958146, + "grad_norm": 1.1275254487991333, + "learning_rate": 4.039963942307692e-05, + "loss": 0.8556, + "step": 740 + }, + { + "epoch": 0.43225900539594575, + "grad_norm": 1.3335869312286377, + "learning_rate": 4.038461538461539e-05, + "loss": 0.9997, + "step": 741 + }, + { + "epoch": 0.43284235088231005, + "grad_norm": 1.073275089263916, + "learning_rate": 4.036959134615385e-05, + "loss": 1.1496, + "step": 742 + }, + { + "epoch": 0.43342569636867434, + "grad_norm": 1.194717288017273, + "learning_rate": 4.035456730769231e-05, + "loss": 0.958, + "step": 743 + }, + { + "epoch": 0.43400904185503864, + "grad_norm": 1.55983567237854, + "learning_rate": 4.033954326923077e-05, + "loss": 1.2247, + "step": 744 + }, + { + "epoch": 0.43459238734140293, + "grad_norm": 2.449402093887329, + "learning_rate": 4.032451923076923e-05, + "loss": 1.2115, + "step": 745 + }, + { + "epoch": 0.4351757328277672, + "grad_norm": 1.2797013521194458, + "learning_rate": 4.0309495192307694e-05, + "loss": 0.8518, + "step": 746 + }, + { + "epoch": 0.4357590783141315, + "grad_norm": 1.1281001567840576, + "learning_rate": 4.029447115384616e-05, + "loss": 1.0215, + "step": 747 + }, + { + "epoch": 0.43634242380049587, + "grad_norm": 1.1771413087844849, + "learning_rate": 4.027944711538462e-05, + "loss": 1.1645, + "step": 748 + }, + { + "epoch": 0.43692576928686017, + "grad_norm": 1.181060552597046, + "learning_rate": 4.0264423076923083e-05, + "loss": 0.8806, + "step": 749 + }, + { + "epoch": 0.43750911477322446, + "grad_norm": 1.2284411191940308, + "learning_rate": 4.0249399038461535e-05, + "loss": 1.1494, + "step": 750 + }, + { + "epoch": 0.43809246025958876, + "grad_norm": 1.0676560401916504, + "learning_rate": 4.0234375e-05, + "loss": 0.9234, + "step": 751 + }, + { + "epoch": 0.43867580574595305, + "grad_norm": 1.481394648551941, + "learning_rate": 4.0219350961538466e-05, + "loss": 0.904, + "step": 752 + }, + { + "epoch": 0.43925915123231735, + "grad_norm": 1.0527502298355103, + "learning_rate": 4.0204326923076924e-05, + "loss": 1.0413, + "step": 753 + }, + { + "epoch": 0.43984249671868164, + "grad_norm": 1.2585145235061646, + "learning_rate": 4.018930288461539e-05, + "loss": 0.9207, + "step": 754 + }, + { + "epoch": 0.44042584220504594, + "grad_norm": 1.5194056034088135, + "learning_rate": 4.017427884615385e-05, + "loss": 1.0445, + "step": 755 + }, + { + "epoch": 0.44100918769141023, + "grad_norm": 1.1574163436889648, + "learning_rate": 4.0159254807692307e-05, + "loss": 0.8065, + "step": 756 + }, + { + "epoch": 0.4415925331777745, + "grad_norm": 1.351405143737793, + "learning_rate": 4.014423076923077e-05, + "loss": 0.914, + "step": 757 + }, + { + "epoch": 0.4421758786641388, + "grad_norm": 1.1521351337432861, + "learning_rate": 4.012920673076923e-05, + "loss": 0.9017, + "step": 758 + }, + { + "epoch": 0.4427592241505031, + "grad_norm": 1.9487134218215942, + "learning_rate": 4.0114182692307696e-05, + "loss": 1.03, + "step": 759 + }, + { + "epoch": 0.4433425696368674, + "grad_norm": 2.560194969177246, + "learning_rate": 4.009915865384616e-05, + "loss": 1.0656, + "step": 760 + }, + { + "epoch": 0.4439259151232317, + "grad_norm": 1.140284538269043, + "learning_rate": 4.008413461538462e-05, + "loss": 1.0233, + "step": 761 + }, + { + "epoch": 0.44450926060959606, + "grad_norm": 1.3260776996612549, + "learning_rate": 4.006911057692308e-05, + "loss": 1.0017, + "step": 762 + }, + { + "epoch": 0.44509260609596035, + "grad_norm": 1.1119332313537598, + "learning_rate": 4.0054086538461536e-05, + "loss": 1.2699, + "step": 763 + }, + { + "epoch": 0.44567595158232465, + "grad_norm": 1.3355944156646729, + "learning_rate": 4.00390625e-05, + "loss": 0.9841, + "step": 764 + }, + { + "epoch": 0.44625929706868894, + "grad_norm": 1.3637195825576782, + "learning_rate": 4.002403846153847e-05, + "loss": 1.0625, + "step": 765 + }, + { + "epoch": 0.44684264255505324, + "grad_norm": 1.1988649368286133, + "learning_rate": 4.0009014423076925e-05, + "loss": 1.0488, + "step": 766 + }, + { + "epoch": 0.44742598804141753, + "grad_norm": 2.336710214614868, + "learning_rate": 3.999399038461539e-05, + "loss": 1.1126, + "step": 767 + }, + { + "epoch": 0.44800933352778183, + "grad_norm": 1.3706448078155518, + "learning_rate": 3.997896634615384e-05, + "loss": 1.0118, + "step": 768 + }, + { + "epoch": 0.4485926790141461, + "grad_norm": 1.171186089515686, + "learning_rate": 3.996394230769231e-05, + "loss": 1.024, + "step": 769 + }, + { + "epoch": 0.4491760245005104, + "grad_norm": 1.424375057220459, + "learning_rate": 3.994891826923077e-05, + "loss": 1.0885, + "step": 770 + }, + { + "epoch": 0.4497593699868747, + "grad_norm": 1.218055009841919, + "learning_rate": 3.993389423076923e-05, + "loss": 0.9143, + "step": 771 + }, + { + "epoch": 0.450342715473239, + "grad_norm": 1.1051896810531616, + "learning_rate": 3.99188701923077e-05, + "loss": 0.8874, + "step": 772 + }, + { + "epoch": 0.4509260609596033, + "grad_norm": 1.2627577781677246, + "learning_rate": 3.9903846153846155e-05, + "loss": 0.9562, + "step": 773 + }, + { + "epoch": 0.4515094064459676, + "grad_norm": 2.4132933616638184, + "learning_rate": 3.9888822115384614e-05, + "loss": 1.0477, + "step": 774 + }, + { + "epoch": 0.45209275193233195, + "grad_norm": 1.4425148963928223, + "learning_rate": 3.987379807692308e-05, + "loss": 0.8765, + "step": 775 + }, + { + "epoch": 0.45267609741869624, + "grad_norm": 1.0001459121704102, + "learning_rate": 3.985877403846154e-05, + "loss": 0.9992, + "step": 776 + }, + { + "epoch": 0.45325944290506054, + "grad_norm": 1.2499384880065918, + "learning_rate": 3.984375e-05, + "loss": 1.0017, + "step": 777 + }, + { + "epoch": 0.45384278839142483, + "grad_norm": 1.0234681367874146, + "learning_rate": 3.982872596153847e-05, + "loss": 0.9316, + "step": 778 + }, + { + "epoch": 0.45442613387778913, + "grad_norm": 1.1663625240325928, + "learning_rate": 3.981370192307693e-05, + "loss": 1.0178, + "step": 779 + }, + { + "epoch": 0.4550094793641534, + "grad_norm": 1.0986937284469604, + "learning_rate": 3.9798677884615385e-05, + "loss": 0.9092, + "step": 780 + }, + { + "epoch": 0.4555928248505177, + "grad_norm": 1.2490792274475098, + "learning_rate": 3.9783653846153844e-05, + "loss": 1.046, + "step": 781 + }, + { + "epoch": 0.456176170336882, + "grad_norm": 1.217383861541748, + "learning_rate": 3.976862980769231e-05, + "loss": 0.9513, + "step": 782 + }, + { + "epoch": 0.4567595158232463, + "grad_norm": 1.4201329946517944, + "learning_rate": 3.9753605769230774e-05, + "loss": 1.0895, + "step": 783 + }, + { + "epoch": 0.4573428613096106, + "grad_norm": 1.3387080430984497, + "learning_rate": 3.973858173076923e-05, + "loss": 0.9925, + "step": 784 + }, + { + "epoch": 0.4579262067959749, + "grad_norm": 1.2447290420532227, + "learning_rate": 3.97235576923077e-05, + "loss": 1.1017, + "step": 785 + }, + { + "epoch": 0.4585095522823392, + "grad_norm": 1.403903841972351, + "learning_rate": 3.9708533653846156e-05, + "loss": 0.8692, + "step": 786 + }, + { + "epoch": 0.4590928977687035, + "grad_norm": 1.3049734830856323, + "learning_rate": 3.9693509615384615e-05, + "loss": 1.1045, + "step": 787 + }, + { + "epoch": 0.45967624325506784, + "grad_norm": 2.3408448696136475, + "learning_rate": 3.967848557692308e-05, + "loss": 1.0606, + "step": 788 + }, + { + "epoch": 0.46025958874143214, + "grad_norm": 1.189640760421753, + "learning_rate": 3.966346153846154e-05, + "loss": 1.0287, + "step": 789 + }, + { + "epoch": 0.46084293422779643, + "grad_norm": 1.1157665252685547, + "learning_rate": 3.9648437500000004e-05, + "loss": 0.9822, + "step": 790 + }, + { + "epoch": 0.4614262797141607, + "grad_norm": 1.242864966392517, + "learning_rate": 3.963341346153846e-05, + "loss": 0.827, + "step": 791 + }, + { + "epoch": 0.462009625200525, + "grad_norm": 1.1176204681396484, + "learning_rate": 3.961838942307692e-05, + "loss": 1.2449, + "step": 792 + }, + { + "epoch": 0.4625929706868893, + "grad_norm": 1.0381675958633423, + "learning_rate": 3.9603365384615386e-05, + "loss": 1.1754, + "step": 793 + }, + { + "epoch": 0.4631763161732536, + "grad_norm": 1.2429289817810059, + "learning_rate": 3.9588341346153845e-05, + "loss": 1.0103, + "step": 794 + }, + { + "epoch": 0.4637596616596179, + "grad_norm": 1.214370608329773, + "learning_rate": 3.957331730769231e-05, + "loss": 0.9133, + "step": 795 + }, + { + "epoch": 0.4643430071459822, + "grad_norm": 1.249859094619751, + "learning_rate": 3.9558293269230775e-05, + "loss": 1.2252, + "step": 796 + }, + { + "epoch": 0.4649263526323465, + "grad_norm": 1.1522125005722046, + "learning_rate": 3.9543269230769234e-05, + "loss": 0.8547, + "step": 797 + }, + { + "epoch": 0.4655096981187108, + "grad_norm": 1.0914942026138306, + "learning_rate": 3.952824519230769e-05, + "loss": 1.1068, + "step": 798 + }, + { + "epoch": 0.4660930436050751, + "grad_norm": 1.2309699058532715, + "learning_rate": 3.951322115384616e-05, + "loss": 1.1301, + "step": 799 + }, + { + "epoch": 0.4666763890914394, + "grad_norm": 1.3135758638381958, + "learning_rate": 3.9498197115384616e-05, + "loss": 0.947, + "step": 800 + }, + { + "epoch": 0.4666763890914394, + "eval_loss_squad": 0.9123072922043503, + "eval_perplexity": 7.919284365740796, + "eval_perplexity_reconstruct": 1.9212373943429781, + "step": 800 + }, + { + "epoch": 0.46725973457780373, + "grad_norm": 1.0412598848342896, + "learning_rate": 3.948317307692308e-05, + "loss": 1.1682, + "step": 801 + }, + { + "epoch": 0.467843080064168, + "grad_norm": 0.986031711101532, + "learning_rate": 3.946814903846154e-05, + "loss": 0.7983, + "step": 802 + }, + { + "epoch": 0.4684264255505323, + "grad_norm": 1.149339199066162, + "learning_rate": 3.9453125000000005e-05, + "loss": 0.9834, + "step": 803 + }, + { + "epoch": 0.4690097710368966, + "grad_norm": 1.2143068313598633, + "learning_rate": 3.9438100961538464e-05, + "loss": 1.1033, + "step": 804 + }, + { + "epoch": 0.4695931165232609, + "grad_norm": 1.015660285949707, + "learning_rate": 3.942307692307692e-05, + "loss": 0.8502, + "step": 805 + }, + { + "epoch": 0.4701764620096252, + "grad_norm": 1.3838074207305908, + "learning_rate": 3.940805288461539e-05, + "loss": 0.9986, + "step": 806 + }, + { + "epoch": 0.4707598074959895, + "grad_norm": 1.1482564210891724, + "learning_rate": 3.9393028846153846e-05, + "loss": 1.0143, + "step": 807 + }, + { + "epoch": 0.4713431529823538, + "grad_norm": 1.0859732627868652, + "learning_rate": 3.937800480769231e-05, + "loss": 0.9108, + "step": 808 + }, + { + "epoch": 0.4719264984687181, + "grad_norm": 1.4561444520950317, + "learning_rate": 3.936298076923077e-05, + "loss": 0.7463, + "step": 809 + }, + { + "epoch": 0.4725098439550824, + "grad_norm": 1.909414529800415, + "learning_rate": 3.934795673076923e-05, + "loss": 1.0691, + "step": 810 + }, + { + "epoch": 0.4730931894414467, + "grad_norm": 2.1978914737701416, + "learning_rate": 3.9332932692307694e-05, + "loss": 1.263, + "step": 811 + }, + { + "epoch": 0.473676534927811, + "grad_norm": 1.4734632968902588, + "learning_rate": 3.931790865384616e-05, + "loss": 0.8494, + "step": 812 + }, + { + "epoch": 0.47425988041417527, + "grad_norm": 1.2567843198776245, + "learning_rate": 3.930288461538462e-05, + "loss": 1.0615, + "step": 813 + }, + { + "epoch": 0.47484322590053957, + "grad_norm": 1.2729908227920532, + "learning_rate": 3.928786057692308e-05, + "loss": 0.8998, + "step": 814 + }, + { + "epoch": 0.4754265713869039, + "grad_norm": 1.0162193775177002, + "learning_rate": 3.927283653846154e-05, + "loss": 1.0442, + "step": 815 + }, + { + "epoch": 0.4760099168732682, + "grad_norm": 1.0893824100494385, + "learning_rate": 3.92578125e-05, + "loss": 0.9158, + "step": 816 + }, + { + "epoch": 0.4765932623596325, + "grad_norm": 0.9300756454467773, + "learning_rate": 3.9242788461538465e-05, + "loss": 0.9869, + "step": 817 + }, + { + "epoch": 0.4771766078459968, + "grad_norm": 1.3809287548065186, + "learning_rate": 3.9227764423076923e-05, + "loss": 0.9937, + "step": 818 + }, + { + "epoch": 0.4777599533323611, + "grad_norm": 1.1738115549087524, + "learning_rate": 3.921274038461539e-05, + "loss": 1.0634, + "step": 819 + }, + { + "epoch": 0.4783432988187254, + "grad_norm": 1.2441519498825073, + "learning_rate": 3.919771634615385e-05, + "loss": 0.7852, + "step": 820 + }, + { + "epoch": 0.4789266443050897, + "grad_norm": 0.8443605303764343, + "learning_rate": 3.918269230769231e-05, + "loss": 1.0473, + "step": 821 + }, + { + "epoch": 0.479509989791454, + "grad_norm": 1.2155978679656982, + "learning_rate": 3.916766826923077e-05, + "loss": 0.9415, + "step": 822 + }, + { + "epoch": 0.4800933352778183, + "grad_norm": 1.1366358995437622, + "learning_rate": 3.915264423076923e-05, + "loss": 0.9134, + "step": 823 + }, + { + "epoch": 0.4806766807641826, + "grad_norm": 1.1710559129714966, + "learning_rate": 3.9137620192307695e-05, + "loss": 1.0786, + "step": 824 + }, + { + "epoch": 0.48126002625054687, + "grad_norm": 1.0255122184753418, + "learning_rate": 3.912259615384616e-05, + "loss": 1.0628, + "step": 825 + }, + { + "epoch": 0.48184337173691116, + "grad_norm": 0.9826768636703491, + "learning_rate": 3.910757211538462e-05, + "loss": 0.8179, + "step": 826 + }, + { + "epoch": 0.48242671722327546, + "grad_norm": 1.3819292783737183, + "learning_rate": 3.909254807692308e-05, + "loss": 1.0797, + "step": 827 + }, + { + "epoch": 0.4830100627096398, + "grad_norm": 1.038661003112793, + "learning_rate": 3.9077524038461536e-05, + "loss": 1.2463, + "step": 828 + }, + { + "epoch": 0.4835934081960041, + "grad_norm": 1.517325758934021, + "learning_rate": 3.90625e-05, + "loss": 1.0843, + "step": 829 + }, + { + "epoch": 0.4841767536823684, + "grad_norm": 1.1275871992111206, + "learning_rate": 3.9047475961538466e-05, + "loss": 1.0089, + "step": 830 + }, + { + "epoch": 0.4847600991687327, + "grad_norm": 2.5894081592559814, + "learning_rate": 3.9032451923076925e-05, + "loss": 1.0206, + "step": 831 + }, + { + "epoch": 0.485343444655097, + "grad_norm": 1.2647501230239868, + "learning_rate": 3.901742788461539e-05, + "loss": 1.0476, + "step": 832 + }, + { + "epoch": 0.4859267901414613, + "grad_norm": 1.1321070194244385, + "learning_rate": 3.900240384615385e-05, + "loss": 0.9772, + "step": 833 + }, + { + "epoch": 0.4865101356278256, + "grad_norm": 1.4880715608596802, + "learning_rate": 3.898737980769231e-05, + "loss": 0.7914, + "step": 834 + }, + { + "epoch": 0.4870934811141899, + "grad_norm": 1.07522714138031, + "learning_rate": 3.897235576923077e-05, + "loss": 0.9263, + "step": 835 + }, + { + "epoch": 0.48767682660055417, + "grad_norm": 1.0414209365844727, + "learning_rate": 3.895733173076923e-05, + "loss": 1.2107, + "step": 836 + }, + { + "epoch": 0.48826017208691846, + "grad_norm": 1.1117522716522217, + "learning_rate": 3.8942307692307696e-05, + "loss": 0.833, + "step": 837 + }, + { + "epoch": 0.48884351757328276, + "grad_norm": 1.225772500038147, + "learning_rate": 3.892728365384616e-05, + "loss": 1.0641, + "step": 838 + }, + { + "epoch": 0.48942686305964705, + "grad_norm": 1.2368497848510742, + "learning_rate": 3.891225961538462e-05, + "loss": 0.9278, + "step": 839 + }, + { + "epoch": 0.49001020854601135, + "grad_norm": 1.068732500076294, + "learning_rate": 3.889723557692308e-05, + "loss": 0.8247, + "step": 840 + }, + { + "epoch": 0.4905935540323757, + "grad_norm": 1.1534667015075684, + "learning_rate": 3.888221153846154e-05, + "loss": 0.9824, + "step": 841 + }, + { + "epoch": 0.49117689951874, + "grad_norm": 1.3575752973556519, + "learning_rate": 3.88671875e-05, + "loss": 1.0802, + "step": 842 + }, + { + "epoch": 0.4917602450051043, + "grad_norm": 1.2261594533920288, + "learning_rate": 3.885216346153847e-05, + "loss": 1.1202, + "step": 843 + }, + { + "epoch": 0.4923435904914686, + "grad_norm": 1.2742036581039429, + "learning_rate": 3.8837139423076926e-05, + "loss": 1.0831, + "step": 844 + }, + { + "epoch": 0.4929269359778329, + "grad_norm": 1.2143781185150146, + "learning_rate": 3.8822115384615384e-05, + "loss": 1.0475, + "step": 845 + }, + { + "epoch": 0.4935102814641972, + "grad_norm": 1.1177541017532349, + "learning_rate": 3.880709134615384e-05, + "loss": 1.0494, + "step": 846 + }, + { + "epoch": 0.49409362695056147, + "grad_norm": 1.1861687898635864, + "learning_rate": 3.879206730769231e-05, + "loss": 0.9165, + "step": 847 + }, + { + "epoch": 0.49467697243692577, + "grad_norm": 1.2765378952026367, + "learning_rate": 3.877704326923077e-05, + "loss": 0.891, + "step": 848 + }, + { + "epoch": 0.49526031792329006, + "grad_norm": 1.1258078813552856, + "learning_rate": 3.876201923076923e-05, + "loss": 0.9761, + "step": 849 + }, + { + "epoch": 0.49584366340965436, + "grad_norm": 1.1065664291381836, + "learning_rate": 3.87469951923077e-05, + "loss": 0.9383, + "step": 850 + }, + { + "epoch": 0.49642700889601865, + "grad_norm": 1.1673333644866943, + "learning_rate": 3.8731971153846156e-05, + "loss": 0.9632, + "step": 851 + }, + { + "epoch": 0.49701035438238295, + "grad_norm": 1.428477168083191, + "learning_rate": 3.8716947115384614e-05, + "loss": 0.9079, + "step": 852 + }, + { + "epoch": 0.49759369986874724, + "grad_norm": 1.1232894659042358, + "learning_rate": 3.870192307692308e-05, + "loss": 0.9956, + "step": 853 + }, + { + "epoch": 0.4981770453551116, + "grad_norm": 1.2544480562210083, + "learning_rate": 3.868689903846154e-05, + "loss": 1.0855, + "step": 854 + }, + { + "epoch": 0.4987603908414759, + "grad_norm": 1.152759313583374, + "learning_rate": 3.8671875e-05, + "loss": 1.0324, + "step": 855 + }, + { + "epoch": 0.4993437363278402, + "grad_norm": 1.1294784545898438, + "learning_rate": 3.865685096153847e-05, + "loss": 0.9175, + "step": 856 + }, + { + "epoch": 0.4999270818142045, + "grad_norm": 1.2857279777526855, + "learning_rate": 3.864182692307693e-05, + "loss": 1.2653, + "step": 857 + }, + { + "epoch": 0.5005104273005687, + "grad_norm": 1.9902052879333496, + "learning_rate": 3.8626802884615385e-05, + "loss": 0.9604, + "step": 858 + }, + { + "epoch": 0.501093772786933, + "grad_norm": 1.4721367359161377, + "learning_rate": 3.8611778846153844e-05, + "loss": 1.066, + "step": 859 + }, + { + "epoch": 0.5016771182732973, + "grad_norm": 1.022174596786499, + "learning_rate": 3.859675480769231e-05, + "loss": 0.9604, + "step": 860 + }, + { + "epoch": 0.5022604637596617, + "grad_norm": 1.3919785022735596, + "learning_rate": 3.8581730769230775e-05, + "loss": 0.9068, + "step": 861 + }, + { + "epoch": 0.502843809246026, + "grad_norm": 1.2962651252746582, + "learning_rate": 3.856670673076923e-05, + "loss": 1.1166, + "step": 862 + }, + { + "epoch": 0.5034271547323903, + "grad_norm": 1.2550551891326904, + "learning_rate": 3.855168269230769e-05, + "loss": 0.9605, + "step": 863 + }, + { + "epoch": 0.5040105002187546, + "grad_norm": 1.1890307664871216, + "learning_rate": 3.853665865384616e-05, + "loss": 1.1311, + "step": 864 + }, + { + "epoch": 0.5045938457051189, + "grad_norm": 1.2987920045852661, + "learning_rate": 3.8521634615384615e-05, + "loss": 1.0043, + "step": 865 + }, + { + "epoch": 0.5051771911914832, + "grad_norm": 1.0546956062316895, + "learning_rate": 3.850661057692308e-05, + "loss": 1.1154, + "step": 866 + }, + { + "epoch": 0.5057605366778475, + "grad_norm": 1.1739052534103394, + "learning_rate": 3.849158653846154e-05, + "loss": 1.0501, + "step": 867 + }, + { + "epoch": 0.5063438821642118, + "grad_norm": 1.171311378479004, + "learning_rate": 3.8476562500000004e-05, + "loss": 1.109, + "step": 868 + }, + { + "epoch": 0.5069272276505761, + "grad_norm": 1.0606648921966553, + "learning_rate": 3.846153846153846e-05, + "loss": 0.9756, + "step": 869 + }, + { + "epoch": 0.5075105731369404, + "grad_norm": 1.1545205116271973, + "learning_rate": 3.844651442307692e-05, + "loss": 1.073, + "step": 870 + }, + { + "epoch": 0.5080939186233047, + "grad_norm": 1.2131757736206055, + "learning_rate": 3.843149038461539e-05, + "loss": 1.1398, + "step": 871 + }, + { + "epoch": 0.508677264109669, + "grad_norm": 1.140005111694336, + "learning_rate": 3.8416466346153845e-05, + "loss": 0.9707, + "step": 872 + }, + { + "epoch": 0.5092606095960333, + "grad_norm": 1.253605604171753, + "learning_rate": 3.840144230769231e-05, + "loss": 1.1389, + "step": 873 + }, + { + "epoch": 0.5098439550823975, + "grad_norm": 1.2395936250686646, + "learning_rate": 3.8386418269230776e-05, + "loss": 1.0647, + "step": 874 + }, + { + "epoch": 0.5104273005687618, + "grad_norm": 1.092179298400879, + "learning_rate": 3.8371394230769234e-05, + "loss": 1.0585, + "step": 875 + }, + { + "epoch": 0.5110106460551261, + "grad_norm": 1.6231690645217896, + "learning_rate": 3.835637019230769e-05, + "loss": 0.8506, + "step": 876 + }, + { + "epoch": 0.5115939915414904, + "grad_norm": 1.1169666051864624, + "learning_rate": 3.834134615384616e-05, + "loss": 0.8958, + "step": 877 + }, + { + "epoch": 0.5121773370278547, + "grad_norm": 1.2263548374176025, + "learning_rate": 3.8326322115384616e-05, + "loss": 0.9337, + "step": 878 + }, + { + "epoch": 0.512760682514219, + "grad_norm": 1.1182410717010498, + "learning_rate": 3.831129807692308e-05, + "loss": 0.8743, + "step": 879 + }, + { + "epoch": 0.5133440280005833, + "grad_norm": 1.1924457550048828, + "learning_rate": 3.829627403846154e-05, + "loss": 0.9632, + "step": 880 + }, + { + "epoch": 0.5139273734869476, + "grad_norm": 1.2248971462249756, + "learning_rate": 3.828125e-05, + "loss": 1.1191, + "step": 881 + }, + { + "epoch": 0.5145107189733119, + "grad_norm": 1.1758016347885132, + "learning_rate": 3.8266225961538464e-05, + "loss": 0.9399, + "step": 882 + }, + { + "epoch": 0.5150940644596762, + "grad_norm": 1.1751339435577393, + "learning_rate": 3.825120192307692e-05, + "loss": 0.8132, + "step": 883 + }, + { + "epoch": 0.5156774099460405, + "grad_norm": 1.093839168548584, + "learning_rate": 3.823617788461539e-05, + "loss": 1.1629, + "step": 884 + }, + { + "epoch": 0.5162607554324048, + "grad_norm": 3.200582265853882, + "learning_rate": 3.8221153846153846e-05, + "loss": 1.1263, + "step": 885 + }, + { + "epoch": 0.5168441009187691, + "grad_norm": 1.2929569482803345, + "learning_rate": 3.820612980769231e-05, + "loss": 1.1846, + "step": 886 + }, + { + "epoch": 0.5174274464051335, + "grad_norm": 1.3224855661392212, + "learning_rate": 3.819110576923077e-05, + "loss": 0.9277, + "step": 887 + }, + { + "epoch": 0.5180107918914978, + "grad_norm": 1.1607059240341187, + "learning_rate": 3.817608173076923e-05, + "loss": 0.9946, + "step": 888 + }, + { + "epoch": 0.5185941373778621, + "grad_norm": 2.7251431941986084, + "learning_rate": 3.8161057692307694e-05, + "loss": 1.1296, + "step": 889 + }, + { + "epoch": 0.5191774828642264, + "grad_norm": 1.1163549423217773, + "learning_rate": 3.814603365384616e-05, + "loss": 0.9401, + "step": 890 + }, + { + "epoch": 0.5197608283505907, + "grad_norm": 1.1019116640090942, + "learning_rate": 3.813100961538462e-05, + "loss": 0.8071, + "step": 891 + }, + { + "epoch": 0.520344173836955, + "grad_norm": 1.1490522623062134, + "learning_rate": 3.811598557692308e-05, + "loss": 0.9441, + "step": 892 + }, + { + "epoch": 0.5209275193233193, + "grad_norm": 1.1176124811172485, + "learning_rate": 3.810096153846154e-05, + "loss": 1.0805, + "step": 893 + }, + { + "epoch": 0.5215108648096836, + "grad_norm": 1.4119200706481934, + "learning_rate": 3.80859375e-05, + "loss": 1.3708, + "step": 894 + }, + { + "epoch": 0.5220942102960479, + "grad_norm": 1.2920783758163452, + "learning_rate": 3.8070913461538465e-05, + "loss": 1.2315, + "step": 895 + }, + { + "epoch": 0.5226775557824122, + "grad_norm": 1.2457376718521118, + "learning_rate": 3.8055889423076924e-05, + "loss": 1.0076, + "step": 896 + }, + { + "epoch": 0.5232609012687764, + "grad_norm": 1.5919691324234009, + "learning_rate": 3.804086538461539e-05, + "loss": 0.9201, + "step": 897 + }, + { + "epoch": 0.5238442467551407, + "grad_norm": 1.3675076961517334, + "learning_rate": 3.802584134615385e-05, + "loss": 0.9394, + "step": 898 + }, + { + "epoch": 0.524427592241505, + "grad_norm": 1.148927927017212, + "learning_rate": 3.8010817307692306e-05, + "loss": 0.8612, + "step": 899 + }, + { + "epoch": 0.5250109377278693, + "grad_norm": 1.258003830909729, + "learning_rate": 3.799579326923077e-05, + "loss": 1.1393, + "step": 900 + }, + { + "epoch": 0.5255942832142336, + "grad_norm": 1.0879266262054443, + "learning_rate": 3.798076923076923e-05, + "loss": 1.2231, + "step": 901 + }, + { + "epoch": 0.5261776287005979, + "grad_norm": 1.3922462463378906, + "learning_rate": 3.7965745192307695e-05, + "loss": 1.0593, + "step": 902 + }, + { + "epoch": 0.5267609741869622, + "grad_norm": 1.225117564201355, + "learning_rate": 3.795072115384616e-05, + "loss": 1.2312, + "step": 903 + }, + { + "epoch": 0.5273443196733265, + "grad_norm": 1.3208086490631104, + "learning_rate": 3.793569711538462e-05, + "loss": 1.3092, + "step": 904 + }, + { + "epoch": 0.5279276651596908, + "grad_norm": 1.0285180807113647, + "learning_rate": 3.792067307692308e-05, + "loss": 0.9734, + "step": 905 + }, + { + "epoch": 0.5285110106460551, + "grad_norm": 1.4396753311157227, + "learning_rate": 3.7905649038461536e-05, + "loss": 1.2026, + "step": 906 + }, + { + "epoch": 0.5290943561324194, + "grad_norm": 0.9777195453643799, + "learning_rate": 3.7890625e-05, + "loss": 0.9129, + "step": 907 + }, + { + "epoch": 0.5296777016187837, + "grad_norm": 1.0864167213439941, + "learning_rate": 3.7875600961538466e-05, + "loss": 0.8855, + "step": 908 + }, + { + "epoch": 0.530261047105148, + "grad_norm": 1.0939980745315552, + "learning_rate": 3.7860576923076925e-05, + "loss": 1.0853, + "step": 909 + }, + { + "epoch": 0.5308443925915123, + "grad_norm": 1.1439995765686035, + "learning_rate": 3.784555288461539e-05, + "loss": 1.1568, + "step": 910 + }, + { + "epoch": 0.5314277380778766, + "grad_norm": 1.2120246887207031, + "learning_rate": 3.783052884615385e-05, + "loss": 0.9977, + "step": 911 + }, + { + "epoch": 0.5320110835642409, + "grad_norm": 1.0372374057769775, + "learning_rate": 3.781550480769231e-05, + "loss": 0.8251, + "step": 912 + }, + { + "epoch": 0.5325944290506052, + "grad_norm": 1.2008637189865112, + "learning_rate": 3.780048076923077e-05, + "loss": 1.0291, + "step": 913 + }, + { + "epoch": 0.5331777745369696, + "grad_norm": 1.475831151008606, + "learning_rate": 3.778545673076923e-05, + "loss": 1.0554, + "step": 914 + }, + { + "epoch": 0.5337611200233339, + "grad_norm": 1.3396097421646118, + "learning_rate": 3.7770432692307696e-05, + "loss": 1.0433, + "step": 915 + }, + { + "epoch": 0.5343444655096982, + "grad_norm": 1.40950345993042, + "learning_rate": 3.775540865384616e-05, + "loss": 0.963, + "step": 916 + }, + { + "epoch": 0.5349278109960625, + "grad_norm": 1.8577003479003906, + "learning_rate": 3.774038461538461e-05, + "loss": 0.747, + "step": 917 + }, + { + "epoch": 0.5355111564824268, + "grad_norm": 1.0990678071975708, + "learning_rate": 3.772536057692308e-05, + "loss": 0.869, + "step": 918 + }, + { + "epoch": 0.536094501968791, + "grad_norm": 1.4199498891830444, + "learning_rate": 3.771033653846154e-05, + "loss": 0.8398, + "step": 919 + }, + { + "epoch": 0.5366778474551553, + "grad_norm": 1.5259379148483276, + "learning_rate": 3.76953125e-05, + "loss": 1.0978, + "step": 920 + }, + { + "epoch": 0.5372611929415196, + "grad_norm": 1.208513617515564, + "learning_rate": 3.768028846153847e-05, + "loss": 0.9836, + "step": 921 + }, + { + "epoch": 0.5378445384278839, + "grad_norm": 1.3118703365325928, + "learning_rate": 3.7665264423076926e-05, + "loss": 0.9274, + "step": 922 + }, + { + "epoch": 0.5384278839142482, + "grad_norm": 1.2613270282745361, + "learning_rate": 3.7650240384615385e-05, + "loss": 1.095, + "step": 923 + }, + { + "epoch": 0.5390112294006125, + "grad_norm": 1.056458592414856, + "learning_rate": 3.763521634615384e-05, + "loss": 1.0447, + "step": 924 + }, + { + "epoch": 0.5395945748869768, + "grad_norm": 1.104421854019165, + "learning_rate": 3.762019230769231e-05, + "loss": 0.9413, + "step": 925 + }, + { + "epoch": 0.5401779203733411, + "grad_norm": 1.1966310739517212, + "learning_rate": 3.7605168269230774e-05, + "loss": 1.0595, + "step": 926 + }, + { + "epoch": 0.5407612658597054, + "grad_norm": 1.1758238077163696, + "learning_rate": 3.759014423076923e-05, + "loss": 1.0223, + "step": 927 + }, + { + "epoch": 0.5413446113460697, + "grad_norm": 1.850199580192566, + "learning_rate": 3.75751201923077e-05, + "loss": 1.0916, + "step": 928 + }, + { + "epoch": 0.541927956832434, + "grad_norm": 1.0883837938308716, + "learning_rate": 3.7560096153846156e-05, + "loss": 1.1043, + "step": 929 + }, + { + "epoch": 0.5425113023187983, + "grad_norm": 1.1901664733886719, + "learning_rate": 3.7545072115384614e-05, + "loss": 1.0474, + "step": 930 + }, + { + "epoch": 0.5430946478051626, + "grad_norm": 1.5722588300704956, + "learning_rate": 3.753004807692308e-05, + "loss": 0.9751, + "step": 931 + }, + { + "epoch": 0.5436779932915269, + "grad_norm": 1.2307900190353394, + "learning_rate": 3.751502403846154e-05, + "loss": 1.0156, + "step": 932 + }, + { + "epoch": 0.5442613387778912, + "grad_norm": 1.954958200454712, + "learning_rate": 3.7500000000000003e-05, + "loss": 1.0679, + "step": 933 + }, + { + "epoch": 0.5448446842642555, + "grad_norm": 1.6473888158798218, + "learning_rate": 3.748497596153847e-05, + "loss": 0.9303, + "step": 934 + }, + { + "epoch": 0.5454280297506198, + "grad_norm": 1.390716314315796, + "learning_rate": 3.746995192307692e-05, + "loss": 1.0829, + "step": 935 + }, + { + "epoch": 0.5460113752369841, + "grad_norm": 1.2111623287200928, + "learning_rate": 3.7454927884615386e-05, + "loss": 1.1691, + "step": 936 + }, + { + "epoch": 0.5465947207233484, + "grad_norm": 1.2147167921066284, + "learning_rate": 3.7439903846153844e-05, + "loss": 1.0071, + "step": 937 + }, + { + "epoch": 0.5471780662097127, + "grad_norm": 1.4748153686523438, + "learning_rate": 3.742487980769231e-05, + "loss": 0.945, + "step": 938 + }, + { + "epoch": 0.547761411696077, + "grad_norm": 1.2974998950958252, + "learning_rate": 3.7409855769230775e-05, + "loss": 0.9262, + "step": 939 + }, + { + "epoch": 0.5483447571824414, + "grad_norm": 1.0850262641906738, + "learning_rate": 3.739483173076923e-05, + "loss": 0.8514, + "step": 940 + }, + { + "epoch": 0.5489281026688057, + "grad_norm": 1.409722924232483, + "learning_rate": 3.737980769230769e-05, + "loss": 1.0453, + "step": 941 + }, + { + "epoch": 0.54951144815517, + "grad_norm": 1.253421664237976, + "learning_rate": 3.736478365384616e-05, + "loss": 0.8305, + "step": 942 + }, + { + "epoch": 0.5500947936415342, + "grad_norm": 1.1697018146514893, + "learning_rate": 3.7349759615384616e-05, + "loss": 1.0097, + "step": 943 + }, + { + "epoch": 0.5506781391278985, + "grad_norm": 1.3280630111694336, + "learning_rate": 3.733473557692308e-05, + "loss": 1.1064, + "step": 944 + }, + { + "epoch": 0.5512614846142628, + "grad_norm": 1.7148746252059937, + "learning_rate": 3.731971153846154e-05, + "loss": 1.152, + "step": 945 + }, + { + "epoch": 0.5518448301006271, + "grad_norm": 1.2414015531539917, + "learning_rate": 3.7304687500000005e-05, + "loss": 1.4578, + "step": 946 + }, + { + "epoch": 0.5524281755869914, + "grad_norm": 1.5570706129074097, + "learning_rate": 3.728966346153846e-05, + "loss": 1.1254, + "step": 947 + }, + { + "epoch": 0.5530115210733557, + "grad_norm": 1.2762681245803833, + "learning_rate": 3.727463942307692e-05, + "loss": 1.0296, + "step": 948 + }, + { + "epoch": 0.55359486655972, + "grad_norm": 1.1472910642623901, + "learning_rate": 3.725961538461539e-05, + "loss": 0.9723, + "step": 949 + }, + { + "epoch": 0.5541782120460843, + "grad_norm": 1.429591417312622, + "learning_rate": 3.7244591346153845e-05, + "loss": 1.0805, + "step": 950 + }, + { + "epoch": 0.5547615575324486, + "grad_norm": 1.136590600013733, + "learning_rate": 3.722956730769231e-05, + "loss": 1.2051, + "step": 951 + }, + { + "epoch": 0.5553449030188129, + "grad_norm": 1.1960887908935547, + "learning_rate": 3.7214543269230776e-05, + "loss": 0.9221, + "step": 952 + }, + { + "epoch": 0.5559282485051772, + "grad_norm": 1.2155873775482178, + "learning_rate": 3.719951923076923e-05, + "loss": 0.8845, + "step": 953 + }, + { + "epoch": 0.5565115939915415, + "grad_norm": 1.4684885740280151, + "learning_rate": 3.718449519230769e-05, + "loss": 1.1217, + "step": 954 + }, + { + "epoch": 0.5570949394779058, + "grad_norm": 1.1232365369796753, + "learning_rate": 3.716947115384616e-05, + "loss": 0.96, + "step": 955 + }, + { + "epoch": 0.5576782849642701, + "grad_norm": 1.3967763185501099, + "learning_rate": 3.715444711538462e-05, + "loss": 1.1756, + "step": 956 + }, + { + "epoch": 0.5582616304506344, + "grad_norm": 1.3869478702545166, + "learning_rate": 3.713942307692308e-05, + "loss": 0.904, + "step": 957 + }, + { + "epoch": 0.5588449759369987, + "grad_norm": 1.5184909105300903, + "learning_rate": 3.712439903846154e-05, + "loss": 1.065, + "step": 958 + }, + { + "epoch": 0.559428321423363, + "grad_norm": 1.1123069524765015, + "learning_rate": 3.7109375e-05, + "loss": 0.9782, + "step": 959 + }, + { + "epoch": 0.5600116669097273, + "grad_norm": 1.0042744874954224, + "learning_rate": 3.7094350961538464e-05, + "loss": 1.0655, + "step": 960 + }, + { + "epoch": 0.5605950123960916, + "grad_norm": 1.103111743927002, + "learning_rate": 3.707932692307692e-05, + "loss": 1.0633, + "step": 961 + }, + { + "epoch": 0.5611783578824558, + "grad_norm": 1.1853861808776855, + "learning_rate": 3.706430288461539e-05, + "loss": 1.1423, + "step": 962 + }, + { + "epoch": 0.5617617033688201, + "grad_norm": 1.1413124799728394, + "learning_rate": 3.704927884615385e-05, + "loss": 0.9878, + "step": 963 + }, + { + "epoch": 0.5623450488551844, + "grad_norm": 1.1289325952529907, + "learning_rate": 3.703425480769231e-05, + "loss": 0.9265, + "step": 964 + }, + { + "epoch": 0.5629283943415487, + "grad_norm": 1.21286141872406, + "learning_rate": 3.701923076923077e-05, + "loss": 0.9536, + "step": 965 + }, + { + "epoch": 0.563511739827913, + "grad_norm": 1.1824188232421875, + "learning_rate": 3.700420673076923e-05, + "loss": 0.8876, + "step": 966 + }, + { + "epoch": 0.5640950853142774, + "grad_norm": 1.1528284549713135, + "learning_rate": 3.6989182692307694e-05, + "loss": 0.9186, + "step": 967 + }, + { + "epoch": 0.5646784308006417, + "grad_norm": 0.9889248609542847, + "learning_rate": 3.697415865384616e-05, + "loss": 1.0231, + "step": 968 + }, + { + "epoch": 0.565261776287006, + "grad_norm": 1.1389588117599487, + "learning_rate": 3.695913461538462e-05, + "loss": 0.9333, + "step": 969 + }, + { + "epoch": 0.5658451217733703, + "grad_norm": 1.3849660158157349, + "learning_rate": 3.694411057692308e-05, + "loss": 1.016, + "step": 970 + }, + { + "epoch": 0.5664284672597346, + "grad_norm": 1.1598434448242188, + "learning_rate": 3.6929086538461535e-05, + "loss": 1.2011, + "step": 971 + }, + { + "epoch": 0.5670118127460989, + "grad_norm": 1.3359456062316895, + "learning_rate": 3.69140625e-05, + "loss": 1.0756, + "step": 972 + }, + { + "epoch": 0.5675951582324632, + "grad_norm": 1.4474626779556274, + "learning_rate": 3.6899038461538466e-05, + "loss": 1.0561, + "step": 973 + }, + { + "epoch": 0.5681785037188275, + "grad_norm": 1.1154292821884155, + "learning_rate": 3.6884014423076924e-05, + "loss": 0.8292, + "step": 974 + }, + { + "epoch": 0.5687618492051918, + "grad_norm": 1.0911437273025513, + "learning_rate": 3.686899038461539e-05, + "loss": 0.9813, + "step": 975 + }, + { + "epoch": 0.5693451946915561, + "grad_norm": 1.3380028009414673, + "learning_rate": 3.685396634615385e-05, + "loss": 1.2311, + "step": 976 + }, + { + "epoch": 0.5699285401779204, + "grad_norm": 1.0942449569702148, + "learning_rate": 3.6838942307692306e-05, + "loss": 0.8704, + "step": 977 + }, + { + "epoch": 0.5705118856642847, + "grad_norm": 1.0522291660308838, + "learning_rate": 3.682391826923077e-05, + "loss": 0.7929, + "step": 978 + }, + { + "epoch": 0.571095231150649, + "grad_norm": 1.7213867902755737, + "learning_rate": 3.680889423076923e-05, + "loss": 1.2793, + "step": 979 + }, + { + "epoch": 0.5716785766370133, + "grad_norm": 1.2406498193740845, + "learning_rate": 3.6793870192307695e-05, + "loss": 0.8329, + "step": 980 + }, + { + "epoch": 0.5722619221233776, + "grad_norm": 1.3200228214263916, + "learning_rate": 3.677884615384616e-05, + "loss": 1.0274, + "step": 981 + }, + { + "epoch": 0.5728452676097419, + "grad_norm": 1.1541839838027954, + "learning_rate": 3.676382211538462e-05, + "loss": 0.9626, + "step": 982 + }, + { + "epoch": 0.5734286130961062, + "grad_norm": 1.1976728439331055, + "learning_rate": 3.674879807692308e-05, + "loss": 0.8529, + "step": 983 + }, + { + "epoch": 0.5740119585824704, + "grad_norm": 1.458693027496338, + "learning_rate": 3.6733774038461536e-05, + "loss": 1.1267, + "step": 984 + }, + { + "epoch": 0.5745953040688347, + "grad_norm": 1.5604653358459473, + "learning_rate": 3.671875e-05, + "loss": 1.0798, + "step": 985 + }, + { + "epoch": 0.575178649555199, + "grad_norm": 1.269396424293518, + "learning_rate": 3.670372596153847e-05, + "loss": 1.0479, + "step": 986 + }, + { + "epoch": 0.5757619950415633, + "grad_norm": 1.206891655921936, + "learning_rate": 3.6688701923076925e-05, + "loss": 0.7532, + "step": 987 + }, + { + "epoch": 0.5763453405279276, + "grad_norm": 1.3230568170547485, + "learning_rate": 3.667367788461539e-05, + "loss": 0.9374, + "step": 988 + }, + { + "epoch": 0.5769286860142919, + "grad_norm": 1.2090986967086792, + "learning_rate": 3.665865384615384e-05, + "loss": 0.8378, + "step": 989 + }, + { + "epoch": 0.5775120315006562, + "grad_norm": 1.1172541379928589, + "learning_rate": 3.664362980769231e-05, + "loss": 0.9294, + "step": 990 + }, + { + "epoch": 0.5780953769870205, + "grad_norm": 1.2599154710769653, + "learning_rate": 3.662860576923077e-05, + "loss": 1.0339, + "step": 991 + }, + { + "epoch": 0.5786787224733848, + "grad_norm": 1.5349094867706299, + "learning_rate": 3.661358173076923e-05, + "loss": 1.2734, + "step": 992 + }, + { + "epoch": 0.5792620679597492, + "grad_norm": 1.4437059164047241, + "learning_rate": 3.6598557692307697e-05, + "loss": 1.0486, + "step": 993 + }, + { + "epoch": 0.5798454134461135, + "grad_norm": 1.0442750453948975, + "learning_rate": 3.6583533653846155e-05, + "loss": 0.9691, + "step": 994 + }, + { + "epoch": 0.5804287589324778, + "grad_norm": 1.3448185920715332, + "learning_rate": 3.6568509615384614e-05, + "loss": 1.2887, + "step": 995 + }, + { + "epoch": 0.5810121044188421, + "grad_norm": 1.0911777019500732, + "learning_rate": 3.655348557692308e-05, + "loss": 1.0297, + "step": 996 + }, + { + "epoch": 0.5815954499052064, + "grad_norm": 1.1703611612319946, + "learning_rate": 3.653846153846154e-05, + "loss": 0.8926, + "step": 997 + }, + { + "epoch": 0.5821787953915707, + "grad_norm": 1.7544491291046143, + "learning_rate": 3.65234375e-05, + "loss": 1.1923, + "step": 998 + }, + { + "epoch": 0.582762140877935, + "grad_norm": 1.3902952671051025, + "learning_rate": 3.650841346153847e-05, + "loss": 0.8508, + "step": 999 + }, + { + "epoch": 0.5833454863642993, + "grad_norm": 1.1169430017471313, + "learning_rate": 3.6493389423076926e-05, + "loss": 0.9376, + "step": 1000 + }, + { + "epoch": 0.5833454863642993, + "eval_loss_squad": 0.8289175107888878, + "eval_perplexity": 8.038335329537642, + "eval_perplexity_reconstruct": 1.9354374343602856, + "step": 1000 + }, + { + "epoch": 0.5839288318506636, + "grad_norm": 1.0589932203292847, + "learning_rate": 3.6478365384615385e-05, + "loss": 0.8848, + "step": 1001 + }, + { + "epoch": 0.5845121773370279, + "grad_norm": 1.3385180234909058, + "learning_rate": 3.6463341346153843e-05, + "loss": 1.005, + "step": 1002 + }, + { + "epoch": 0.5850955228233922, + "grad_norm": 1.3170318603515625, + "learning_rate": 3.644831730769231e-05, + "loss": 1.0717, + "step": 1003 + }, + { + "epoch": 0.5856788683097565, + "grad_norm": 1.134464144706726, + "learning_rate": 3.6433293269230774e-05, + "loss": 1.1363, + "step": 1004 + }, + { + "epoch": 0.5862622137961208, + "grad_norm": 1.2407094240188599, + "learning_rate": 3.641826923076923e-05, + "loss": 0.8989, + "step": 1005 + }, + { + "epoch": 0.586845559282485, + "grad_norm": 1.040368914604187, + "learning_rate": 3.64032451923077e-05, + "loss": 0.7677, + "step": 1006 + }, + { + "epoch": 0.5874289047688493, + "grad_norm": 1.2355806827545166, + "learning_rate": 3.6388221153846156e-05, + "loss": 1.0903, + "step": 1007 + }, + { + "epoch": 0.5880122502552136, + "grad_norm": 2.0347537994384766, + "learning_rate": 3.6373197115384615e-05, + "loss": 1.0069, + "step": 1008 + }, + { + "epoch": 0.5885955957415779, + "grad_norm": 1.3925468921661377, + "learning_rate": 3.635817307692308e-05, + "loss": 1.1185, + "step": 1009 + }, + { + "epoch": 0.5891789412279422, + "grad_norm": 1.4087902307510376, + "learning_rate": 3.634314903846154e-05, + "loss": 1.0334, + "step": 1010 + }, + { + "epoch": 0.5897622867143065, + "grad_norm": 1.5874099731445312, + "learning_rate": 3.6328125000000004e-05, + "loss": 1.1728, + "step": 1011 + }, + { + "epoch": 0.5903456322006708, + "grad_norm": 1.1461565494537354, + "learning_rate": 3.631310096153846e-05, + "loss": 1.1395, + "step": 1012 + }, + { + "epoch": 0.5909289776870351, + "grad_norm": 1.3284554481506348, + "learning_rate": 3.629807692307692e-05, + "loss": 0.9558, + "step": 1013 + }, + { + "epoch": 0.5915123231733994, + "grad_norm": 1.1890766620635986, + "learning_rate": 3.6283052884615386e-05, + "loss": 0.8911, + "step": 1014 + }, + { + "epoch": 0.5920956686597637, + "grad_norm": 1.15906822681427, + "learning_rate": 3.6268028846153845e-05, + "loss": 0.8283, + "step": 1015 + }, + { + "epoch": 0.592679014146128, + "grad_norm": 2.1104965209960938, + "learning_rate": 3.625300480769231e-05, + "loss": 0.8631, + "step": 1016 + }, + { + "epoch": 0.5932623596324923, + "grad_norm": 1.0914028882980347, + "learning_rate": 3.6237980769230775e-05, + "loss": 0.9244, + "step": 1017 + }, + { + "epoch": 0.5938457051188566, + "grad_norm": 1.1843571662902832, + "learning_rate": 3.6222956730769234e-05, + "loss": 0.9798, + "step": 1018 + }, + { + "epoch": 0.5944290506052209, + "grad_norm": 2.413841724395752, + "learning_rate": 3.620793269230769e-05, + "loss": 0.9308, + "step": 1019 + }, + { + "epoch": 0.5950123960915853, + "grad_norm": 2.1389975547790527, + "learning_rate": 3.619290865384616e-05, + "loss": 1.0376, + "step": 1020 + }, + { + "epoch": 0.5955957415779496, + "grad_norm": 1.1918492317199707, + "learning_rate": 3.6177884615384616e-05, + "loss": 0.998, + "step": 1021 + }, + { + "epoch": 0.5961790870643139, + "grad_norm": 1.1064860820770264, + "learning_rate": 3.616286057692308e-05, + "loss": 1.0828, + "step": 1022 + }, + { + "epoch": 0.5967624325506782, + "grad_norm": 1.1806142330169678, + "learning_rate": 3.614783653846154e-05, + "loss": 1.1629, + "step": 1023 + }, + { + "epoch": 0.5973457780370425, + "grad_norm": 1.266573429107666, + "learning_rate": 3.6132812500000005e-05, + "loss": 0.9503, + "step": 1024 + }, + { + "epoch": 0.5979291235234068, + "grad_norm": 1.1975195407867432, + "learning_rate": 3.6117788461538463e-05, + "loss": 0.9898, + "step": 1025 + }, + { + "epoch": 0.5985124690097711, + "grad_norm": 1.3326911926269531, + "learning_rate": 3.610276442307692e-05, + "loss": 1.0076, + "step": 1026 + }, + { + "epoch": 0.5990958144961354, + "grad_norm": 1.2400132417678833, + "learning_rate": 3.608774038461539e-05, + "loss": 1.0404, + "step": 1027 + }, + { + "epoch": 0.5996791599824997, + "grad_norm": 1.5326324701309204, + "learning_rate": 3.6072716346153846e-05, + "loss": 0.9418, + "step": 1028 + }, + { + "epoch": 0.600262505468864, + "grad_norm": 1.4190855026245117, + "learning_rate": 3.605769230769231e-05, + "loss": 1.0581, + "step": 1029 + }, + { + "epoch": 0.6008458509552282, + "grad_norm": 1.4408974647521973, + "learning_rate": 3.604266826923077e-05, + "loss": 0.8873, + "step": 1030 + }, + { + "epoch": 0.6014291964415925, + "grad_norm": 1.0051006078720093, + "learning_rate": 3.602764423076923e-05, + "loss": 0.9694, + "step": 1031 + }, + { + "epoch": 0.6020125419279568, + "grad_norm": 1.304622769355774, + "learning_rate": 3.601262019230769e-05, + "loss": 0.894, + "step": 1032 + }, + { + "epoch": 0.6025958874143211, + "grad_norm": 1.6079553365707397, + "learning_rate": 3.599759615384616e-05, + "loss": 1.134, + "step": 1033 + }, + { + "epoch": 0.6031792329006854, + "grad_norm": 1.3666532039642334, + "learning_rate": 3.598257211538462e-05, + "loss": 1.171, + "step": 1034 + }, + { + "epoch": 0.6037625783870497, + "grad_norm": 1.1331239938735962, + "learning_rate": 3.596754807692308e-05, + "loss": 1.014, + "step": 1035 + }, + { + "epoch": 0.604345923873414, + "grad_norm": 1.153573989868164, + "learning_rate": 3.595252403846154e-05, + "loss": 1.1011, + "step": 1036 + }, + { + "epoch": 0.6049292693597783, + "grad_norm": 1.182096004486084, + "learning_rate": 3.59375e-05, + "loss": 1.1392, + "step": 1037 + }, + { + "epoch": 0.6055126148461426, + "grad_norm": 1.1555769443511963, + "learning_rate": 3.5922475961538465e-05, + "loss": 0.8643, + "step": 1038 + }, + { + "epoch": 0.6060959603325069, + "grad_norm": 1.103007197380066, + "learning_rate": 3.590745192307692e-05, + "loss": 0.8785, + "step": 1039 + }, + { + "epoch": 0.6066793058188712, + "grad_norm": 1.2092280387878418, + "learning_rate": 3.589242788461539e-05, + "loss": 1.0269, + "step": 1040 + }, + { + "epoch": 0.6072626513052355, + "grad_norm": 1.2705990076065063, + "learning_rate": 3.587740384615385e-05, + "loss": 0.8287, + "step": 1041 + }, + { + "epoch": 0.6078459967915998, + "grad_norm": 1.1593817472457886, + "learning_rate": 3.586237980769231e-05, + "loss": 0.8295, + "step": 1042 + }, + { + "epoch": 0.6084293422779641, + "grad_norm": 1.1436785459518433, + "learning_rate": 3.584735576923077e-05, + "loss": 0.9481, + "step": 1043 + }, + { + "epoch": 0.6090126877643284, + "grad_norm": 1.4250012636184692, + "learning_rate": 3.583233173076923e-05, + "loss": 1.2019, + "step": 1044 + }, + { + "epoch": 0.6095960332506927, + "grad_norm": 1.3171687126159668, + "learning_rate": 3.5817307692307695e-05, + "loss": 0.8549, + "step": 1045 + }, + { + "epoch": 0.6101793787370571, + "grad_norm": 1.4231929779052734, + "learning_rate": 3.580228365384616e-05, + "loss": 1.1054, + "step": 1046 + }, + { + "epoch": 0.6107627242234214, + "grad_norm": 1.2546910047531128, + "learning_rate": 3.578725961538462e-05, + "loss": 1.0624, + "step": 1047 + }, + { + "epoch": 0.6113460697097857, + "grad_norm": 1.1451596021652222, + "learning_rate": 3.577223557692308e-05, + "loss": 1.1191, + "step": 1048 + }, + { + "epoch": 0.61192941519615, + "grad_norm": 1.250524640083313, + "learning_rate": 3.5757211538461535e-05, + "loss": 1.0673, + "step": 1049 + }, + { + "epoch": 0.6125127606825143, + "grad_norm": 1.2106703519821167, + "learning_rate": 3.57421875e-05, + "loss": 0.9558, + "step": 1050 + }, + { + "epoch": 0.6130961061688786, + "grad_norm": 1.4316116571426392, + "learning_rate": 3.5727163461538466e-05, + "loss": 1.146, + "step": 1051 + }, + { + "epoch": 0.6136794516552428, + "grad_norm": 1.1764512062072754, + "learning_rate": 3.5712139423076924e-05, + "loss": 0.8677, + "step": 1052 + }, + { + "epoch": 0.6142627971416071, + "grad_norm": 1.0635274648666382, + "learning_rate": 3.569711538461539e-05, + "loss": 0.9785, + "step": 1053 + }, + { + "epoch": 0.6148461426279714, + "grad_norm": 1.1735188961029053, + "learning_rate": 3.568209134615385e-05, + "loss": 0.8116, + "step": 1054 + }, + { + "epoch": 0.6154294881143357, + "grad_norm": 1.1931421756744385, + "learning_rate": 3.566706730769231e-05, + "loss": 1.0047, + "step": 1055 + }, + { + "epoch": 0.6160128336007, + "grad_norm": 1.4057892560958862, + "learning_rate": 3.565204326923077e-05, + "loss": 0.9297, + "step": 1056 + }, + { + "epoch": 0.6165961790870643, + "grad_norm": 1.0752261877059937, + "learning_rate": 3.563701923076923e-05, + "loss": 0.9164, + "step": 1057 + }, + { + "epoch": 0.6171795245734286, + "grad_norm": 1.052547574043274, + "learning_rate": 3.5621995192307696e-05, + "loss": 1.0254, + "step": 1058 + }, + { + "epoch": 0.6177628700597929, + "grad_norm": 1.1009021997451782, + "learning_rate": 3.560697115384616e-05, + "loss": 0.888, + "step": 1059 + }, + { + "epoch": 0.6183462155461572, + "grad_norm": 1.2356963157653809, + "learning_rate": 3.559194711538462e-05, + "loss": 1.1234, + "step": 1060 + }, + { + "epoch": 0.6189295610325215, + "grad_norm": 1.765424370765686, + "learning_rate": 3.557692307692308e-05, + "loss": 1.2263, + "step": 1061 + }, + { + "epoch": 0.6195129065188858, + "grad_norm": 1.3897560834884644, + "learning_rate": 3.5561899038461536e-05, + "loss": 1.0729, + "step": 1062 + }, + { + "epoch": 0.6200962520052501, + "grad_norm": 1.1901566982269287, + "learning_rate": 3.5546875e-05, + "loss": 0.9953, + "step": 1063 + }, + { + "epoch": 0.6206795974916144, + "grad_norm": 1.4633679389953613, + "learning_rate": 3.553185096153847e-05, + "loss": 0.9616, + "step": 1064 + }, + { + "epoch": 0.6212629429779787, + "grad_norm": 1.3287920951843262, + "learning_rate": 3.5516826923076926e-05, + "loss": 1.0438, + "step": 1065 + }, + { + "epoch": 0.621846288464343, + "grad_norm": 1.7268593311309814, + "learning_rate": 3.5501802884615384e-05, + "loss": 0.8745, + "step": 1066 + }, + { + "epoch": 0.6224296339507073, + "grad_norm": 1.0686813592910767, + "learning_rate": 3.548677884615384e-05, + "loss": 0.8533, + "step": 1067 + }, + { + "epoch": 0.6230129794370716, + "grad_norm": 1.2637293338775635, + "learning_rate": 3.547175480769231e-05, + "loss": 0.8116, + "step": 1068 + }, + { + "epoch": 0.6235963249234359, + "grad_norm": 1.3439655303955078, + "learning_rate": 3.545673076923077e-05, + "loss": 1.1462, + "step": 1069 + }, + { + "epoch": 0.6241796704098002, + "grad_norm": 1.277295708656311, + "learning_rate": 3.544170673076923e-05, + "loss": 0.9344, + "step": 1070 + }, + { + "epoch": 0.6247630158961645, + "grad_norm": 1.2058827877044678, + "learning_rate": 3.54266826923077e-05, + "loss": 1.0197, + "step": 1071 + }, + { + "epoch": 0.6253463613825287, + "grad_norm": 1.288257360458374, + "learning_rate": 3.5411658653846155e-05, + "loss": 0.925, + "step": 1072 + }, + { + "epoch": 0.6259297068688932, + "grad_norm": 1.1143733263015747, + "learning_rate": 3.5396634615384614e-05, + "loss": 1.0083, + "step": 1073 + }, + { + "epoch": 0.6265130523552574, + "grad_norm": 1.0898163318634033, + "learning_rate": 3.538161057692308e-05, + "loss": 1.0865, + "step": 1074 + }, + { + "epoch": 0.6270963978416217, + "grad_norm": 1.1873056888580322, + "learning_rate": 3.536658653846154e-05, + "loss": 1.0579, + "step": 1075 + }, + { + "epoch": 0.627679743327986, + "grad_norm": 1.0526707172393799, + "learning_rate": 3.53515625e-05, + "loss": 1.0129, + "step": 1076 + }, + { + "epoch": 0.6282630888143503, + "grad_norm": 1.6970293521881104, + "learning_rate": 3.533653846153847e-05, + "loss": 1.1972, + "step": 1077 + }, + { + "epoch": 0.6288464343007146, + "grad_norm": 1.2696473598480225, + "learning_rate": 3.532151442307693e-05, + "loss": 0.8886, + "step": 1078 + }, + { + "epoch": 0.6294297797870789, + "grad_norm": 1.1362708806991577, + "learning_rate": 3.5306490384615385e-05, + "loss": 1.1162, + "step": 1079 + }, + { + "epoch": 0.6300131252734432, + "grad_norm": 1.0971933603286743, + "learning_rate": 3.5291466346153844e-05, + "loss": 0.9418, + "step": 1080 + }, + { + "epoch": 0.6305964707598075, + "grad_norm": 1.5346184968948364, + "learning_rate": 3.527644230769231e-05, + "loss": 1.3067, + "step": 1081 + }, + { + "epoch": 0.6311798162461718, + "grad_norm": 1.10757315158844, + "learning_rate": 3.5261418269230774e-05, + "loss": 0.9629, + "step": 1082 + }, + { + "epoch": 0.6317631617325361, + "grad_norm": 1.1153584718704224, + "learning_rate": 3.524639423076923e-05, + "loss": 0.9693, + "step": 1083 + }, + { + "epoch": 0.6323465072189004, + "grad_norm": 1.2248104810714722, + "learning_rate": 3.523137019230769e-05, + "loss": 0.9687, + "step": 1084 + }, + { + "epoch": 0.6329298527052647, + "grad_norm": 1.2040361166000366, + "learning_rate": 3.5216346153846157e-05, + "loss": 0.8451, + "step": 1085 + }, + { + "epoch": 0.633513198191629, + "grad_norm": 1.2189350128173828, + "learning_rate": 3.5201322115384615e-05, + "loss": 0.8775, + "step": 1086 + }, + { + "epoch": 0.6340965436779933, + "grad_norm": 1.453861117362976, + "learning_rate": 3.518629807692308e-05, + "loss": 0.9802, + "step": 1087 + }, + { + "epoch": 0.6346798891643576, + "grad_norm": 1.6878211498260498, + "learning_rate": 3.517127403846154e-05, + "loss": 0.8514, + "step": 1088 + }, + { + "epoch": 0.6352632346507219, + "grad_norm": 1.2242681980133057, + "learning_rate": 3.5156250000000004e-05, + "loss": 0.7271, + "step": 1089 + }, + { + "epoch": 0.6358465801370862, + "grad_norm": 1.1206653118133545, + "learning_rate": 3.514122596153846e-05, + "loss": 1.0562, + "step": 1090 + }, + { + "epoch": 0.6364299256234505, + "grad_norm": 1.2914477586746216, + "learning_rate": 3.512620192307692e-05, + "loss": 0.943, + "step": 1091 + }, + { + "epoch": 0.6370132711098148, + "grad_norm": 1.4054170846939087, + "learning_rate": 3.5111177884615386e-05, + "loss": 1.064, + "step": 1092 + }, + { + "epoch": 0.637596616596179, + "grad_norm": 1.1132476329803467, + "learning_rate": 3.5096153846153845e-05, + "loss": 1.0468, + "step": 1093 + }, + { + "epoch": 0.6381799620825434, + "grad_norm": 1.4797582626342773, + "learning_rate": 3.508112980769231e-05, + "loss": 1.0369, + "step": 1094 + }, + { + "epoch": 0.6387633075689076, + "grad_norm": 1.1891480684280396, + "learning_rate": 3.5066105769230775e-05, + "loss": 0.93, + "step": 1095 + }, + { + "epoch": 0.6393466530552719, + "grad_norm": 1.1823198795318604, + "learning_rate": 3.5051081730769234e-05, + "loss": 1.2398, + "step": 1096 + }, + { + "epoch": 0.6399299985416362, + "grad_norm": 1.0506694316864014, + "learning_rate": 3.503605769230769e-05, + "loss": 1.0028, + "step": 1097 + }, + { + "epoch": 0.6405133440280005, + "grad_norm": 1.1625219583511353, + "learning_rate": 3.502103365384616e-05, + "loss": 0.9668, + "step": 1098 + }, + { + "epoch": 0.6410966895143649, + "grad_norm": 1.3133008480072021, + "learning_rate": 3.5006009615384616e-05, + "loss": 0.8842, + "step": 1099 + }, + { + "epoch": 0.6416800350007292, + "grad_norm": 1.2925740480422974, + "learning_rate": 3.499098557692308e-05, + "loss": 0.9443, + "step": 1100 + }, + { + "epoch": 0.6422633804870935, + "grad_norm": 1.1056509017944336, + "learning_rate": 3.497596153846154e-05, + "loss": 1.0339, + "step": 1101 + }, + { + "epoch": 0.6428467259734578, + "grad_norm": 1.4267328977584839, + "learning_rate": 3.49609375e-05, + "loss": 0.9475, + "step": 1102 + }, + { + "epoch": 0.6434300714598221, + "grad_norm": 1.0847243070602417, + "learning_rate": 3.4945913461538464e-05, + "loss": 1.0789, + "step": 1103 + }, + { + "epoch": 0.6440134169461864, + "grad_norm": 1.2231626510620117, + "learning_rate": 3.493088942307692e-05, + "loss": 0.9817, + "step": 1104 + }, + { + "epoch": 0.6445967624325507, + "grad_norm": 1.1734437942504883, + "learning_rate": 3.491586538461539e-05, + "loss": 1.0806, + "step": 1105 + }, + { + "epoch": 0.645180107918915, + "grad_norm": 1.1729824542999268, + "learning_rate": 3.4900841346153846e-05, + "loss": 1.0979, + "step": 1106 + }, + { + "epoch": 0.6457634534052793, + "grad_norm": 1.1877245903015137, + "learning_rate": 3.488581730769231e-05, + "loss": 1.0407, + "step": 1107 + }, + { + "epoch": 0.6463467988916436, + "grad_norm": 1.0097910165786743, + "learning_rate": 3.487079326923077e-05, + "loss": 0.9112, + "step": 1108 + }, + { + "epoch": 0.6469301443780079, + "grad_norm": 1.1973975896835327, + "learning_rate": 3.485576923076923e-05, + "loss": 0.873, + "step": 1109 + }, + { + "epoch": 0.6475134898643722, + "grad_norm": 1.1800600290298462, + "learning_rate": 3.4840745192307694e-05, + "loss": 0.9863, + "step": 1110 + }, + { + "epoch": 0.6480968353507365, + "grad_norm": 1.5829706192016602, + "learning_rate": 3.482572115384616e-05, + "loss": 0.9469, + "step": 1111 + }, + { + "epoch": 0.6486801808371008, + "grad_norm": 1.5107544660568237, + "learning_rate": 3.481069711538462e-05, + "loss": 0.8981, + "step": 1112 + }, + { + "epoch": 0.6492635263234651, + "grad_norm": 1.1659681797027588, + "learning_rate": 3.479567307692308e-05, + "loss": 1.0151, + "step": 1113 + }, + { + "epoch": 0.6498468718098294, + "grad_norm": 1.2110625505447388, + "learning_rate": 3.478064903846154e-05, + "loss": 0.8399, + "step": 1114 + }, + { + "epoch": 0.6504302172961937, + "grad_norm": 1.2130272388458252, + "learning_rate": 3.4765625e-05, + "loss": 0.9638, + "step": 1115 + }, + { + "epoch": 0.651013562782558, + "grad_norm": 1.4531303644180298, + "learning_rate": 3.4750600961538465e-05, + "loss": 0.9451, + "step": 1116 + }, + { + "epoch": 0.6515969082689222, + "grad_norm": 1.2456012964248657, + "learning_rate": 3.4735576923076923e-05, + "loss": 0.9804, + "step": 1117 + }, + { + "epoch": 0.6521802537552865, + "grad_norm": 1.8596121072769165, + "learning_rate": 3.472055288461539e-05, + "loss": 1.0599, + "step": 1118 + }, + { + "epoch": 0.6527635992416508, + "grad_norm": 1.2399401664733887, + "learning_rate": 3.470552884615385e-05, + "loss": 0.978, + "step": 1119 + }, + { + "epoch": 0.6533469447280151, + "grad_norm": 1.25180983543396, + "learning_rate": 3.4690504807692306e-05, + "loss": 1.1725, + "step": 1120 + }, + { + "epoch": 0.6539302902143794, + "grad_norm": 1.2084770202636719, + "learning_rate": 3.467548076923077e-05, + "loss": 0.8519, + "step": 1121 + }, + { + "epoch": 0.6545136357007437, + "grad_norm": 1.1235297918319702, + "learning_rate": 3.466045673076923e-05, + "loss": 0.9979, + "step": 1122 + }, + { + "epoch": 0.655096981187108, + "grad_norm": 1.1955933570861816, + "learning_rate": 3.4645432692307695e-05, + "loss": 1.2237, + "step": 1123 + }, + { + "epoch": 0.6556803266734723, + "grad_norm": 1.144810676574707, + "learning_rate": 3.463040865384616e-05, + "loss": 0.7198, + "step": 1124 + }, + { + "epoch": 0.6562636721598366, + "grad_norm": 1.1226835250854492, + "learning_rate": 3.461538461538462e-05, + "loss": 1.0139, + "step": 1125 + }, + { + "epoch": 0.656847017646201, + "grad_norm": 1.0557211637496948, + "learning_rate": 3.460036057692308e-05, + "loss": 1.2123, + "step": 1126 + }, + { + "epoch": 0.6574303631325653, + "grad_norm": 1.051958441734314, + "learning_rate": 3.4585336538461536e-05, + "loss": 1.1768, + "step": 1127 + }, + { + "epoch": 0.6580137086189296, + "grad_norm": 1.0615653991699219, + "learning_rate": 3.45703125e-05, + "loss": 0.9798, + "step": 1128 + }, + { + "epoch": 0.6585970541052939, + "grad_norm": 1.022555947303772, + "learning_rate": 3.4555288461538466e-05, + "loss": 0.7892, + "step": 1129 + }, + { + "epoch": 0.6591803995916582, + "grad_norm": 1.3960226774215698, + "learning_rate": 3.4540264423076925e-05, + "loss": 0.922, + "step": 1130 + }, + { + "epoch": 0.6597637450780225, + "grad_norm": 1.0755395889282227, + "learning_rate": 3.452524038461539e-05, + "loss": 1.0775, + "step": 1131 + }, + { + "epoch": 0.6603470905643868, + "grad_norm": 1.075884222984314, + "learning_rate": 3.451021634615385e-05, + "loss": 1.149, + "step": 1132 + }, + { + "epoch": 0.6609304360507511, + "grad_norm": 1.4113826751708984, + "learning_rate": 3.449519230769231e-05, + "loss": 0.8611, + "step": 1133 + }, + { + "epoch": 0.6615137815371154, + "grad_norm": 1.0343314409255981, + "learning_rate": 3.448016826923077e-05, + "loss": 0.8762, + "step": 1134 + }, + { + "epoch": 0.6620971270234797, + "grad_norm": 1.0894453525543213, + "learning_rate": 3.446514423076923e-05, + "loss": 1.0281, + "step": 1135 + }, + { + "epoch": 0.662680472509844, + "grad_norm": 1.2878773212432861, + "learning_rate": 3.4450120192307696e-05, + "loss": 1.0719, + "step": 1136 + }, + { + "epoch": 0.6632638179962083, + "grad_norm": 1.1200342178344727, + "learning_rate": 3.443509615384616e-05, + "loss": 1.0834, + "step": 1137 + }, + { + "epoch": 0.6638471634825726, + "grad_norm": 1.2115342617034912, + "learning_rate": 3.442007211538461e-05, + "loss": 1.1008, + "step": 1138 + }, + { + "epoch": 0.6644305089689369, + "grad_norm": 1.214706301689148, + "learning_rate": 3.440504807692308e-05, + "loss": 1.2134, + "step": 1139 + }, + { + "epoch": 0.6650138544553011, + "grad_norm": 1.3579165935516357, + "learning_rate": 3.439002403846154e-05, + "loss": 0.8706, + "step": 1140 + }, + { + "epoch": 0.6655971999416654, + "grad_norm": 1.3200846910476685, + "learning_rate": 3.4375e-05, + "loss": 0.9311, + "step": 1141 + }, + { + "epoch": 0.6661805454280297, + "grad_norm": 1.4525578022003174, + "learning_rate": 3.435997596153847e-05, + "loss": 1.0345, + "step": 1142 + }, + { + "epoch": 0.666763890914394, + "grad_norm": 1.2601604461669922, + "learning_rate": 3.4344951923076926e-05, + "loss": 1.1084, + "step": 1143 + }, + { + "epoch": 0.6673472364007583, + "grad_norm": 1.278788447380066, + "learning_rate": 3.4329927884615384e-05, + "loss": 0.9541, + "step": 1144 + }, + { + "epoch": 0.6679305818871226, + "grad_norm": 0.9493159651756287, + "learning_rate": 3.431490384615384e-05, + "loss": 0.9375, + "step": 1145 + }, + { + "epoch": 0.6685139273734869, + "grad_norm": 1.0539159774780273, + "learning_rate": 3.429987980769231e-05, + "loss": 0.9884, + "step": 1146 + }, + { + "epoch": 0.6690972728598512, + "grad_norm": 1.1678050756454468, + "learning_rate": 3.4284855769230773e-05, + "loss": 1.0402, + "step": 1147 + }, + { + "epoch": 0.6696806183462155, + "grad_norm": 1.2369019985198975, + "learning_rate": 3.426983173076923e-05, + "loss": 0.9254, + "step": 1148 + }, + { + "epoch": 0.6702639638325798, + "grad_norm": 1.539076566696167, + "learning_rate": 3.42548076923077e-05, + "loss": 1.1155, + "step": 1149 + }, + { + "epoch": 0.6708473093189441, + "grad_norm": 1.074963092803955, + "learning_rate": 3.4239783653846156e-05, + "loss": 1.0213, + "step": 1150 + }, + { + "epoch": 0.6714306548053084, + "grad_norm": 1.2291454076766968, + "learning_rate": 3.4224759615384614e-05, + "loss": 0.898, + "step": 1151 + }, + { + "epoch": 0.6720140002916728, + "grad_norm": 1.1478317975997925, + "learning_rate": 3.420973557692308e-05, + "loss": 0.8589, + "step": 1152 + }, + { + "epoch": 0.6725973457780371, + "grad_norm": 1.096078872680664, + "learning_rate": 3.419471153846154e-05, + "loss": 0.8751, + "step": 1153 + }, + { + "epoch": 0.6731806912644014, + "grad_norm": 1.4119384288787842, + "learning_rate": 3.41796875e-05, + "loss": 1.0693, + "step": 1154 + }, + { + "epoch": 0.6737640367507657, + "grad_norm": 1.2378814220428467, + "learning_rate": 3.416466346153847e-05, + "loss": 1.2076, + "step": 1155 + }, + { + "epoch": 0.67434738223713, + "grad_norm": 1.2882436513900757, + "learning_rate": 3.414963942307692e-05, + "loss": 1.1177, + "step": 1156 + }, + { + "epoch": 0.6749307277234943, + "grad_norm": 1.1691817045211792, + "learning_rate": 3.4134615384615386e-05, + "loss": 1.0826, + "step": 1157 + }, + { + "epoch": 0.6755140732098586, + "grad_norm": 1.168468713760376, + "learning_rate": 3.4119591346153844e-05, + "loss": 0.9258, + "step": 1158 + }, + { + "epoch": 0.6760974186962229, + "grad_norm": 1.3444772958755493, + "learning_rate": 3.410456730769231e-05, + "loss": 1.1413, + "step": 1159 + }, + { + "epoch": 0.6766807641825872, + "grad_norm": 1.2808760404586792, + "learning_rate": 3.4089543269230775e-05, + "loss": 0.8778, + "step": 1160 + }, + { + "epoch": 0.6772641096689515, + "grad_norm": 1.1354130506515503, + "learning_rate": 3.407451923076923e-05, + "loss": 0.8865, + "step": 1161 + }, + { + "epoch": 0.6778474551553157, + "grad_norm": 1.0770645141601562, + "learning_rate": 3.405949519230769e-05, + "loss": 1.0861, + "step": 1162 + }, + { + "epoch": 0.67843080064168, + "grad_norm": 1.1578465700149536, + "learning_rate": 3.404447115384616e-05, + "loss": 1.0487, + "step": 1163 + }, + { + "epoch": 0.6790141461280443, + "grad_norm": 1.0803139209747314, + "learning_rate": 3.4029447115384615e-05, + "loss": 0.8303, + "step": 1164 + }, + { + "epoch": 0.6795974916144086, + "grad_norm": 1.0990511178970337, + "learning_rate": 3.401442307692308e-05, + "loss": 1.1653, + "step": 1165 + }, + { + "epoch": 0.6801808371007729, + "grad_norm": 0.9813050627708435, + "learning_rate": 3.399939903846154e-05, + "loss": 0.9875, + "step": 1166 + }, + { + "epoch": 0.6807641825871372, + "grad_norm": 1.0541377067565918, + "learning_rate": 3.3984375000000004e-05, + "loss": 0.9924, + "step": 1167 + }, + { + "epoch": 0.6813475280735015, + "grad_norm": 1.2727155685424805, + "learning_rate": 3.396935096153846e-05, + "loss": 0.7893, + "step": 1168 + }, + { + "epoch": 0.6819308735598658, + "grad_norm": 1.1019082069396973, + "learning_rate": 3.395432692307692e-05, + "loss": 1.2714, + "step": 1169 + }, + { + "epoch": 0.6825142190462301, + "grad_norm": 0.9809292554855347, + "learning_rate": 3.393930288461539e-05, + "loss": 0.8502, + "step": 1170 + }, + { + "epoch": 0.6830975645325944, + "grad_norm": 0.8613129258155823, + "learning_rate": 3.3924278846153845e-05, + "loss": 0.8276, + "step": 1171 + }, + { + "epoch": 0.6836809100189587, + "grad_norm": 1.049072265625, + "learning_rate": 3.390925480769231e-05, + "loss": 1.0627, + "step": 1172 + }, + { + "epoch": 0.684264255505323, + "grad_norm": 1.205977201461792, + "learning_rate": 3.3894230769230776e-05, + "loss": 0.9767, + "step": 1173 + }, + { + "epoch": 0.6848476009916873, + "grad_norm": 1.2076629400253296, + "learning_rate": 3.387920673076923e-05, + "loss": 0.8472, + "step": 1174 + }, + { + "epoch": 0.6854309464780516, + "grad_norm": 1.244746208190918, + "learning_rate": 3.386418269230769e-05, + "loss": 1.0404, + "step": 1175 + }, + { + "epoch": 0.6860142919644159, + "grad_norm": 1.0318353176116943, + "learning_rate": 3.384915865384616e-05, + "loss": 1.2889, + "step": 1176 + }, + { + "epoch": 0.6865976374507802, + "grad_norm": 1.1536865234375, + "learning_rate": 3.3834134615384617e-05, + "loss": 1.0932, + "step": 1177 + }, + { + "epoch": 0.6871809829371445, + "grad_norm": 1.0421112775802612, + "learning_rate": 3.381911057692308e-05, + "loss": 1.1513, + "step": 1178 + }, + { + "epoch": 0.6877643284235089, + "grad_norm": 1.2524076700210571, + "learning_rate": 3.380408653846154e-05, + "loss": 1.0571, + "step": 1179 + }, + { + "epoch": 0.6883476739098732, + "grad_norm": 1.3088963031768799, + "learning_rate": 3.37890625e-05, + "loss": 1.0539, + "step": 1180 + }, + { + "epoch": 0.6889310193962375, + "grad_norm": 1.4658859968185425, + "learning_rate": 3.3774038461538464e-05, + "loss": 0.9256, + "step": 1181 + }, + { + "epoch": 0.6895143648826018, + "grad_norm": 1.3050382137298584, + "learning_rate": 3.375901442307692e-05, + "loss": 1.1349, + "step": 1182 + }, + { + "epoch": 0.690097710368966, + "grad_norm": 1.318977952003479, + "learning_rate": 3.374399038461539e-05, + "loss": 1.0711, + "step": 1183 + }, + { + "epoch": 0.6906810558553304, + "grad_norm": 1.2855241298675537, + "learning_rate": 3.3728966346153846e-05, + "loss": 0.9612, + "step": 1184 + }, + { + "epoch": 0.6912644013416946, + "grad_norm": 1.3077634572982788, + "learning_rate": 3.371394230769231e-05, + "loss": 1.0041, + "step": 1185 + }, + { + "epoch": 0.6918477468280589, + "grad_norm": 1.1256729364395142, + "learning_rate": 3.369891826923077e-05, + "loss": 1.0035, + "step": 1186 + }, + { + "epoch": 0.6924310923144232, + "grad_norm": 1.3386636972427368, + "learning_rate": 3.368389423076923e-05, + "loss": 0.9681, + "step": 1187 + }, + { + "epoch": 0.6930144378007875, + "grad_norm": 1.4480712413787842, + "learning_rate": 3.3668870192307694e-05, + "loss": 1.1093, + "step": 1188 + }, + { + "epoch": 0.6935977832871518, + "grad_norm": 1.1406118869781494, + "learning_rate": 3.365384615384616e-05, + "loss": 1.0623, + "step": 1189 + }, + { + "epoch": 0.6941811287735161, + "grad_norm": 1.1809027194976807, + "learning_rate": 3.363882211538462e-05, + "loss": 0.9942, + "step": 1190 + }, + { + "epoch": 0.6947644742598804, + "grad_norm": 1.3855853080749512, + "learning_rate": 3.362379807692308e-05, + "loss": 0.9911, + "step": 1191 + }, + { + "epoch": 0.6953478197462447, + "grad_norm": 1.2072291374206543, + "learning_rate": 3.3608774038461535e-05, + "loss": 0.9015, + "step": 1192 + }, + { + "epoch": 0.695931165232609, + "grad_norm": 1.3127961158752441, + "learning_rate": 3.359375e-05, + "loss": 0.8459, + "step": 1193 + }, + { + "epoch": 0.6965145107189733, + "grad_norm": 1.1735903024673462, + "learning_rate": 3.3578725961538465e-05, + "loss": 0.9313, + "step": 1194 + }, + { + "epoch": 0.6970978562053376, + "grad_norm": 1.1430635452270508, + "learning_rate": 3.3563701923076924e-05, + "loss": 1.1029, + "step": 1195 + }, + { + "epoch": 0.6976812016917019, + "grad_norm": 1.0707919597625732, + "learning_rate": 3.354867788461539e-05, + "loss": 1.0674, + "step": 1196 + }, + { + "epoch": 0.6982645471780662, + "grad_norm": 1.1463159322738647, + "learning_rate": 3.353365384615385e-05, + "loss": 1.0293, + "step": 1197 + }, + { + "epoch": 0.6988478926644305, + "grad_norm": 0.9569932818412781, + "learning_rate": 3.3518629807692306e-05, + "loss": 0.726, + "step": 1198 + }, + { + "epoch": 0.6994312381507948, + "grad_norm": 1.4707874059677124, + "learning_rate": 3.350360576923077e-05, + "loss": 0.9595, + "step": 1199 + }, + { + "epoch": 0.7000145836371591, + "grad_norm": 1.0669324398040771, + "learning_rate": 3.348858173076923e-05, + "loss": 1.1805, + "step": 1200 + }, + { + "epoch": 0.7000145836371591, + "eval_loss_squad": 0.8398913412541151, + "eval_perplexity": 8.243036213620654, + "eval_perplexity_reconstruct": 1.9400207914704957, + "step": 1200 + }, + { + "epoch": 0.7005979291235234, + "grad_norm": 1.3081518411636353, + "learning_rate": 3.3473557692307695e-05, + "loss": 1.2301, + "step": 1201 + }, + { + "epoch": 0.7011812746098877, + "grad_norm": 1.2794771194458008, + "learning_rate": 3.345853365384616e-05, + "loss": 0.7746, + "step": 1202 + }, + { + "epoch": 0.701764620096252, + "grad_norm": 1.2433689832687378, + "learning_rate": 3.344350961538462e-05, + "loss": 0.9256, + "step": 1203 + }, + { + "epoch": 0.7023479655826163, + "grad_norm": 1.1298249959945679, + "learning_rate": 3.342848557692308e-05, + "loss": 1.0316, + "step": 1204 + }, + { + "epoch": 0.7029313110689807, + "grad_norm": 1.1544654369354248, + "learning_rate": 3.3413461538461536e-05, + "loss": 0.9899, + "step": 1205 + }, + { + "epoch": 0.703514656555345, + "grad_norm": 1.4191893339157104, + "learning_rate": 3.33984375e-05, + "loss": 0.8943, + "step": 1206 + }, + { + "epoch": 0.7040980020417092, + "grad_norm": 1.1830098628997803, + "learning_rate": 3.3383413461538466e-05, + "loss": 0.9173, + "step": 1207 + }, + { + "epoch": 0.7046813475280735, + "grad_norm": 1.1737726926803589, + "learning_rate": 3.3368389423076925e-05, + "loss": 0.9999, + "step": 1208 + }, + { + "epoch": 0.7052646930144378, + "grad_norm": 1.3882325887680054, + "learning_rate": 3.335336538461539e-05, + "loss": 1.0172, + "step": 1209 + }, + { + "epoch": 0.7058480385008021, + "grad_norm": 1.1368088722229004, + "learning_rate": 3.333834134615384e-05, + "loss": 0.9639, + "step": 1210 + }, + { + "epoch": 0.7064313839871664, + "grad_norm": 1.0348821878433228, + "learning_rate": 3.332331730769231e-05, + "loss": 1.0874, + "step": 1211 + }, + { + "epoch": 0.7070147294735307, + "grad_norm": 1.3544259071350098, + "learning_rate": 3.330829326923077e-05, + "loss": 1.0331, + "step": 1212 + }, + { + "epoch": 0.707598074959895, + "grad_norm": 0.9908936619758606, + "learning_rate": 3.329326923076923e-05, + "loss": 0.7225, + "step": 1213 + }, + { + "epoch": 0.7081814204462593, + "grad_norm": 1.2547880411148071, + "learning_rate": 3.3278245192307696e-05, + "loss": 1.0878, + "step": 1214 + }, + { + "epoch": 0.7087647659326236, + "grad_norm": 1.617482304573059, + "learning_rate": 3.3263221153846155e-05, + "loss": 1.0804, + "step": 1215 + }, + { + "epoch": 0.7093481114189879, + "grad_norm": 1.039589285850525, + "learning_rate": 3.324819711538461e-05, + "loss": 0.9664, + "step": 1216 + }, + { + "epoch": 0.7099314569053522, + "grad_norm": 1.2409045696258545, + "learning_rate": 3.323317307692308e-05, + "loss": 0.9162, + "step": 1217 + }, + { + "epoch": 0.7105148023917165, + "grad_norm": 1.286959171295166, + "learning_rate": 3.321814903846154e-05, + "loss": 1.032, + "step": 1218 + }, + { + "epoch": 0.7110981478780808, + "grad_norm": 1.3693422079086304, + "learning_rate": 3.3203125e-05, + "loss": 1.0232, + "step": 1219 + }, + { + "epoch": 0.7116814933644451, + "grad_norm": 1.3210753202438354, + "learning_rate": 3.318810096153847e-05, + "loss": 0.9577, + "step": 1220 + }, + { + "epoch": 0.7122648388508094, + "grad_norm": 1.1538783311843872, + "learning_rate": 3.3173076923076926e-05, + "loss": 1.0804, + "step": 1221 + }, + { + "epoch": 0.7128481843371737, + "grad_norm": 1.1558293104171753, + "learning_rate": 3.3158052884615385e-05, + "loss": 1.1509, + "step": 1222 + }, + { + "epoch": 0.713431529823538, + "grad_norm": 0.9820008277893066, + "learning_rate": 3.314302884615384e-05, + "loss": 0.814, + "step": 1223 + }, + { + "epoch": 0.7140148753099023, + "grad_norm": 0.9828884601593018, + "learning_rate": 3.312800480769231e-05, + "loss": 0.8456, + "step": 1224 + }, + { + "epoch": 0.7145982207962666, + "grad_norm": 1.3056386709213257, + "learning_rate": 3.3112980769230774e-05, + "loss": 1.0279, + "step": 1225 + }, + { + "epoch": 0.7151815662826309, + "grad_norm": 1.1356948614120483, + "learning_rate": 3.309795673076923e-05, + "loss": 1.0234, + "step": 1226 + }, + { + "epoch": 0.7157649117689951, + "grad_norm": 1.2792552709579468, + "learning_rate": 3.30829326923077e-05, + "loss": 1.1383, + "step": 1227 + }, + { + "epoch": 0.7163482572553594, + "grad_norm": 1.0980956554412842, + "learning_rate": 3.3067908653846156e-05, + "loss": 1.0534, + "step": 1228 + }, + { + "epoch": 0.7169316027417237, + "grad_norm": 1.2338215112686157, + "learning_rate": 3.3052884615384615e-05, + "loss": 0.9844, + "step": 1229 + }, + { + "epoch": 0.717514948228088, + "grad_norm": 1.4420562982559204, + "learning_rate": 3.303786057692308e-05, + "loss": 0.8789, + "step": 1230 + }, + { + "epoch": 0.7180982937144523, + "grad_norm": 1.26366126537323, + "learning_rate": 3.302283653846154e-05, + "loss": 0.8189, + "step": 1231 + }, + { + "epoch": 0.7186816392008167, + "grad_norm": 1.1623914241790771, + "learning_rate": 3.3007812500000004e-05, + "loss": 0.9789, + "step": 1232 + }, + { + "epoch": 0.719264984687181, + "grad_norm": 1.0107698440551758, + "learning_rate": 3.299278846153846e-05, + "loss": 0.7271, + "step": 1233 + }, + { + "epoch": 0.7198483301735453, + "grad_norm": 1.185608148574829, + "learning_rate": 3.297776442307692e-05, + "loss": 1.1248, + "step": 1234 + }, + { + "epoch": 0.7204316756599096, + "grad_norm": 1.077970027923584, + "learning_rate": 3.2962740384615386e-05, + "loss": 1.1963, + "step": 1235 + }, + { + "epoch": 0.7210150211462739, + "grad_norm": 1.684244990348816, + "learning_rate": 3.2947716346153844e-05, + "loss": 0.8828, + "step": 1236 + }, + { + "epoch": 0.7215983666326382, + "grad_norm": 1.24001944065094, + "learning_rate": 3.293269230769231e-05, + "loss": 0.8627, + "step": 1237 + }, + { + "epoch": 0.7221817121190025, + "grad_norm": 1.0665417909622192, + "learning_rate": 3.2917668269230775e-05, + "loss": 0.7553, + "step": 1238 + }, + { + "epoch": 0.7227650576053668, + "grad_norm": 1.1986167430877686, + "learning_rate": 3.2902644230769233e-05, + "loss": 0.7785, + "step": 1239 + }, + { + "epoch": 0.7233484030917311, + "grad_norm": 1.313407301902771, + "learning_rate": 3.288762019230769e-05, + "loss": 0.9125, + "step": 1240 + }, + { + "epoch": 0.7239317485780954, + "grad_norm": 1.1136894226074219, + "learning_rate": 3.287259615384616e-05, + "loss": 0.975, + "step": 1241 + }, + { + "epoch": 0.7245150940644597, + "grad_norm": 1.2097238302230835, + "learning_rate": 3.2857572115384616e-05, + "loss": 0.8688, + "step": 1242 + }, + { + "epoch": 0.725098439550824, + "grad_norm": 1.280327320098877, + "learning_rate": 3.284254807692308e-05, + "loss": 0.9758, + "step": 1243 + }, + { + "epoch": 0.7256817850371883, + "grad_norm": 1.2413567304611206, + "learning_rate": 3.282752403846154e-05, + "loss": 0.9494, + "step": 1244 + }, + { + "epoch": 0.7262651305235526, + "grad_norm": 1.1193758249282837, + "learning_rate": 3.2812500000000005e-05, + "loss": 1.0908, + "step": 1245 + }, + { + "epoch": 0.7268484760099169, + "grad_norm": 1.1849331855773926, + "learning_rate": 3.279747596153846e-05, + "loss": 1.0299, + "step": 1246 + }, + { + "epoch": 0.7274318214962812, + "grad_norm": 2.731739044189453, + "learning_rate": 3.278245192307692e-05, + "loss": 0.8046, + "step": 1247 + }, + { + "epoch": 0.7280151669826455, + "grad_norm": 1.145367980003357, + "learning_rate": 3.276742788461539e-05, + "loss": 0.9687, + "step": 1248 + }, + { + "epoch": 0.7285985124690098, + "grad_norm": 1.2098665237426758, + "learning_rate": 3.2752403846153846e-05, + "loss": 0.8909, + "step": 1249 + }, + { + "epoch": 0.729181857955374, + "grad_norm": 1.4284601211547852, + "learning_rate": 3.273737980769231e-05, + "loss": 0.8545, + "step": 1250 + }, + { + "epoch": 0.7297652034417383, + "grad_norm": 1.4550679922103882, + "learning_rate": 3.272235576923077e-05, + "loss": 0.9656, + "step": 1251 + }, + { + "epoch": 0.7303485489281026, + "grad_norm": 1.2722722291946411, + "learning_rate": 3.270733173076923e-05, + "loss": 0.8022, + "step": 1252 + }, + { + "epoch": 0.7309318944144669, + "grad_norm": 1.3001720905303955, + "learning_rate": 3.269230769230769e-05, + "loss": 0.9008, + "step": 1253 + }, + { + "epoch": 0.7315152399008312, + "grad_norm": 1.610422968864441, + "learning_rate": 3.267728365384616e-05, + "loss": 0.7881, + "step": 1254 + }, + { + "epoch": 0.7320985853871955, + "grad_norm": 1.0116015672683716, + "learning_rate": 3.266225961538462e-05, + "loss": 0.7952, + "step": 1255 + }, + { + "epoch": 0.7326819308735598, + "grad_norm": 1.4856303930282593, + "learning_rate": 3.264723557692308e-05, + "loss": 1.0552, + "step": 1256 + }, + { + "epoch": 0.7332652763599241, + "grad_norm": 1.7719351053237915, + "learning_rate": 3.263221153846154e-05, + "loss": 1.058, + "step": 1257 + }, + { + "epoch": 0.7338486218462885, + "grad_norm": 1.1480412483215332, + "learning_rate": 3.26171875e-05, + "loss": 0.8906, + "step": 1258 + }, + { + "epoch": 0.7344319673326528, + "grad_norm": 1.2761352062225342, + "learning_rate": 3.2602163461538464e-05, + "loss": 0.8908, + "step": 1259 + }, + { + "epoch": 0.7350153128190171, + "grad_norm": 1.1891727447509766, + "learning_rate": 3.258713942307692e-05, + "loss": 0.9932, + "step": 1260 + }, + { + "epoch": 0.7355986583053814, + "grad_norm": 1.0514845848083496, + "learning_rate": 3.257211538461539e-05, + "loss": 1.1528, + "step": 1261 + }, + { + "epoch": 0.7361820037917457, + "grad_norm": 1.4285988807678223, + "learning_rate": 3.255709134615385e-05, + "loss": 0.8171, + "step": 1262 + }, + { + "epoch": 0.73676534927811, + "grad_norm": 1.2109655141830444, + "learning_rate": 3.254206730769231e-05, + "loss": 1.1154, + "step": 1263 + }, + { + "epoch": 0.7373486947644743, + "grad_norm": 1.417160153388977, + "learning_rate": 3.252704326923077e-05, + "loss": 1.0018, + "step": 1264 + }, + { + "epoch": 0.7379320402508386, + "grad_norm": 1.045836091041565, + "learning_rate": 3.251201923076923e-05, + "loss": 0.9084, + "step": 1265 + }, + { + "epoch": 0.7385153857372029, + "grad_norm": 1.0985413789749146, + "learning_rate": 3.2496995192307694e-05, + "loss": 0.7832, + "step": 1266 + }, + { + "epoch": 0.7390987312235672, + "grad_norm": 1.1846632957458496, + "learning_rate": 3.248197115384616e-05, + "loss": 0.9933, + "step": 1267 + }, + { + "epoch": 0.7396820767099315, + "grad_norm": 1.1051980257034302, + "learning_rate": 3.246694711538462e-05, + "loss": 1.2966, + "step": 1268 + }, + { + "epoch": 0.7402654221962958, + "grad_norm": 1.1988706588745117, + "learning_rate": 3.2451923076923077e-05, + "loss": 1.052, + "step": 1269 + }, + { + "epoch": 0.7408487676826601, + "grad_norm": 1.1623855829238892, + "learning_rate": 3.2436899038461535e-05, + "loss": 0.9502, + "step": 1270 + }, + { + "epoch": 0.7414321131690244, + "grad_norm": 1.6816089153289795, + "learning_rate": 3.2421875e-05, + "loss": 1.1816, + "step": 1271 + }, + { + "epoch": 0.7420154586553886, + "grad_norm": 1.2776967287063599, + "learning_rate": 3.2406850961538466e-05, + "loss": 1.0167, + "step": 1272 + }, + { + "epoch": 0.7425988041417529, + "grad_norm": 1.7386460304260254, + "learning_rate": 3.2391826923076924e-05, + "loss": 1.0841, + "step": 1273 + }, + { + "epoch": 0.7431821496281172, + "grad_norm": 1.1177300214767456, + "learning_rate": 3.237680288461539e-05, + "loss": 0.8789, + "step": 1274 + }, + { + "epoch": 0.7437654951144815, + "grad_norm": 1.161293864250183, + "learning_rate": 3.236177884615385e-05, + "loss": 0.9454, + "step": 1275 + }, + { + "epoch": 0.7443488406008458, + "grad_norm": 1.609604001045227, + "learning_rate": 3.2346754807692306e-05, + "loss": 0.9307, + "step": 1276 + }, + { + "epoch": 0.7449321860872101, + "grad_norm": 0.9415054321289062, + "learning_rate": 3.233173076923077e-05, + "loss": 0.9462, + "step": 1277 + }, + { + "epoch": 0.7455155315735744, + "grad_norm": 1.4762784242630005, + "learning_rate": 3.231670673076923e-05, + "loss": 1.0188, + "step": 1278 + }, + { + "epoch": 0.7460988770599387, + "grad_norm": 1.2860867977142334, + "learning_rate": 3.2301682692307695e-05, + "loss": 0.8886, + "step": 1279 + }, + { + "epoch": 0.746682222546303, + "grad_norm": 1.2090308666229248, + "learning_rate": 3.228665865384616e-05, + "loss": 1.0265, + "step": 1280 + }, + { + "epoch": 0.7472655680326673, + "grad_norm": 1.0692979097366333, + "learning_rate": 3.227163461538462e-05, + "loss": 1.014, + "step": 1281 + }, + { + "epoch": 0.7478489135190316, + "grad_norm": 1.2457678318023682, + "learning_rate": 3.225661057692308e-05, + "loss": 1.0442, + "step": 1282 + }, + { + "epoch": 0.7484322590053959, + "grad_norm": 1.1981621980667114, + "learning_rate": 3.2241586538461536e-05, + "loss": 0.924, + "step": 1283 + }, + { + "epoch": 0.7490156044917602, + "grad_norm": 1.448926568031311, + "learning_rate": 3.22265625e-05, + "loss": 1.2461, + "step": 1284 + }, + { + "epoch": 0.7495989499781246, + "grad_norm": 1.2373019456863403, + "learning_rate": 3.221153846153847e-05, + "loss": 1.1221, + "step": 1285 + }, + { + "epoch": 0.7501822954644889, + "grad_norm": 1.135291337966919, + "learning_rate": 3.2196514423076925e-05, + "loss": 0.9693, + "step": 1286 + }, + { + "epoch": 0.7507656409508532, + "grad_norm": 1.3530988693237305, + "learning_rate": 3.2181490384615384e-05, + "loss": 0.9934, + "step": 1287 + }, + { + "epoch": 0.7513489864372175, + "grad_norm": 1.216654658317566, + "learning_rate": 3.216646634615384e-05, + "loss": 1.1738, + "step": 1288 + }, + { + "epoch": 0.7519323319235818, + "grad_norm": 1.1719613075256348, + "learning_rate": 3.215144230769231e-05, + "loss": 0.8775, + "step": 1289 + }, + { + "epoch": 0.7525156774099461, + "grad_norm": 1.223272442817688, + "learning_rate": 3.213641826923077e-05, + "loss": 1.0273, + "step": 1290 + }, + { + "epoch": 0.7530990228963104, + "grad_norm": 1.4900156259536743, + "learning_rate": 3.212139423076923e-05, + "loss": 1.0808, + "step": 1291 + }, + { + "epoch": 0.7536823683826747, + "grad_norm": 1.1983774900436401, + "learning_rate": 3.21063701923077e-05, + "loss": 1.1111, + "step": 1292 + }, + { + "epoch": 0.754265713869039, + "grad_norm": 1.4071288108825684, + "learning_rate": 3.2091346153846155e-05, + "loss": 0.8877, + "step": 1293 + }, + { + "epoch": 0.7548490593554033, + "grad_norm": 1.1557461023330688, + "learning_rate": 3.2076322115384614e-05, + "loss": 0.8536, + "step": 1294 + }, + { + "epoch": 0.7554324048417675, + "grad_norm": 1.1718254089355469, + "learning_rate": 3.206129807692308e-05, + "loss": 0.8273, + "step": 1295 + }, + { + "epoch": 0.7560157503281318, + "grad_norm": 1.2104593515396118, + "learning_rate": 3.204627403846154e-05, + "loss": 1.2501, + "step": 1296 + }, + { + "epoch": 0.7565990958144961, + "grad_norm": 2.3339264392852783, + "learning_rate": 3.203125e-05, + "loss": 0.8378, + "step": 1297 + }, + { + "epoch": 0.7571824413008604, + "grad_norm": 1.0478349924087524, + "learning_rate": 3.201622596153847e-05, + "loss": 1.0836, + "step": 1298 + }, + { + "epoch": 0.7577657867872247, + "grad_norm": 1.2753820419311523, + "learning_rate": 3.2001201923076926e-05, + "loss": 1.1445, + "step": 1299 + }, + { + "epoch": 0.758349132273589, + "grad_norm": 1.194672703742981, + "learning_rate": 3.1986177884615385e-05, + "loss": 1.0424, + "step": 1300 + }, + { + "epoch": 0.7589324777599533, + "grad_norm": 1.2020694017410278, + "learning_rate": 3.1971153846153843e-05, + "loss": 1.0068, + "step": 1301 + }, + { + "epoch": 0.7595158232463176, + "grad_norm": 2.032259941101074, + "learning_rate": 3.195612980769231e-05, + "loss": 0.946, + "step": 1302 + }, + { + "epoch": 0.7600991687326819, + "grad_norm": 1.2049493789672852, + "learning_rate": 3.1941105769230774e-05, + "loss": 1.053, + "step": 1303 + }, + { + "epoch": 0.7606825142190462, + "grad_norm": 1.1551873683929443, + "learning_rate": 3.192608173076923e-05, + "loss": 1.0539, + "step": 1304 + }, + { + "epoch": 0.7612658597054105, + "grad_norm": 1.0860520601272583, + "learning_rate": 3.191105769230769e-05, + "loss": 1.1625, + "step": 1305 + }, + { + "epoch": 0.7618492051917748, + "grad_norm": 1.1426883935928345, + "learning_rate": 3.1896033653846156e-05, + "loss": 0.8321, + "step": 1306 + }, + { + "epoch": 0.7624325506781391, + "grad_norm": 1.2449581623077393, + "learning_rate": 3.1881009615384615e-05, + "loss": 1.0748, + "step": 1307 + }, + { + "epoch": 0.7630158961645034, + "grad_norm": 1.2475258111953735, + "learning_rate": 3.186598557692308e-05, + "loss": 0.9779, + "step": 1308 + }, + { + "epoch": 0.7635992416508677, + "grad_norm": 1.0726374387741089, + "learning_rate": 3.185096153846154e-05, + "loss": 0.9079, + "step": 1309 + }, + { + "epoch": 0.764182587137232, + "grad_norm": 1.169661521911621, + "learning_rate": 3.1835937500000004e-05, + "loss": 1.1156, + "step": 1310 + }, + { + "epoch": 0.7647659326235964, + "grad_norm": 1.1962292194366455, + "learning_rate": 3.182091346153846e-05, + "loss": 0.9258, + "step": 1311 + }, + { + "epoch": 0.7653492781099607, + "grad_norm": 1.038885235786438, + "learning_rate": 3.180588942307692e-05, + "loss": 1.0377, + "step": 1312 + }, + { + "epoch": 0.765932623596325, + "grad_norm": 1.4127665758132935, + "learning_rate": 3.1790865384615386e-05, + "loss": 0.982, + "step": 1313 + }, + { + "epoch": 0.7665159690826893, + "grad_norm": 1.1078689098358154, + "learning_rate": 3.1775841346153845e-05, + "loss": 1.0159, + "step": 1314 + }, + { + "epoch": 0.7670993145690536, + "grad_norm": 1.047503113746643, + "learning_rate": 3.176081730769231e-05, + "loss": 0.7841, + "step": 1315 + }, + { + "epoch": 0.7676826600554179, + "grad_norm": 1.2618082761764526, + "learning_rate": 3.1745793269230775e-05, + "loss": 1.0228, + "step": 1316 + }, + { + "epoch": 0.7682660055417821, + "grad_norm": 2.2956039905548096, + "learning_rate": 3.1730769230769234e-05, + "loss": 0.8457, + "step": 1317 + }, + { + "epoch": 0.7688493510281464, + "grad_norm": 1.1614506244659424, + "learning_rate": 3.171574519230769e-05, + "loss": 1.0914, + "step": 1318 + }, + { + "epoch": 0.7694326965145107, + "grad_norm": 1.0440285205841064, + "learning_rate": 3.170072115384616e-05, + "loss": 0.9781, + "step": 1319 + }, + { + "epoch": 0.770016042000875, + "grad_norm": 1.2385691404342651, + "learning_rate": 3.1685697115384616e-05, + "loss": 0.904, + "step": 1320 + }, + { + "epoch": 0.7705993874872393, + "grad_norm": 1.0237793922424316, + "learning_rate": 3.167067307692308e-05, + "loss": 0.8984, + "step": 1321 + }, + { + "epoch": 0.7711827329736036, + "grad_norm": 1.4518593549728394, + "learning_rate": 3.165564903846154e-05, + "loss": 1.1862, + "step": 1322 + }, + { + "epoch": 0.7717660784599679, + "grad_norm": 1.197102427482605, + "learning_rate": 3.1640625e-05, + "loss": 1.0267, + "step": 1323 + }, + { + "epoch": 0.7723494239463322, + "grad_norm": 1.3080718517303467, + "learning_rate": 3.1625600961538464e-05, + "loss": 1.0663, + "step": 1324 + }, + { + "epoch": 0.7729327694326965, + "grad_norm": 0.9802163243293762, + "learning_rate": 3.161057692307692e-05, + "loss": 1.0571, + "step": 1325 + }, + { + "epoch": 0.7735161149190608, + "grad_norm": 1.969308853149414, + "learning_rate": 3.159555288461539e-05, + "loss": 1.0338, + "step": 1326 + }, + { + "epoch": 0.7740994604054251, + "grad_norm": 1.0249823331832886, + "learning_rate": 3.1580528846153846e-05, + "loss": 0.9921, + "step": 1327 + }, + { + "epoch": 0.7746828058917894, + "grad_norm": 1.099127173423767, + "learning_rate": 3.156550480769231e-05, + "loss": 0.9504, + "step": 1328 + }, + { + "epoch": 0.7752661513781537, + "grad_norm": 1.2968648672103882, + "learning_rate": 3.155048076923077e-05, + "loss": 0.9594, + "step": 1329 + }, + { + "epoch": 0.775849496864518, + "grad_norm": 1.0648295879364014, + "learning_rate": 3.153545673076923e-05, + "loss": 0.8687, + "step": 1330 + }, + { + "epoch": 0.7764328423508823, + "grad_norm": 1.802680253982544, + "learning_rate": 3.1520432692307693e-05, + "loss": 1.0598, + "step": 1331 + }, + { + "epoch": 0.7770161878372466, + "grad_norm": 1.080910563468933, + "learning_rate": 3.150540865384616e-05, + "loss": 0.9963, + "step": 1332 + }, + { + "epoch": 0.7775995333236109, + "grad_norm": 1.1521499156951904, + "learning_rate": 3.149038461538462e-05, + "loss": 1.0519, + "step": 1333 + }, + { + "epoch": 0.7781828788099752, + "grad_norm": 1.1837037801742554, + "learning_rate": 3.147536057692308e-05, + "loss": 0.9725, + "step": 1334 + }, + { + "epoch": 0.7787662242963395, + "grad_norm": 1.085605502128601, + "learning_rate": 3.146033653846154e-05, + "loss": 1.1865, + "step": 1335 + }, + { + "epoch": 0.7793495697827038, + "grad_norm": 1.0882564783096313, + "learning_rate": 3.14453125e-05, + "loss": 0.9782, + "step": 1336 + }, + { + "epoch": 0.7799329152690682, + "grad_norm": 1.1564704179763794, + "learning_rate": 3.1430288461538465e-05, + "loss": 0.8602, + "step": 1337 + }, + { + "epoch": 0.7805162607554325, + "grad_norm": 1.3103642463684082, + "learning_rate": 3.141526442307692e-05, + "loss": 1.144, + "step": 1338 + }, + { + "epoch": 0.7810996062417968, + "grad_norm": 1.2268692255020142, + "learning_rate": 3.140024038461539e-05, + "loss": 0.8534, + "step": 1339 + }, + { + "epoch": 0.781682951728161, + "grad_norm": 1.2421032190322876, + "learning_rate": 3.138521634615385e-05, + "loss": 1.1487, + "step": 1340 + }, + { + "epoch": 0.7822662972145253, + "grad_norm": 1.0822752714157104, + "learning_rate": 3.1370192307692306e-05, + "loss": 0.963, + "step": 1341 + }, + { + "epoch": 0.7828496427008896, + "grad_norm": 1.2561531066894531, + "learning_rate": 3.135516826923077e-05, + "loss": 1.1393, + "step": 1342 + }, + { + "epoch": 0.7834329881872539, + "grad_norm": 0.9926168918609619, + "learning_rate": 3.134014423076923e-05, + "loss": 0.8662, + "step": 1343 + }, + { + "epoch": 0.7840163336736182, + "grad_norm": 1.2411295175552368, + "learning_rate": 3.1325120192307695e-05, + "loss": 1.0019, + "step": 1344 + }, + { + "epoch": 0.7845996791599825, + "grad_norm": 1.1900317668914795, + "learning_rate": 3.131009615384616e-05, + "loss": 1.0258, + "step": 1345 + }, + { + "epoch": 0.7851830246463468, + "grad_norm": 1.0790519714355469, + "learning_rate": 3.129507211538462e-05, + "loss": 1.1386, + "step": 1346 + }, + { + "epoch": 0.7857663701327111, + "grad_norm": 2.3410184383392334, + "learning_rate": 3.128004807692308e-05, + "loss": 0.8789, + "step": 1347 + }, + { + "epoch": 0.7863497156190754, + "grad_norm": 1.0258671045303345, + "learning_rate": 3.1265024038461535e-05, + "loss": 1.015, + "step": 1348 + }, + { + "epoch": 0.7869330611054397, + "grad_norm": 1.0533411502838135, + "learning_rate": 3.125e-05, + "loss": 0.8044, + "step": 1349 + }, + { + "epoch": 0.787516406591804, + "grad_norm": 1.1454368829727173, + "learning_rate": 3.1234975961538466e-05, + "loss": 1.0102, + "step": 1350 + }, + { + "epoch": 0.7880997520781683, + "grad_norm": 1.3050988912582397, + "learning_rate": 3.1219951923076924e-05, + "loss": 1.1298, + "step": 1351 + }, + { + "epoch": 0.7886830975645326, + "grad_norm": 1.1478936672210693, + "learning_rate": 3.120492788461539e-05, + "loss": 0.9447, + "step": 1352 + }, + { + "epoch": 0.7892664430508969, + "grad_norm": 1.2745267152786255, + "learning_rate": 3.118990384615385e-05, + "loss": 0.9069, + "step": 1353 + }, + { + "epoch": 0.7898497885372612, + "grad_norm": 1.031055212020874, + "learning_rate": 3.117487980769231e-05, + "loss": 0.9162, + "step": 1354 + }, + { + "epoch": 0.7904331340236255, + "grad_norm": 1.1818459033966064, + "learning_rate": 3.115985576923077e-05, + "loss": 0.9005, + "step": 1355 + }, + { + "epoch": 0.7910164795099898, + "grad_norm": 1.157064437866211, + "learning_rate": 3.114483173076923e-05, + "loss": 1.0922, + "step": 1356 + }, + { + "epoch": 0.7915998249963541, + "grad_norm": 1.3568843603134155, + "learning_rate": 3.1129807692307696e-05, + "loss": 0.9815, + "step": 1357 + }, + { + "epoch": 0.7921831704827184, + "grad_norm": 0.9772933125495911, + "learning_rate": 3.111478365384616e-05, + "loss": 0.848, + "step": 1358 + }, + { + "epoch": 0.7927665159690827, + "grad_norm": 1.2958146333694458, + "learning_rate": 3.109975961538461e-05, + "loss": 0.903, + "step": 1359 + }, + { + "epoch": 0.793349861455447, + "grad_norm": 1.325095772743225, + "learning_rate": 3.108473557692308e-05, + "loss": 0.8335, + "step": 1360 + }, + { + "epoch": 0.7939332069418112, + "grad_norm": 1.3909435272216797, + "learning_rate": 3.1069711538461537e-05, + "loss": 1.1868, + "step": 1361 + }, + { + "epoch": 0.7945165524281755, + "grad_norm": 1.1557323932647705, + "learning_rate": 3.10546875e-05, + "loss": 1.2876, + "step": 1362 + }, + { + "epoch": 0.7950998979145398, + "grad_norm": 1.0653504133224487, + "learning_rate": 3.103966346153847e-05, + "loss": 1.0039, + "step": 1363 + }, + { + "epoch": 0.7956832434009042, + "grad_norm": 1.1019622087478638, + "learning_rate": 3.1024639423076926e-05, + "loss": 0.8072, + "step": 1364 + }, + { + "epoch": 0.7962665888872685, + "grad_norm": 1.1759988069534302, + "learning_rate": 3.1009615384615384e-05, + "loss": 1.038, + "step": 1365 + }, + { + "epoch": 0.7968499343736328, + "grad_norm": 1.2272703647613525, + "learning_rate": 3.099459134615384e-05, + "loss": 1.0167, + "step": 1366 + }, + { + "epoch": 0.7974332798599971, + "grad_norm": 1.6301058530807495, + "learning_rate": 3.097956730769231e-05, + "loss": 0.9102, + "step": 1367 + }, + { + "epoch": 0.7980166253463614, + "grad_norm": 1.059002161026001, + "learning_rate": 3.096454326923077e-05, + "loss": 1.1956, + "step": 1368 + }, + { + "epoch": 0.7985999708327257, + "grad_norm": 1.0058438777923584, + "learning_rate": 3.094951923076923e-05, + "loss": 0.7865, + "step": 1369 + }, + { + "epoch": 0.79918331631909, + "grad_norm": 1.3153209686279297, + "learning_rate": 3.09344951923077e-05, + "loss": 0.995, + "step": 1370 + }, + { + "epoch": 0.7997666618054543, + "grad_norm": 1.1372942924499512, + "learning_rate": 3.0919471153846155e-05, + "loss": 0.901, + "step": 1371 + }, + { + "epoch": 0.8003500072918186, + "grad_norm": 1.0216442346572876, + "learning_rate": 3.0904447115384614e-05, + "loss": 1.1966, + "step": 1372 + }, + { + "epoch": 0.8009333527781829, + "grad_norm": 1.054608702659607, + "learning_rate": 3.088942307692308e-05, + "loss": 0.8792, + "step": 1373 + }, + { + "epoch": 0.8015166982645472, + "grad_norm": 1.2942436933517456, + "learning_rate": 3.087439903846154e-05, + "loss": 1.1201, + "step": 1374 + }, + { + "epoch": 0.8021000437509115, + "grad_norm": 1.2945632934570312, + "learning_rate": 3.0859375e-05, + "loss": 0.936, + "step": 1375 + }, + { + "epoch": 0.8026833892372758, + "grad_norm": 1.213426947593689, + "learning_rate": 3.084435096153847e-05, + "loss": 0.9309, + "step": 1376 + }, + { + "epoch": 0.8032667347236401, + "grad_norm": 1.0726510286331177, + "learning_rate": 3.082932692307692e-05, + "loss": 1.1424, + "step": 1377 + }, + { + "epoch": 0.8038500802100044, + "grad_norm": 1.3145674467086792, + "learning_rate": 3.0814302884615385e-05, + "loss": 1.0205, + "step": 1378 + }, + { + "epoch": 0.8044334256963687, + "grad_norm": 1.082029104232788, + "learning_rate": 3.0799278846153844e-05, + "loss": 0.8689, + "step": 1379 + }, + { + "epoch": 0.805016771182733, + "grad_norm": 1.0544030666351318, + "learning_rate": 3.078425480769231e-05, + "loss": 0.8943, + "step": 1380 + }, + { + "epoch": 0.8056001166690973, + "grad_norm": 1.1208224296569824, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.9113, + "step": 1381 + }, + { + "epoch": 0.8061834621554615, + "grad_norm": 1.1705317497253418, + "learning_rate": 3.075420673076923e-05, + "loss": 0.7789, + "step": 1382 + }, + { + "epoch": 0.8067668076418258, + "grad_norm": 1.1692845821380615, + "learning_rate": 3.073918269230769e-05, + "loss": 1.1744, + "step": 1383 + }, + { + "epoch": 0.8073501531281901, + "grad_norm": 1.0401328802108765, + "learning_rate": 3.072415865384616e-05, + "loss": 0.9221, + "step": 1384 + }, + { + "epoch": 0.8079334986145544, + "grad_norm": 1.0775591135025024, + "learning_rate": 3.0709134615384615e-05, + "loss": 1.2103, + "step": 1385 + }, + { + "epoch": 0.8085168441009187, + "grad_norm": 1.3836114406585693, + "learning_rate": 3.069411057692308e-05, + "loss": 0.8438, + "step": 1386 + }, + { + "epoch": 0.809100189587283, + "grad_norm": 1.1026825904846191, + "learning_rate": 3.067908653846154e-05, + "loss": 0.8164, + "step": 1387 + }, + { + "epoch": 0.8096835350736473, + "grad_norm": 1.80268132686615, + "learning_rate": 3.0664062500000004e-05, + "loss": 1.0345, + "step": 1388 + }, + { + "epoch": 0.8102668805600116, + "grad_norm": 1.5059008598327637, + "learning_rate": 3.064903846153846e-05, + "loss": 0.9297, + "step": 1389 + }, + { + "epoch": 0.810850226046376, + "grad_norm": 1.0632637739181519, + "learning_rate": 3.063401442307692e-05, + "loss": 1.237, + "step": 1390 + }, + { + "epoch": 0.8114335715327403, + "grad_norm": 1.1647098064422607, + "learning_rate": 3.0618990384615386e-05, + "loss": 1.0494, + "step": 1391 + }, + { + "epoch": 0.8120169170191046, + "grad_norm": 1.0377775430679321, + "learning_rate": 3.0603966346153845e-05, + "loss": 1.3689, + "step": 1392 + }, + { + "epoch": 0.8126002625054689, + "grad_norm": 1.713774561882019, + "learning_rate": 3.058894230769231e-05, + "loss": 1.1149, + "step": 1393 + }, + { + "epoch": 0.8131836079918332, + "grad_norm": 1.2139819860458374, + "learning_rate": 3.0573918269230776e-05, + "loss": 0.8279, + "step": 1394 + }, + { + "epoch": 0.8137669534781975, + "grad_norm": 1.3912312984466553, + "learning_rate": 3.055889423076923e-05, + "loss": 1.0654, + "step": 1395 + }, + { + "epoch": 0.8143502989645618, + "grad_norm": 1.276260256767273, + "learning_rate": 3.054387019230769e-05, + "loss": 0.8086, + "step": 1396 + }, + { + "epoch": 0.8149336444509261, + "grad_norm": 1.3147262334823608, + "learning_rate": 3.052884615384616e-05, + "loss": 1.1643, + "step": 1397 + }, + { + "epoch": 0.8155169899372904, + "grad_norm": 1.760912299156189, + "learning_rate": 3.0513822115384616e-05, + "loss": 1.2003, + "step": 1398 + }, + { + "epoch": 0.8161003354236547, + "grad_norm": 1.060642957687378, + "learning_rate": 3.0498798076923078e-05, + "loss": 0.8297, + "step": 1399 + }, + { + "epoch": 0.816683680910019, + "grad_norm": 1.1941111087799072, + "learning_rate": 3.0483774038461537e-05, + "loss": 0.8585, + "step": 1400 + }, + { + "epoch": 0.816683680910019, + "eval_loss_squad": 0.8743070242926478, + "eval_perplexity": 8.211708550571148, + "eval_perplexity_reconstruct": 1.9933093086287428, + "step": 1400 + }, + { + "epoch": 0.8172670263963833, + "grad_norm": 1.1372501850128174, + "learning_rate": 3.0468750000000002e-05, + "loss": 1.0104, + "step": 1401 + }, + { + "epoch": 0.8178503718827476, + "grad_norm": 1.203554391860962, + "learning_rate": 3.0453725961538464e-05, + "loss": 1.1503, + "step": 1402 + }, + { + "epoch": 0.8184337173691119, + "grad_norm": 1.5660467147827148, + "learning_rate": 3.0438701923076922e-05, + "loss": 0.9822, + "step": 1403 + }, + { + "epoch": 0.8190170628554762, + "grad_norm": 1.2876819372177124, + "learning_rate": 3.0423677884615388e-05, + "loss": 1.1155, + "step": 1404 + }, + { + "epoch": 0.8196004083418404, + "grad_norm": 1.0389353036880493, + "learning_rate": 3.0408653846153846e-05, + "loss": 0.8308, + "step": 1405 + }, + { + "epoch": 0.8201837538282047, + "grad_norm": 1.467089295387268, + "learning_rate": 3.0393629807692308e-05, + "loss": 1.308, + "step": 1406 + }, + { + "epoch": 0.820767099314569, + "grad_norm": 1.135607361793518, + "learning_rate": 3.0378605769230773e-05, + "loss": 0.9116, + "step": 1407 + }, + { + "epoch": 0.8213504448009333, + "grad_norm": 1.0728017091751099, + "learning_rate": 3.0363581730769232e-05, + "loss": 0.9576, + "step": 1408 + }, + { + "epoch": 0.8219337902872976, + "grad_norm": 1.2483171224594116, + "learning_rate": 3.0348557692307694e-05, + "loss": 1.0636, + "step": 1409 + }, + { + "epoch": 0.8225171357736619, + "grad_norm": 1.3540332317352295, + "learning_rate": 3.033353365384616e-05, + "loss": 0.8896, + "step": 1410 + }, + { + "epoch": 0.8231004812600262, + "grad_norm": 1.4841009378433228, + "learning_rate": 3.0318509615384618e-05, + "loss": 0.9574, + "step": 1411 + }, + { + "epoch": 0.8236838267463905, + "grad_norm": 1.2146106958389282, + "learning_rate": 3.030348557692308e-05, + "loss": 1.0769, + "step": 1412 + }, + { + "epoch": 0.8242671722327548, + "grad_norm": 1.104175090789795, + "learning_rate": 3.0288461538461538e-05, + "loss": 0.9756, + "step": 1413 + }, + { + "epoch": 0.8248505177191191, + "grad_norm": 0.9717370271682739, + "learning_rate": 3.02734375e-05, + "loss": 0.9478, + "step": 1414 + }, + { + "epoch": 0.8254338632054834, + "grad_norm": 1.0630606412887573, + "learning_rate": 3.0258413461538465e-05, + "loss": 0.9061, + "step": 1415 + }, + { + "epoch": 0.8260172086918477, + "grad_norm": 1.1713072061538696, + "learning_rate": 3.0243389423076924e-05, + "loss": 1.1889, + "step": 1416 + }, + { + "epoch": 0.8266005541782121, + "grad_norm": 1.2068907022476196, + "learning_rate": 3.0228365384615385e-05, + "loss": 0.9438, + "step": 1417 + }, + { + "epoch": 0.8271838996645764, + "grad_norm": 0.9715260863304138, + "learning_rate": 3.0213341346153844e-05, + "loss": 0.7337, + "step": 1418 + }, + { + "epoch": 0.8277672451509407, + "grad_norm": 1.0985807180404663, + "learning_rate": 3.019831730769231e-05, + "loss": 1.0818, + "step": 1419 + }, + { + "epoch": 0.828350590637305, + "grad_norm": 1.2472034692764282, + "learning_rate": 3.018329326923077e-05, + "loss": 0.9131, + "step": 1420 + }, + { + "epoch": 0.8289339361236693, + "grad_norm": 1.1218814849853516, + "learning_rate": 3.016826923076923e-05, + "loss": 1.0787, + "step": 1421 + }, + { + "epoch": 0.8295172816100336, + "grad_norm": 1.0618772506713867, + "learning_rate": 3.0153245192307695e-05, + "loss": 0.8397, + "step": 1422 + }, + { + "epoch": 0.8301006270963979, + "grad_norm": 1.021227240562439, + "learning_rate": 3.0138221153846157e-05, + "loss": 0.7859, + "step": 1423 + }, + { + "epoch": 0.8306839725827622, + "grad_norm": 1.3854846954345703, + "learning_rate": 3.0123197115384615e-05, + "loss": 0.9057, + "step": 1424 + }, + { + "epoch": 0.8312673180691265, + "grad_norm": 1.0573982000350952, + "learning_rate": 3.010817307692308e-05, + "loss": 0.8431, + "step": 1425 + }, + { + "epoch": 0.8318506635554908, + "grad_norm": 1.3047147989273071, + "learning_rate": 3.009314903846154e-05, + "loss": 1.1606, + "step": 1426 + }, + { + "epoch": 0.832434009041855, + "grad_norm": 1.6169391870498657, + "learning_rate": 3.0078125e-05, + "loss": 1.0701, + "step": 1427 + }, + { + "epoch": 0.8330173545282193, + "grad_norm": 1.2799001932144165, + "learning_rate": 3.0063100961538466e-05, + "loss": 1.0659, + "step": 1428 + }, + { + "epoch": 0.8336007000145836, + "grad_norm": 1.3066989183425903, + "learning_rate": 3.0048076923076925e-05, + "loss": 1.0707, + "step": 1429 + }, + { + "epoch": 0.8341840455009479, + "grad_norm": 1.275020956993103, + "learning_rate": 3.0033052884615387e-05, + "loss": 1.0019, + "step": 1430 + }, + { + "epoch": 0.8347673909873122, + "grad_norm": 1.084713101387024, + "learning_rate": 3.0018028846153845e-05, + "loss": 1.0027, + "step": 1431 + }, + { + "epoch": 0.8353507364736765, + "grad_norm": 1.222685694694519, + "learning_rate": 3.0003004807692307e-05, + "loss": 0.8549, + "step": 1432 + }, + { + "epoch": 0.8359340819600408, + "grad_norm": 1.1162559986114502, + "learning_rate": 2.9987980769230772e-05, + "loss": 0.9687, + "step": 1433 + }, + { + "epoch": 0.8365174274464051, + "grad_norm": 1.1409958600997925, + "learning_rate": 2.997295673076923e-05, + "loss": 0.9169, + "step": 1434 + }, + { + "epoch": 0.8371007729327694, + "grad_norm": 1.1237506866455078, + "learning_rate": 2.9957932692307693e-05, + "loss": 1.0596, + "step": 1435 + }, + { + "epoch": 0.8376841184191337, + "grad_norm": 1.2333991527557373, + "learning_rate": 2.9942908653846158e-05, + "loss": 1.0413, + "step": 1436 + }, + { + "epoch": 0.838267463905498, + "grad_norm": 1.2132785320281982, + "learning_rate": 2.9927884615384616e-05, + "loss": 0.8801, + "step": 1437 + }, + { + "epoch": 0.8388508093918623, + "grad_norm": 2.4601593017578125, + "learning_rate": 2.991286057692308e-05, + "loss": 1.0372, + "step": 1438 + }, + { + "epoch": 0.8394341548782266, + "grad_norm": 1.1504793167114258, + "learning_rate": 2.9897836538461537e-05, + "loss": 1.0258, + "step": 1439 + }, + { + "epoch": 0.8400175003645909, + "grad_norm": 1.0757592916488647, + "learning_rate": 2.9882812500000002e-05, + "loss": 0.925, + "step": 1440 + }, + { + "epoch": 0.8406008458509552, + "grad_norm": 1.2550078630447388, + "learning_rate": 2.9867788461538464e-05, + "loss": 1.0328, + "step": 1441 + }, + { + "epoch": 0.8411841913373195, + "grad_norm": 1.108076810836792, + "learning_rate": 2.9852764423076923e-05, + "loss": 1.0529, + "step": 1442 + }, + { + "epoch": 0.8417675368236839, + "grad_norm": 1.1696008443832397, + "learning_rate": 2.9837740384615388e-05, + "loss": 0.8204, + "step": 1443 + }, + { + "epoch": 0.8423508823100482, + "grad_norm": 0.9712606072425842, + "learning_rate": 2.9822716346153846e-05, + "loss": 0.9446, + "step": 1444 + }, + { + "epoch": 0.8429342277964125, + "grad_norm": 1.253280520439148, + "learning_rate": 2.9807692307692308e-05, + "loss": 0.9655, + "step": 1445 + }, + { + "epoch": 0.8435175732827768, + "grad_norm": 1.2332576513290405, + "learning_rate": 2.9792668269230773e-05, + "loss": 0.9132, + "step": 1446 + }, + { + "epoch": 0.8441009187691411, + "grad_norm": 0.9151219129562378, + "learning_rate": 2.9777644230769232e-05, + "loss": 1.0091, + "step": 1447 + }, + { + "epoch": 0.8446842642555054, + "grad_norm": 1.0670338869094849, + "learning_rate": 2.9762620192307694e-05, + "loss": 0.9617, + "step": 1448 + }, + { + "epoch": 0.8452676097418697, + "grad_norm": 1.4182400703430176, + "learning_rate": 2.974759615384616e-05, + "loss": 0.9599, + "step": 1449 + }, + { + "epoch": 0.845850955228234, + "grad_norm": 1.4236427545547485, + "learning_rate": 2.9732572115384614e-05, + "loss": 1.0529, + "step": 1450 + }, + { + "epoch": 0.8464343007145982, + "grad_norm": 1.3941996097564697, + "learning_rate": 2.971754807692308e-05, + "loss": 0.8889, + "step": 1451 + }, + { + "epoch": 0.8470176462009625, + "grad_norm": 1.1831328868865967, + "learning_rate": 2.9702524038461538e-05, + "loss": 0.9023, + "step": 1452 + }, + { + "epoch": 0.8476009916873268, + "grad_norm": 1.4158825874328613, + "learning_rate": 2.96875e-05, + "loss": 1.1108, + "step": 1453 + }, + { + "epoch": 0.8481843371736911, + "grad_norm": 0.9731126427650452, + "learning_rate": 2.9672475961538465e-05, + "loss": 0.8187, + "step": 1454 + }, + { + "epoch": 0.8487676826600554, + "grad_norm": 1.0598933696746826, + "learning_rate": 2.9657451923076924e-05, + "loss": 0.8798, + "step": 1455 + }, + { + "epoch": 0.8493510281464197, + "grad_norm": 1.04989492893219, + "learning_rate": 2.9642427884615386e-05, + "loss": 0.9962, + "step": 1456 + }, + { + "epoch": 0.849934373632784, + "grad_norm": 1.1285686492919922, + "learning_rate": 2.9627403846153844e-05, + "loss": 0.8692, + "step": 1457 + }, + { + "epoch": 0.8505177191191483, + "grad_norm": 1.1516649723052979, + "learning_rate": 2.961237980769231e-05, + "loss": 0.8191, + "step": 1458 + }, + { + "epoch": 0.8511010646055126, + "grad_norm": 1.2343637943267822, + "learning_rate": 2.959735576923077e-05, + "loss": 0.8846, + "step": 1459 + }, + { + "epoch": 0.8516844100918769, + "grad_norm": 1.295943021774292, + "learning_rate": 2.958233173076923e-05, + "loss": 0.9154, + "step": 1460 + }, + { + "epoch": 0.8522677555782412, + "grad_norm": 1.3594167232513428, + "learning_rate": 2.9567307692307695e-05, + "loss": 0.9516, + "step": 1461 + }, + { + "epoch": 0.8528511010646055, + "grad_norm": 1.2709529399871826, + "learning_rate": 2.9552283653846157e-05, + "loss": 1.0265, + "step": 1462 + }, + { + "epoch": 0.8534344465509698, + "grad_norm": 1.2515851259231567, + "learning_rate": 2.9537259615384615e-05, + "loss": 0.9376, + "step": 1463 + }, + { + "epoch": 0.8540177920373341, + "grad_norm": 0.9908419251441956, + "learning_rate": 2.952223557692308e-05, + "loss": 1.0423, + "step": 1464 + }, + { + "epoch": 0.8546011375236984, + "grad_norm": 1.3674644231796265, + "learning_rate": 2.950721153846154e-05, + "loss": 1.2146, + "step": 1465 + }, + { + "epoch": 0.8551844830100627, + "grad_norm": 1.239865779876709, + "learning_rate": 2.94921875e-05, + "loss": 0.9979, + "step": 1466 + }, + { + "epoch": 0.855767828496427, + "grad_norm": 2.099808692932129, + "learning_rate": 2.9477163461538466e-05, + "loss": 1.1026, + "step": 1467 + }, + { + "epoch": 0.8563511739827913, + "grad_norm": 1.1289135217666626, + "learning_rate": 2.946213942307692e-05, + "loss": 1.0327, + "step": 1468 + }, + { + "epoch": 0.8569345194691556, + "grad_norm": 1.2344995737075806, + "learning_rate": 2.9447115384615387e-05, + "loss": 0.9183, + "step": 1469 + }, + { + "epoch": 0.85751786495552, + "grad_norm": 1.2652794122695923, + "learning_rate": 2.9432091346153845e-05, + "loss": 0.9498, + "step": 1470 + }, + { + "epoch": 0.8581012104418843, + "grad_norm": 1.1644681692123413, + "learning_rate": 2.9417067307692307e-05, + "loss": 1.037, + "step": 1471 + }, + { + "epoch": 0.8586845559282485, + "grad_norm": 1.0714110136032104, + "learning_rate": 2.9402043269230772e-05, + "loss": 1.0802, + "step": 1472 + }, + { + "epoch": 0.8592679014146128, + "grad_norm": 1.1564422845840454, + "learning_rate": 2.938701923076923e-05, + "loss": 1.0889, + "step": 1473 + }, + { + "epoch": 0.8598512469009771, + "grad_norm": 1.3579368591308594, + "learning_rate": 2.9371995192307693e-05, + "loss": 0.8359, + "step": 1474 + }, + { + "epoch": 0.8604345923873414, + "grad_norm": 1.0801093578338623, + "learning_rate": 2.9356971153846158e-05, + "loss": 0.911, + "step": 1475 + }, + { + "epoch": 0.8610179378737057, + "grad_norm": 1.1530592441558838, + "learning_rate": 2.9341947115384617e-05, + "loss": 1.0432, + "step": 1476 + }, + { + "epoch": 0.86160128336007, + "grad_norm": 1.07007896900177, + "learning_rate": 2.932692307692308e-05, + "loss": 1.0042, + "step": 1477 + }, + { + "epoch": 0.8621846288464343, + "grad_norm": 1.128960371017456, + "learning_rate": 2.9311899038461537e-05, + "loss": 1.0481, + "step": 1478 + }, + { + "epoch": 0.8627679743327986, + "grad_norm": 1.7839515209197998, + "learning_rate": 2.9296875000000002e-05, + "loss": 1.1279, + "step": 1479 + }, + { + "epoch": 0.8633513198191629, + "grad_norm": 1.7222251892089844, + "learning_rate": 2.9281850961538464e-05, + "loss": 0.9325, + "step": 1480 + }, + { + "epoch": 0.8639346653055272, + "grad_norm": 0.8684887290000916, + "learning_rate": 2.9266826923076923e-05, + "loss": 1.1595, + "step": 1481 + }, + { + "epoch": 0.8645180107918915, + "grad_norm": 1.20137357711792, + "learning_rate": 2.9251802884615388e-05, + "loss": 1.0726, + "step": 1482 + }, + { + "epoch": 0.8651013562782558, + "grad_norm": 1.3149369955062866, + "learning_rate": 2.9236778846153846e-05, + "loss": 1.1417, + "step": 1483 + }, + { + "epoch": 0.8656847017646201, + "grad_norm": 1.2710528373718262, + "learning_rate": 2.922175480769231e-05, + "loss": 1.0942, + "step": 1484 + }, + { + "epoch": 0.8662680472509844, + "grad_norm": 1.145824909210205, + "learning_rate": 2.9206730769230774e-05, + "loss": 1.0122, + "step": 1485 + }, + { + "epoch": 0.8668513927373487, + "grad_norm": 1.2785214185714722, + "learning_rate": 2.9191706730769232e-05, + "loss": 1.1068, + "step": 1486 + }, + { + "epoch": 0.867434738223713, + "grad_norm": 1.3779847621917725, + "learning_rate": 2.9176682692307694e-05, + "loss": 1.1604, + "step": 1487 + }, + { + "epoch": 0.8680180837100773, + "grad_norm": 1.2660037279129028, + "learning_rate": 2.916165865384616e-05, + "loss": 0.9293, + "step": 1488 + }, + { + "epoch": 0.8686014291964416, + "grad_norm": 1.2929606437683105, + "learning_rate": 2.9146634615384614e-05, + "loss": 0.879, + "step": 1489 + }, + { + "epoch": 0.8691847746828059, + "grad_norm": 1.376483678817749, + "learning_rate": 2.913161057692308e-05, + "loss": 1.0383, + "step": 1490 + }, + { + "epoch": 0.8697681201691702, + "grad_norm": 1.1967178583145142, + "learning_rate": 2.9116586538461538e-05, + "loss": 0.981, + "step": 1491 + }, + { + "epoch": 0.8703514656555345, + "grad_norm": 1.1415852308273315, + "learning_rate": 2.91015625e-05, + "loss": 0.8465, + "step": 1492 + }, + { + "epoch": 0.8709348111418987, + "grad_norm": 1.2080132961273193, + "learning_rate": 2.9086538461538465e-05, + "loss": 1.0375, + "step": 1493 + }, + { + "epoch": 0.871518156628263, + "grad_norm": 1.1498886346817017, + "learning_rate": 2.9071514423076924e-05, + "loss": 1.1026, + "step": 1494 + }, + { + "epoch": 0.8721015021146273, + "grad_norm": 1.213456630706787, + "learning_rate": 2.9056490384615386e-05, + "loss": 1.1582, + "step": 1495 + }, + { + "epoch": 0.8726848476009917, + "grad_norm": 2.127002239227295, + "learning_rate": 2.9041466346153844e-05, + "loss": 0.9406, + "step": 1496 + }, + { + "epoch": 0.873268193087356, + "grad_norm": 1.454347848892212, + "learning_rate": 2.902644230769231e-05, + "loss": 0.9734, + "step": 1497 + }, + { + "epoch": 0.8738515385737203, + "grad_norm": 1.1562999486923218, + "learning_rate": 2.901141826923077e-05, + "loss": 1.1249, + "step": 1498 + }, + { + "epoch": 0.8744348840600846, + "grad_norm": 1.0088770389556885, + "learning_rate": 2.899639423076923e-05, + "loss": 1.1021, + "step": 1499 + }, + { + "epoch": 0.8750182295464489, + "grad_norm": 0.972812294960022, + "learning_rate": 2.8981370192307695e-05, + "loss": 0.8978, + "step": 1500 + }, + { + "epoch": 0.8756015750328132, + "grad_norm": 1.3404576778411865, + "learning_rate": 2.8966346153846157e-05, + "loss": 1.0195, + "step": 1501 + }, + { + "epoch": 0.8761849205191775, + "grad_norm": 1.1055123805999756, + "learning_rate": 2.8951322115384616e-05, + "loss": 1.1829, + "step": 1502 + }, + { + "epoch": 0.8767682660055418, + "grad_norm": 0.9296578764915466, + "learning_rate": 2.893629807692308e-05, + "loss": 1.1158, + "step": 1503 + }, + { + "epoch": 0.8773516114919061, + "grad_norm": 1.197265625, + "learning_rate": 2.892127403846154e-05, + "loss": 1.0498, + "step": 1504 + }, + { + "epoch": 0.8779349569782704, + "grad_norm": 0.9996442794799805, + "learning_rate": 2.890625e-05, + "loss": 1.0372, + "step": 1505 + }, + { + "epoch": 0.8785183024646347, + "grad_norm": 1.2330306768417358, + "learning_rate": 2.8891225961538467e-05, + "loss": 1.0224, + "step": 1506 + }, + { + "epoch": 0.879101647950999, + "grad_norm": 0.902051568031311, + "learning_rate": 2.887620192307692e-05, + "loss": 0.6842, + "step": 1507 + }, + { + "epoch": 0.8796849934373633, + "grad_norm": 1.1468744277954102, + "learning_rate": 2.8861177884615387e-05, + "loss": 0.7994, + "step": 1508 + }, + { + "epoch": 0.8802683389237276, + "grad_norm": 1.3102710247039795, + "learning_rate": 2.8846153846153845e-05, + "loss": 0.8041, + "step": 1509 + }, + { + "epoch": 0.8808516844100919, + "grad_norm": 1.234621286392212, + "learning_rate": 2.8831129807692307e-05, + "loss": 0.9069, + "step": 1510 + }, + { + "epoch": 0.8814350298964562, + "grad_norm": 1.3230934143066406, + "learning_rate": 2.8816105769230773e-05, + "loss": 0.8703, + "step": 1511 + }, + { + "epoch": 0.8820183753828205, + "grad_norm": 1.1847695112228394, + "learning_rate": 2.880108173076923e-05, + "loss": 1.0461, + "step": 1512 + }, + { + "epoch": 0.8826017208691848, + "grad_norm": 1.535765528678894, + "learning_rate": 2.8786057692307693e-05, + "loss": 0.9742, + "step": 1513 + }, + { + "epoch": 0.883185066355549, + "grad_norm": 1.295670509338379, + "learning_rate": 2.8771033653846158e-05, + "loss": 0.9253, + "step": 1514 + }, + { + "epoch": 0.8837684118419133, + "grad_norm": 1.4603569507598877, + "learning_rate": 2.8756009615384617e-05, + "loss": 1.2047, + "step": 1515 + }, + { + "epoch": 0.8843517573282776, + "grad_norm": 1.117161512374878, + "learning_rate": 2.874098557692308e-05, + "loss": 1.1832, + "step": 1516 + }, + { + "epoch": 0.8849351028146419, + "grad_norm": 1.2172470092773438, + "learning_rate": 2.8725961538461537e-05, + "loss": 1.0312, + "step": 1517 + }, + { + "epoch": 0.8855184483010062, + "grad_norm": 1.279415249824524, + "learning_rate": 2.8710937500000002e-05, + "loss": 1.0095, + "step": 1518 + }, + { + "epoch": 0.8861017937873705, + "grad_norm": 1.3966619968414307, + "learning_rate": 2.8695913461538464e-05, + "loss": 0.921, + "step": 1519 + }, + { + "epoch": 0.8866851392737348, + "grad_norm": 1.2895619869232178, + "learning_rate": 2.8680889423076923e-05, + "loss": 0.9268, + "step": 1520 + }, + { + "epoch": 0.8872684847600991, + "grad_norm": 1.188140869140625, + "learning_rate": 2.8665865384615388e-05, + "loss": 0.8283, + "step": 1521 + }, + { + "epoch": 0.8878518302464634, + "grad_norm": 1.200034499168396, + "learning_rate": 2.8650841346153847e-05, + "loss": 1.0621, + "step": 1522 + }, + { + "epoch": 0.8884351757328278, + "grad_norm": 1.3461898565292358, + "learning_rate": 2.863581730769231e-05, + "loss": 0.9668, + "step": 1523 + }, + { + "epoch": 0.8890185212191921, + "grad_norm": 1.2042393684387207, + "learning_rate": 2.8620793269230774e-05, + "loss": 1.145, + "step": 1524 + }, + { + "epoch": 0.8896018667055564, + "grad_norm": 1.2944340705871582, + "learning_rate": 2.860576923076923e-05, + "loss": 0.8788, + "step": 1525 + }, + { + "epoch": 0.8901852121919207, + "grad_norm": 1.2656347751617432, + "learning_rate": 2.8590745192307694e-05, + "loss": 0.826, + "step": 1526 + }, + { + "epoch": 0.890768557678285, + "grad_norm": 1.079966425895691, + "learning_rate": 2.857572115384616e-05, + "loss": 0.9062, + "step": 1527 + }, + { + "epoch": 0.8913519031646493, + "grad_norm": 1.440693736076355, + "learning_rate": 2.8560697115384615e-05, + "loss": 1.0446, + "step": 1528 + }, + { + "epoch": 0.8919352486510136, + "grad_norm": 1.3850165605545044, + "learning_rate": 2.854567307692308e-05, + "loss": 0.9395, + "step": 1529 + }, + { + "epoch": 0.8925185941373779, + "grad_norm": 1.3351702690124512, + "learning_rate": 2.853064903846154e-05, + "loss": 1.1362, + "step": 1530 + }, + { + "epoch": 0.8931019396237422, + "grad_norm": 0.9820516109466553, + "learning_rate": 2.8515625e-05, + "loss": 0.8793, + "step": 1531 + }, + { + "epoch": 0.8936852851101065, + "grad_norm": 1.218936562538147, + "learning_rate": 2.8500600961538466e-05, + "loss": 0.9328, + "step": 1532 + }, + { + "epoch": 0.8942686305964708, + "grad_norm": 1.2848559617996216, + "learning_rate": 2.8485576923076924e-05, + "loss": 0.955, + "step": 1533 + }, + { + "epoch": 0.8948519760828351, + "grad_norm": 1.3455287218093872, + "learning_rate": 2.8470552884615386e-05, + "loss": 0.9611, + "step": 1534 + }, + { + "epoch": 0.8954353215691994, + "grad_norm": 1.62530517578125, + "learning_rate": 2.8455528846153844e-05, + "loss": 0.9592, + "step": 1535 + }, + { + "epoch": 0.8960186670555637, + "grad_norm": 1.1084357500076294, + "learning_rate": 2.844050480769231e-05, + "loss": 1.1606, + "step": 1536 + }, + { + "epoch": 0.896602012541928, + "grad_norm": 1.11439847946167, + "learning_rate": 2.842548076923077e-05, + "loss": 0.9126, + "step": 1537 + }, + { + "epoch": 0.8971853580282922, + "grad_norm": 1.2643011808395386, + "learning_rate": 2.841045673076923e-05, + "loss": 1.1443, + "step": 1538 + }, + { + "epoch": 0.8977687035146565, + "grad_norm": 1.1699936389923096, + "learning_rate": 2.8395432692307695e-05, + "loss": 0.94, + "step": 1539 + }, + { + "epoch": 0.8983520490010208, + "grad_norm": 1.2582471370697021, + "learning_rate": 2.8380408653846157e-05, + "loss": 1.0579, + "step": 1540 + }, + { + "epoch": 0.8989353944873851, + "grad_norm": 1.1401822566986084, + "learning_rate": 2.8365384615384616e-05, + "loss": 0.8857, + "step": 1541 + }, + { + "epoch": 0.8995187399737494, + "grad_norm": 1.2778136730194092, + "learning_rate": 2.835036057692308e-05, + "loss": 0.7792, + "step": 1542 + }, + { + "epoch": 0.9001020854601137, + "grad_norm": 1.3046090602874756, + "learning_rate": 2.8335336538461536e-05, + "loss": 1.0387, + "step": 1543 + }, + { + "epoch": 0.900685430946478, + "grad_norm": 1.1095534563064575, + "learning_rate": 2.83203125e-05, + "loss": 0.8712, + "step": 1544 + }, + { + "epoch": 0.9012687764328423, + "grad_norm": 1.116352915763855, + "learning_rate": 2.8305288461538467e-05, + "loss": 0.9245, + "step": 1545 + }, + { + "epoch": 0.9018521219192066, + "grad_norm": 1.2703346014022827, + "learning_rate": 2.8290264423076922e-05, + "loss": 0.8139, + "step": 1546 + }, + { + "epoch": 0.9024354674055709, + "grad_norm": 1.2357991933822632, + "learning_rate": 2.8275240384615387e-05, + "loss": 1.1956, + "step": 1547 + }, + { + "epoch": 0.9030188128919352, + "grad_norm": 1.0954554080963135, + "learning_rate": 2.8260216346153846e-05, + "loss": 0.8301, + "step": 1548 + }, + { + "epoch": 0.9036021583782996, + "grad_norm": 1.6321262121200562, + "learning_rate": 2.8245192307692307e-05, + "loss": 0.9625, + "step": 1549 + }, + { + "epoch": 0.9041855038646639, + "grad_norm": 1.3000946044921875, + "learning_rate": 2.8230168269230773e-05, + "loss": 0.9551, + "step": 1550 + }, + { + "epoch": 0.9047688493510282, + "grad_norm": 1.1882951259613037, + "learning_rate": 2.821514423076923e-05, + "loss": 0.8661, + "step": 1551 + }, + { + "epoch": 0.9053521948373925, + "grad_norm": 1.136929988861084, + "learning_rate": 2.8200120192307693e-05, + "loss": 0.8345, + "step": 1552 + }, + { + "epoch": 0.9059355403237568, + "grad_norm": 1.2074054479599, + "learning_rate": 2.818509615384616e-05, + "loss": 1.0495, + "step": 1553 + }, + { + "epoch": 0.9065188858101211, + "grad_norm": 1.0739816427230835, + "learning_rate": 2.8170072115384617e-05, + "loss": 0.8297, + "step": 1554 + }, + { + "epoch": 0.9071022312964854, + "grad_norm": 1.0656182765960693, + "learning_rate": 2.815504807692308e-05, + "loss": 0.9902, + "step": 1555 + }, + { + "epoch": 0.9076855767828497, + "grad_norm": 1.072704553604126, + "learning_rate": 2.8140024038461537e-05, + "loss": 1.1225, + "step": 1556 + }, + { + "epoch": 0.908268922269214, + "grad_norm": 1.2448283433914185, + "learning_rate": 2.8125000000000003e-05, + "loss": 1.2656, + "step": 1557 + }, + { + "epoch": 0.9088522677555783, + "grad_norm": 1.2833285331726074, + "learning_rate": 2.8109975961538465e-05, + "loss": 1.297, + "step": 1558 + }, + { + "epoch": 0.9094356132419426, + "grad_norm": 0.9518376588821411, + "learning_rate": 2.8094951923076923e-05, + "loss": 1.2577, + "step": 1559 + }, + { + "epoch": 0.9100189587283068, + "grad_norm": 1.1654349565505981, + "learning_rate": 2.8079927884615388e-05, + "loss": 0.9981, + "step": 1560 + }, + { + "epoch": 0.9106023042146711, + "grad_norm": 1.316017746925354, + "learning_rate": 2.8064903846153843e-05, + "loss": 0.8892, + "step": 1561 + }, + { + "epoch": 0.9111856497010354, + "grad_norm": 1.2900649309158325, + "learning_rate": 2.804987980769231e-05, + "loss": 0.9713, + "step": 1562 + }, + { + "epoch": 0.9117689951873997, + "grad_norm": 1.169692039489746, + "learning_rate": 2.8034855769230774e-05, + "loss": 1.0375, + "step": 1563 + }, + { + "epoch": 0.912352340673764, + "grad_norm": 1.1953729391098022, + "learning_rate": 2.801983173076923e-05, + "loss": 0.8052, + "step": 1564 + }, + { + "epoch": 0.9129356861601283, + "grad_norm": 1.3699668645858765, + "learning_rate": 2.8004807692307694e-05, + "loss": 0.968, + "step": 1565 + }, + { + "epoch": 0.9135190316464926, + "grad_norm": 1.2539699077606201, + "learning_rate": 2.7989783653846156e-05, + "loss": 1.0042, + "step": 1566 + }, + { + "epoch": 0.9141023771328569, + "grad_norm": 1.1665087938308716, + "learning_rate": 2.7974759615384615e-05, + "loss": 0.949, + "step": 1567 + }, + { + "epoch": 0.9146857226192212, + "grad_norm": 1.421539068222046, + "learning_rate": 2.795973557692308e-05, + "loss": 0.9799, + "step": 1568 + }, + { + "epoch": 0.9152690681055855, + "grad_norm": 1.0832866430282593, + "learning_rate": 2.794471153846154e-05, + "loss": 0.8219, + "step": 1569 + }, + { + "epoch": 0.9158524135919498, + "grad_norm": 1.2853337526321411, + "learning_rate": 2.79296875e-05, + "loss": 1.0887, + "step": 1570 + }, + { + "epoch": 0.9164357590783141, + "grad_norm": 1.1525390148162842, + "learning_rate": 2.7914663461538466e-05, + "loss": 1.0594, + "step": 1571 + }, + { + "epoch": 0.9170191045646784, + "grad_norm": 1.3522123098373413, + "learning_rate": 2.7899639423076924e-05, + "loss": 1.012, + "step": 1572 + }, + { + "epoch": 0.9176024500510427, + "grad_norm": 1.1640657186508179, + "learning_rate": 2.7884615384615386e-05, + "loss": 0.9551, + "step": 1573 + }, + { + "epoch": 0.918185795537407, + "grad_norm": 1.099858045578003, + "learning_rate": 2.7869591346153845e-05, + "loss": 0.9387, + "step": 1574 + }, + { + "epoch": 0.9187691410237713, + "grad_norm": 1.295347809791565, + "learning_rate": 2.785456730769231e-05, + "loss": 0.9975, + "step": 1575 + }, + { + "epoch": 0.9193524865101357, + "grad_norm": 1.0438365936279297, + "learning_rate": 2.7839543269230772e-05, + "loss": 0.9287, + "step": 1576 + }, + { + "epoch": 0.9199358319965, + "grad_norm": 1.5085781812667847, + "learning_rate": 2.782451923076923e-05, + "loss": 0.8949, + "step": 1577 + }, + { + "epoch": 0.9205191774828643, + "grad_norm": 1.1125335693359375, + "learning_rate": 2.7809495192307696e-05, + "loss": 1.1754, + "step": 1578 + }, + { + "epoch": 0.9211025229692286, + "grad_norm": 1.4580250978469849, + "learning_rate": 2.7794471153846157e-05, + "loss": 1.188, + "step": 1579 + }, + { + "epoch": 0.9216858684555929, + "grad_norm": 1.5720735788345337, + "learning_rate": 2.7779447115384616e-05, + "loss": 0.97, + "step": 1580 + }, + { + "epoch": 0.9222692139419572, + "grad_norm": 1.5077130794525146, + "learning_rate": 2.776442307692308e-05, + "loss": 0.9886, + "step": 1581 + }, + { + "epoch": 0.9228525594283215, + "grad_norm": 0.9293258786201477, + "learning_rate": 2.7749399038461536e-05, + "loss": 0.9714, + "step": 1582 + }, + { + "epoch": 0.9234359049146857, + "grad_norm": 1.5474597215652466, + "learning_rate": 2.7734375e-05, + "loss": 0.9918, + "step": 1583 + }, + { + "epoch": 0.92401925040105, + "grad_norm": 1.3098106384277344, + "learning_rate": 2.7719350961538463e-05, + "loss": 1.1593, + "step": 1584 + }, + { + "epoch": 0.9246025958874143, + "grad_norm": 1.1443274021148682, + "learning_rate": 2.7704326923076922e-05, + "loss": 1.0293, + "step": 1585 + }, + { + "epoch": 0.9251859413737786, + "grad_norm": 1.362916111946106, + "learning_rate": 2.7689302884615387e-05, + "loss": 1.2039, + "step": 1586 + }, + { + "epoch": 0.9257692868601429, + "grad_norm": 1.2175136804580688, + "learning_rate": 2.7674278846153846e-05, + "loss": 0.9562, + "step": 1587 + }, + { + "epoch": 0.9263526323465072, + "grad_norm": 1.4667470455169678, + "learning_rate": 2.7659254807692308e-05, + "loss": 0.9903, + "step": 1588 + }, + { + "epoch": 0.9269359778328715, + "grad_norm": 1.317681908607483, + "learning_rate": 2.7644230769230773e-05, + "loss": 1.0178, + "step": 1589 + }, + { + "epoch": 0.9275193233192358, + "grad_norm": 1.2673649787902832, + "learning_rate": 2.762920673076923e-05, + "loss": 1.0001, + "step": 1590 + }, + { + "epoch": 0.9281026688056001, + "grad_norm": 1.055626630783081, + "learning_rate": 2.7614182692307693e-05, + "loss": 1.0075, + "step": 1591 + }, + { + "epoch": 0.9286860142919644, + "grad_norm": 0.938339352607727, + "learning_rate": 2.759915865384616e-05, + "loss": 0.8492, + "step": 1592 + }, + { + "epoch": 0.9292693597783287, + "grad_norm": 1.1331336498260498, + "learning_rate": 2.7584134615384617e-05, + "loss": 1.1089, + "step": 1593 + }, + { + "epoch": 0.929852705264693, + "grad_norm": 0.9863076210021973, + "learning_rate": 2.756911057692308e-05, + "loss": 0.827, + "step": 1594 + }, + { + "epoch": 0.9304360507510573, + "grad_norm": 0.8568504452705383, + "learning_rate": 2.7554086538461537e-05, + "loss": 0.9642, + "step": 1595 + }, + { + "epoch": 0.9310193962374216, + "grad_norm": 1.3427821397781372, + "learning_rate": 2.7539062500000003e-05, + "loss": 1.0344, + "step": 1596 + }, + { + "epoch": 0.9316027417237859, + "grad_norm": 1.1285762786865234, + "learning_rate": 2.7524038461538465e-05, + "loss": 0.9632, + "step": 1597 + }, + { + "epoch": 0.9321860872101502, + "grad_norm": 1.079243779182434, + "learning_rate": 2.7509014423076923e-05, + "loss": 1.1732, + "step": 1598 + }, + { + "epoch": 0.9327694326965145, + "grad_norm": 1.4731907844543457, + "learning_rate": 2.749399038461539e-05, + "loss": 1.0798, + "step": 1599 + }, + { + "epoch": 0.9333527781828788, + "grad_norm": 1.8456404209136963, + "learning_rate": 2.7478966346153844e-05, + "loss": 0.8122, + "step": 1600 + }, + { + "epoch": 0.9333527781828788, + "eval_loss_squad": 0.8185375917516649, + "eval_perplexity": 8.537540772600966, + "eval_perplexity_reconstruct": 1.9311449308939752, + "step": 1600 + }, + { + "epoch": 0.9339361236692431, + "grad_norm": 1.5223437547683716, + "learning_rate": 2.746394230769231e-05, + "loss": 1.0494, + "step": 1601 + }, + { + "epoch": 0.9345194691556075, + "grad_norm": 1.1983387470245361, + "learning_rate": 2.744891826923077e-05, + "loss": 1.1375, + "step": 1602 + }, + { + "epoch": 0.9351028146419718, + "grad_norm": 1.253590703010559, + "learning_rate": 2.743389423076923e-05, + "loss": 0.86, + "step": 1603 + }, + { + "epoch": 0.935686160128336, + "grad_norm": 1.5768380165100098, + "learning_rate": 2.7418870192307695e-05, + "loss": 1.0707, + "step": 1604 + }, + { + "epoch": 0.9362695056147003, + "grad_norm": 1.1141117811203003, + "learning_rate": 2.7403846153846156e-05, + "loss": 0.9249, + "step": 1605 + }, + { + "epoch": 0.9368528511010646, + "grad_norm": 1.229311466217041, + "learning_rate": 2.7388822115384615e-05, + "loss": 1.1422, + "step": 1606 + }, + { + "epoch": 0.9374361965874289, + "grad_norm": 1.0226390361785889, + "learning_rate": 2.737379807692308e-05, + "loss": 1.1093, + "step": 1607 + }, + { + "epoch": 0.9380195420737932, + "grad_norm": 1.055548071861267, + "learning_rate": 2.735877403846154e-05, + "loss": 1.1264, + "step": 1608 + }, + { + "epoch": 0.9386028875601575, + "grad_norm": 1.326737642288208, + "learning_rate": 2.734375e-05, + "loss": 0.7929, + "step": 1609 + }, + { + "epoch": 0.9391862330465218, + "grad_norm": 1.2810485363006592, + "learning_rate": 2.7328725961538466e-05, + "loss": 0.9209, + "step": 1610 + }, + { + "epoch": 0.9397695785328861, + "grad_norm": 1.1614972352981567, + "learning_rate": 2.7313701923076924e-05, + "loss": 0.8571, + "step": 1611 + }, + { + "epoch": 0.9403529240192504, + "grad_norm": 1.063585638999939, + "learning_rate": 2.7298677884615386e-05, + "loss": 1.0951, + "step": 1612 + }, + { + "epoch": 0.9409362695056147, + "grad_norm": 1.1287500858306885, + "learning_rate": 2.7283653846153845e-05, + "loss": 0.9155, + "step": 1613 + }, + { + "epoch": 0.941519614991979, + "grad_norm": 1.202637791633606, + "learning_rate": 2.726862980769231e-05, + "loss": 0.922, + "step": 1614 + }, + { + "epoch": 0.9421029604783433, + "grad_norm": 0.9116735458374023, + "learning_rate": 2.7253605769230772e-05, + "loss": 1.1287, + "step": 1615 + }, + { + "epoch": 0.9426863059647076, + "grad_norm": 1.2105416059494019, + "learning_rate": 2.723858173076923e-05, + "loss": 1.0168, + "step": 1616 + }, + { + "epoch": 0.9432696514510719, + "grad_norm": 1.4131770133972168, + "learning_rate": 2.7223557692307696e-05, + "loss": 0.8181, + "step": 1617 + }, + { + "epoch": 0.9438529969374362, + "grad_norm": 1.2585093975067139, + "learning_rate": 2.7208533653846158e-05, + "loss": 0.8861, + "step": 1618 + }, + { + "epoch": 0.9444363424238005, + "grad_norm": 1.1039953231811523, + "learning_rate": 2.7193509615384616e-05, + "loss": 0.9609, + "step": 1619 + }, + { + "epoch": 0.9450196879101648, + "grad_norm": 1.245587706565857, + "learning_rate": 2.7178485576923078e-05, + "loss": 0.8698, + "step": 1620 + }, + { + "epoch": 0.9456030333965291, + "grad_norm": 1.325048565864563, + "learning_rate": 2.7163461538461536e-05, + "loss": 0.8585, + "step": 1621 + }, + { + "epoch": 0.9461863788828934, + "grad_norm": 1.3289551734924316, + "learning_rate": 2.7148437500000002e-05, + "loss": 1.089, + "step": 1622 + }, + { + "epoch": 0.9467697243692577, + "grad_norm": 1.2364208698272705, + "learning_rate": 2.7133413461538464e-05, + "loss": 1.0367, + "step": 1623 + }, + { + "epoch": 0.947353069855622, + "grad_norm": 1.2870745658874512, + "learning_rate": 2.7118389423076922e-05, + "loss": 1.1767, + "step": 1624 + }, + { + "epoch": 0.9479364153419862, + "grad_norm": 1.1112834215164185, + "learning_rate": 2.7103365384615387e-05, + "loss": 0.9196, + "step": 1625 + }, + { + "epoch": 0.9485197608283505, + "grad_norm": 1.0553075075149536, + "learning_rate": 2.7088341346153846e-05, + "loss": 1.01, + "step": 1626 + }, + { + "epoch": 0.9491031063147148, + "grad_norm": 1.2069644927978516, + "learning_rate": 2.7073317307692308e-05, + "loss": 1.0086, + "step": 1627 + }, + { + "epoch": 0.9496864518010791, + "grad_norm": 1.0190929174423218, + "learning_rate": 2.7058293269230773e-05, + "loss": 0.9053, + "step": 1628 + }, + { + "epoch": 0.9502697972874435, + "grad_norm": 1.1159418821334839, + "learning_rate": 2.704326923076923e-05, + "loss": 0.9234, + "step": 1629 + }, + { + "epoch": 0.9508531427738078, + "grad_norm": 1.2140814065933228, + "learning_rate": 2.7028245192307693e-05, + "loss": 0.8535, + "step": 1630 + }, + { + "epoch": 0.9514364882601721, + "grad_norm": 1.3430871963500977, + "learning_rate": 2.701322115384616e-05, + "loss": 1.1157, + "step": 1631 + }, + { + "epoch": 0.9520198337465364, + "grad_norm": 1.2624151706695557, + "learning_rate": 2.6998197115384617e-05, + "loss": 0.9076, + "step": 1632 + }, + { + "epoch": 0.9526031792329007, + "grad_norm": 1.325024127960205, + "learning_rate": 2.698317307692308e-05, + "loss": 0.9707, + "step": 1633 + }, + { + "epoch": 0.953186524719265, + "grad_norm": 1.1497493982315063, + "learning_rate": 2.6968149038461538e-05, + "loss": 0.9365, + "step": 1634 + }, + { + "epoch": 0.9537698702056293, + "grad_norm": 1.139591097831726, + "learning_rate": 2.6953125000000003e-05, + "loss": 1.0114, + "step": 1635 + }, + { + "epoch": 0.9543532156919936, + "grad_norm": 1.202013373374939, + "learning_rate": 2.6938100961538465e-05, + "loss": 0.8738, + "step": 1636 + }, + { + "epoch": 0.9549365611783579, + "grad_norm": 1.1490192413330078, + "learning_rate": 2.6923076923076923e-05, + "loss": 0.9644, + "step": 1637 + }, + { + "epoch": 0.9555199066647222, + "grad_norm": 1.1956679821014404, + "learning_rate": 2.6908052884615385e-05, + "loss": 0.8404, + "step": 1638 + }, + { + "epoch": 0.9561032521510865, + "grad_norm": 1.016502857208252, + "learning_rate": 2.6893028846153844e-05, + "loss": 0.9757, + "step": 1639 + }, + { + "epoch": 0.9566865976374508, + "grad_norm": 1.1245112419128418, + "learning_rate": 2.687800480769231e-05, + "loss": 0.8616, + "step": 1640 + }, + { + "epoch": 0.9572699431238151, + "grad_norm": 1.326154351234436, + "learning_rate": 2.686298076923077e-05, + "loss": 0.8661, + "step": 1641 + }, + { + "epoch": 0.9578532886101794, + "grad_norm": 1.2109782695770264, + "learning_rate": 2.684795673076923e-05, + "loss": 0.8722, + "step": 1642 + }, + { + "epoch": 0.9584366340965437, + "grad_norm": 1.2979744672775269, + "learning_rate": 2.6832932692307695e-05, + "loss": 0.9699, + "step": 1643 + }, + { + "epoch": 0.959019979582908, + "grad_norm": 1.4275164604187012, + "learning_rate": 2.6817908653846157e-05, + "loss": 1.3414, + "step": 1644 + }, + { + "epoch": 0.9596033250692723, + "grad_norm": 2.1155264377593994, + "learning_rate": 2.6802884615384615e-05, + "loss": 1.061, + "step": 1645 + }, + { + "epoch": 0.9601866705556366, + "grad_norm": 1.1777634620666504, + "learning_rate": 2.678786057692308e-05, + "loss": 0.8194, + "step": 1646 + }, + { + "epoch": 0.9607700160420009, + "grad_norm": 1.3087421655654907, + "learning_rate": 2.677283653846154e-05, + "loss": 1.084, + "step": 1647 + }, + { + "epoch": 0.9613533615283651, + "grad_norm": 1.2843700647354126, + "learning_rate": 2.67578125e-05, + "loss": 0.9156, + "step": 1648 + }, + { + "epoch": 0.9619367070147294, + "grad_norm": 1.2804768085479736, + "learning_rate": 2.6742788461538466e-05, + "loss": 1.1256, + "step": 1649 + }, + { + "epoch": 0.9625200525010937, + "grad_norm": 1.1783385276794434, + "learning_rate": 2.6727764423076925e-05, + "loss": 0.7652, + "step": 1650 + }, + { + "epoch": 0.963103397987458, + "grad_norm": 1.0474493503570557, + "learning_rate": 2.6712740384615386e-05, + "loss": 1.1225, + "step": 1651 + }, + { + "epoch": 0.9636867434738223, + "grad_norm": 1.003244161605835, + "learning_rate": 2.6697716346153845e-05, + "loss": 0.9582, + "step": 1652 + }, + { + "epoch": 0.9642700889601866, + "grad_norm": 1.3323856592178345, + "learning_rate": 2.668269230769231e-05, + "loss": 0.9056, + "step": 1653 + }, + { + "epoch": 0.9648534344465509, + "grad_norm": 0.8693028092384338, + "learning_rate": 2.6667668269230772e-05, + "loss": 1.2649, + "step": 1654 + }, + { + "epoch": 0.9654367799329153, + "grad_norm": 1.2013863325119019, + "learning_rate": 2.665264423076923e-05, + "loss": 1.0149, + "step": 1655 + }, + { + "epoch": 0.9660201254192796, + "grad_norm": 1.061509132385254, + "learning_rate": 2.6637620192307692e-05, + "loss": 0.9357, + "step": 1656 + }, + { + "epoch": 0.9666034709056439, + "grad_norm": 1.086358904838562, + "learning_rate": 2.6622596153846158e-05, + "loss": 0.7849, + "step": 1657 + }, + { + "epoch": 0.9671868163920082, + "grad_norm": 1.2358739376068115, + "learning_rate": 2.6607572115384616e-05, + "loss": 1.0885, + "step": 1658 + }, + { + "epoch": 0.9677701618783725, + "grad_norm": 1.0823692083358765, + "learning_rate": 2.6592548076923078e-05, + "loss": 0.8326, + "step": 1659 + }, + { + "epoch": 0.9683535073647368, + "grad_norm": 2.211810827255249, + "learning_rate": 2.6577524038461537e-05, + "loss": 1.0041, + "step": 1660 + }, + { + "epoch": 0.9689368528511011, + "grad_norm": 1.0642420053482056, + "learning_rate": 2.6562500000000002e-05, + "loss": 1.0824, + "step": 1661 + }, + { + "epoch": 0.9695201983374654, + "grad_norm": 1.0718188285827637, + "learning_rate": 2.6547475961538464e-05, + "loss": 1.024, + "step": 1662 + }, + { + "epoch": 0.9701035438238297, + "grad_norm": 0.972342848777771, + "learning_rate": 2.6532451923076922e-05, + "loss": 1.0009, + "step": 1663 + }, + { + "epoch": 0.970686889310194, + "grad_norm": 1.5624159574508667, + "learning_rate": 2.6517427884615388e-05, + "loss": 0.9796, + "step": 1664 + }, + { + "epoch": 0.9712702347965583, + "grad_norm": 1.251407504081726, + "learning_rate": 2.6502403846153846e-05, + "loss": 1.0386, + "step": 1665 + }, + { + "epoch": 0.9718535802829226, + "grad_norm": 1.3637851476669312, + "learning_rate": 2.6487379807692308e-05, + "loss": 1.0674, + "step": 1666 + }, + { + "epoch": 0.9724369257692869, + "grad_norm": 1.1037849187850952, + "learning_rate": 2.6472355769230773e-05, + "loss": 0.8173, + "step": 1667 + }, + { + "epoch": 0.9730202712556512, + "grad_norm": 1.0050426721572876, + "learning_rate": 2.6457331730769232e-05, + "loss": 0.9875, + "step": 1668 + }, + { + "epoch": 0.9736036167420155, + "grad_norm": 1.2693151235580444, + "learning_rate": 2.6442307692307694e-05, + "loss": 1.0777, + "step": 1669 + }, + { + "epoch": 0.9741869622283797, + "grad_norm": 1.139029622077942, + "learning_rate": 2.642728365384616e-05, + "loss": 1.1184, + "step": 1670 + }, + { + "epoch": 0.974770307714744, + "grad_norm": 1.156640887260437, + "learning_rate": 2.6412259615384617e-05, + "loss": 1.1217, + "step": 1671 + }, + { + "epoch": 0.9753536532011083, + "grad_norm": 1.2772996425628662, + "learning_rate": 2.639723557692308e-05, + "loss": 0.8868, + "step": 1672 + }, + { + "epoch": 0.9759369986874726, + "grad_norm": 1.1612448692321777, + "learning_rate": 2.6382211538461538e-05, + "loss": 0.912, + "step": 1673 + }, + { + "epoch": 0.9765203441738369, + "grad_norm": 1.0551241636276245, + "learning_rate": 2.63671875e-05, + "loss": 0.9191, + "step": 1674 + }, + { + "epoch": 0.9771036896602012, + "grad_norm": 1.092178225517273, + "learning_rate": 2.6352163461538465e-05, + "loss": 0.8177, + "step": 1675 + }, + { + "epoch": 0.9776870351465655, + "grad_norm": 1.261635422706604, + "learning_rate": 2.6337139423076923e-05, + "loss": 0.8914, + "step": 1676 + }, + { + "epoch": 0.9782703806329298, + "grad_norm": 1.3369121551513672, + "learning_rate": 2.6322115384615385e-05, + "loss": 0.7757, + "step": 1677 + }, + { + "epoch": 0.9788537261192941, + "grad_norm": 1.1056147813796997, + "learning_rate": 2.6307091346153844e-05, + "loss": 0.8988, + "step": 1678 + }, + { + "epoch": 0.9794370716056584, + "grad_norm": 1.2153946161270142, + "learning_rate": 2.629206730769231e-05, + "loss": 1.0653, + "step": 1679 + }, + { + "epoch": 0.9800204170920227, + "grad_norm": 1.085607886314392, + "learning_rate": 2.627704326923077e-05, + "loss": 1.0798, + "step": 1680 + }, + { + "epoch": 0.980603762578387, + "grad_norm": 1.152902603149414, + "learning_rate": 2.626201923076923e-05, + "loss": 1.193, + "step": 1681 + }, + { + "epoch": 0.9811871080647514, + "grad_norm": 1.100484848022461, + "learning_rate": 2.6246995192307695e-05, + "loss": 0.8515, + "step": 1682 + }, + { + "epoch": 0.9817704535511157, + "grad_norm": 0.9451040029525757, + "learning_rate": 2.6231971153846157e-05, + "loss": 1.015, + "step": 1683 + }, + { + "epoch": 0.98235379903748, + "grad_norm": 1.5376704931259155, + "learning_rate": 2.6216947115384615e-05, + "loss": 1.1115, + "step": 1684 + }, + { + "epoch": 0.9829371445238443, + "grad_norm": 1.2775416374206543, + "learning_rate": 2.620192307692308e-05, + "loss": 0.9463, + "step": 1685 + }, + { + "epoch": 0.9835204900102086, + "grad_norm": 1.0612167119979858, + "learning_rate": 2.618689903846154e-05, + "loss": 1.3035, + "step": 1686 + }, + { + "epoch": 0.9841038354965729, + "grad_norm": 1.1903184652328491, + "learning_rate": 2.6171875e-05, + "loss": 1.0081, + "step": 1687 + }, + { + "epoch": 0.9846871809829372, + "grad_norm": 1.1516382694244385, + "learning_rate": 2.6156850961538466e-05, + "loss": 0.8602, + "step": 1688 + }, + { + "epoch": 0.9852705264693015, + "grad_norm": 1.1271847486495972, + "learning_rate": 2.6141826923076925e-05, + "loss": 1.109, + "step": 1689 + }, + { + "epoch": 0.9858538719556658, + "grad_norm": 1.0397697687149048, + "learning_rate": 2.6126802884615387e-05, + "loss": 1.2407, + "step": 1690 + }, + { + "epoch": 0.9864372174420301, + "grad_norm": 1.1553921699523926, + "learning_rate": 2.6111778846153845e-05, + "loss": 1.0275, + "step": 1691 + }, + { + "epoch": 0.9870205629283944, + "grad_norm": 1.3742332458496094, + "learning_rate": 2.6096754807692307e-05, + "loss": 1.0156, + "step": 1692 + }, + { + "epoch": 0.9876039084147586, + "grad_norm": 1.3807793855667114, + "learning_rate": 2.6081730769230772e-05, + "loss": 1.0403, + "step": 1693 + }, + { + "epoch": 0.9881872539011229, + "grad_norm": 2.3124213218688965, + "learning_rate": 2.606670673076923e-05, + "loss": 1.065, + "step": 1694 + }, + { + "epoch": 0.9887705993874872, + "grad_norm": 1.1907039880752563, + "learning_rate": 2.6051682692307693e-05, + "loss": 0.7911, + "step": 1695 + }, + { + "epoch": 0.9893539448738515, + "grad_norm": 1.244017481803894, + "learning_rate": 2.6036658653846158e-05, + "loss": 1.0102, + "step": 1696 + }, + { + "epoch": 0.9899372903602158, + "grad_norm": 1.2556577920913696, + "learning_rate": 2.6021634615384616e-05, + "loss": 0.949, + "step": 1697 + }, + { + "epoch": 0.9905206358465801, + "grad_norm": 1.2664211988449097, + "learning_rate": 2.6006610576923078e-05, + "loss": 0.9661, + "step": 1698 + }, + { + "epoch": 0.9911039813329444, + "grad_norm": 1.1367192268371582, + "learning_rate": 2.5991586538461537e-05, + "loss": 1.094, + "step": 1699 + }, + { + "epoch": 0.9916873268193087, + "grad_norm": 1.4298202991485596, + "learning_rate": 2.5976562500000002e-05, + "loss": 1.2788, + "step": 1700 + }, + { + "epoch": 0.992270672305673, + "grad_norm": 1.2524197101593018, + "learning_rate": 2.5961538461538464e-05, + "loss": 0.7913, + "step": 1701 + }, + { + "epoch": 0.9928540177920373, + "grad_norm": 1.3437864780426025, + "learning_rate": 2.5946514423076922e-05, + "loss": 1.1432, + "step": 1702 + }, + { + "epoch": 0.9934373632784016, + "grad_norm": 1.105908751487732, + "learning_rate": 2.5931490384615388e-05, + "loss": 0.8405, + "step": 1703 + }, + { + "epoch": 0.9940207087647659, + "grad_norm": 0.9571016430854797, + "learning_rate": 2.5916466346153846e-05, + "loss": 0.7865, + "step": 1704 + }, + { + "epoch": 0.9946040542511302, + "grad_norm": 1.0247722864151, + "learning_rate": 2.5901442307692308e-05, + "loss": 0.8726, + "step": 1705 + }, + { + "epoch": 0.9951873997374945, + "grad_norm": 1.160334825515747, + "learning_rate": 2.5886418269230773e-05, + "loss": 0.8283, + "step": 1706 + }, + { + "epoch": 0.9957707452238588, + "grad_norm": 1.4978736639022827, + "learning_rate": 2.5871394230769232e-05, + "loss": 1.101, + "step": 1707 + }, + { + "epoch": 0.9963540907102232, + "grad_norm": 1.0514538288116455, + "learning_rate": 2.5856370192307694e-05, + "loss": 0.9573, + "step": 1708 + }, + { + "epoch": 0.9969374361965875, + "grad_norm": 1.091744303703308, + "learning_rate": 2.584134615384616e-05, + "loss": 1.0072, + "step": 1709 + }, + { + "epoch": 0.9975207816829518, + "grad_norm": 1.261958122253418, + "learning_rate": 2.5826322115384614e-05, + "loss": 0.8659, + "step": 1710 + }, + { + "epoch": 0.9981041271693161, + "grad_norm": 1.4826487302780151, + "learning_rate": 2.581129807692308e-05, + "loss": 0.9641, + "step": 1711 + }, + { + "epoch": 0.9986874726556804, + "grad_norm": 1.4628450870513916, + "learning_rate": 2.5796274038461538e-05, + "loss": 0.7998, + "step": 1712 + }, + { + "epoch": 0.9992708181420447, + "grad_norm": 1.1515614986419678, + "learning_rate": 2.578125e-05, + "loss": 1.1375, + "step": 1713 + }, + { + "epoch": 0.999854163628409, + "grad_norm": 1.118147373199463, + "learning_rate": 2.5766225961538465e-05, + "loss": 0.7665, + "step": 1714 + }, + { + "epoch": 1.0004375091147732, + "grad_norm": 0.9037253260612488, + "learning_rate": 2.5751201923076924e-05, + "loss": 0.7602, + "step": 1715 + }, + { + "epoch": 1.0010208546011374, + "grad_norm": 1.0712151527404785, + "learning_rate": 2.5736177884615386e-05, + "loss": 0.7995, + "step": 1716 + }, + { + "epoch": 1.0016042000875018, + "grad_norm": 0.8427433371543884, + "learning_rate": 2.5721153846153844e-05, + "loss": 0.9496, + "step": 1717 + }, + { + "epoch": 1.002187545573866, + "grad_norm": 0.9450975060462952, + "learning_rate": 2.570612980769231e-05, + "loss": 0.8904, + "step": 1718 + }, + { + "epoch": 1.0027708910602304, + "grad_norm": 1.037550926208496, + "learning_rate": 2.569110576923077e-05, + "loss": 0.8008, + "step": 1719 + }, + { + "epoch": 1.0033542365465946, + "grad_norm": 0.8638540506362915, + "learning_rate": 2.567608173076923e-05, + "loss": 0.639, + "step": 1720 + }, + { + "epoch": 1.003937582032959, + "grad_norm": 1.1321598291397095, + "learning_rate": 2.5661057692307695e-05, + "loss": 1.0667, + "step": 1721 + }, + { + "epoch": 1.0045209275193234, + "grad_norm": 1.1372548341751099, + "learning_rate": 2.5646033653846157e-05, + "loss": 0.8085, + "step": 1722 + }, + { + "epoch": 1.0051042730056876, + "grad_norm": 1.0561586618423462, + "learning_rate": 2.5631009615384615e-05, + "loss": 0.8129, + "step": 1723 + }, + { + "epoch": 1.005687618492052, + "grad_norm": 1.0580649375915527, + "learning_rate": 2.561598557692308e-05, + "loss": 0.6149, + "step": 1724 + }, + { + "epoch": 1.0062709639784162, + "grad_norm": 1.236765742301941, + "learning_rate": 2.560096153846154e-05, + "loss": 0.7003, + "step": 1725 + }, + { + "epoch": 1.0068543094647806, + "grad_norm": 1.0639208555221558, + "learning_rate": 2.55859375e-05, + "loss": 0.8107, + "step": 1726 + }, + { + "epoch": 1.0074376549511448, + "grad_norm": 1.3609492778778076, + "learning_rate": 2.5570913461538466e-05, + "loss": 0.9411, + "step": 1727 + }, + { + "epoch": 1.0080210004375092, + "grad_norm": 1.1644700765609741, + "learning_rate": 2.555588942307692e-05, + "loss": 0.701, + "step": 1728 + }, + { + "epoch": 1.0086043459238734, + "grad_norm": 1.2108243703842163, + "learning_rate": 2.5540865384615387e-05, + "loss": 0.9003, + "step": 1729 + }, + { + "epoch": 1.0091876914102378, + "grad_norm": 1.1662452220916748, + "learning_rate": 2.5525841346153845e-05, + "loss": 0.8709, + "step": 1730 + }, + { + "epoch": 1.009771036896602, + "grad_norm": 1.0945322513580322, + "learning_rate": 2.5510817307692307e-05, + "loss": 1.0255, + "step": 1731 + }, + { + "epoch": 1.0103543823829664, + "grad_norm": 1.0760470628738403, + "learning_rate": 2.5495793269230772e-05, + "loss": 0.7592, + "step": 1732 + }, + { + "epoch": 1.0109377278693306, + "grad_norm": 1.3232825994491577, + "learning_rate": 2.548076923076923e-05, + "loss": 0.7684, + "step": 1733 + }, + { + "epoch": 1.011521073355695, + "grad_norm": 1.0295486450195312, + "learning_rate": 2.5465745192307693e-05, + "loss": 0.741, + "step": 1734 + }, + { + "epoch": 1.0121044188420592, + "grad_norm": 1.247344970703125, + "learning_rate": 2.5450721153846158e-05, + "loss": 0.7828, + "step": 1735 + }, + { + "epoch": 1.0126877643284236, + "grad_norm": 1.0752184391021729, + "learning_rate": 2.5435697115384617e-05, + "loss": 0.9003, + "step": 1736 + }, + { + "epoch": 1.0132711098147877, + "grad_norm": 1.0817466974258423, + "learning_rate": 2.542067307692308e-05, + "loss": 0.8295, + "step": 1737 + }, + { + "epoch": 1.0138544553011521, + "grad_norm": 1.0235193967819214, + "learning_rate": 2.5405649038461537e-05, + "loss": 1.0862, + "step": 1738 + }, + { + "epoch": 1.0144378007875163, + "grad_norm": 1.1558364629745483, + "learning_rate": 2.5390625000000002e-05, + "loss": 0.7409, + "step": 1739 + }, + { + "epoch": 1.0150211462738807, + "grad_norm": 1.1982396841049194, + "learning_rate": 2.5375600961538464e-05, + "loss": 0.8249, + "step": 1740 + }, + { + "epoch": 1.015604491760245, + "grad_norm": 1.074081540107727, + "learning_rate": 2.5360576923076923e-05, + "loss": 0.7878, + "step": 1741 + }, + { + "epoch": 1.0161878372466093, + "grad_norm": 1.4815016984939575, + "learning_rate": 2.5345552884615388e-05, + "loss": 0.8043, + "step": 1742 + }, + { + "epoch": 1.0167711827329735, + "grad_norm": 1.1086952686309814, + "learning_rate": 2.5330528846153846e-05, + "loss": 0.7201, + "step": 1743 + }, + { + "epoch": 1.017354528219338, + "grad_norm": 1.3218934535980225, + "learning_rate": 2.5315504807692308e-05, + "loss": 0.6374, + "step": 1744 + }, + { + "epoch": 1.017937873705702, + "grad_norm": 1.0537618398666382, + "learning_rate": 2.5300480769230774e-05, + "loss": 0.7224, + "step": 1745 + }, + { + "epoch": 1.0185212191920665, + "grad_norm": 0.9964005947113037, + "learning_rate": 2.528545673076923e-05, + "loss": 0.7204, + "step": 1746 + }, + { + "epoch": 1.0191045646784307, + "grad_norm": 1.1964815855026245, + "learning_rate": 2.5270432692307694e-05, + "loss": 0.9753, + "step": 1747 + }, + { + "epoch": 1.019687910164795, + "grad_norm": 1.0601825714111328, + "learning_rate": 2.525540865384616e-05, + "loss": 0.9208, + "step": 1748 + }, + { + "epoch": 1.0202712556511595, + "grad_norm": 0.9419945478439331, + "learning_rate": 2.5240384615384614e-05, + "loss": 0.8482, + "step": 1749 + }, + { + "epoch": 1.0208546011375237, + "grad_norm": 1.0061907768249512, + "learning_rate": 2.522536057692308e-05, + "loss": 0.8908, + "step": 1750 + }, + { + "epoch": 1.021437946623888, + "grad_norm": 1.1203185319900513, + "learning_rate": 2.5210336538461538e-05, + "loss": 0.725, + "step": 1751 + }, + { + "epoch": 1.0220212921102523, + "grad_norm": 1.2346861362457275, + "learning_rate": 2.51953125e-05, + "loss": 0.8266, + "step": 1752 + }, + { + "epoch": 1.0226046375966167, + "grad_norm": 1.17784583568573, + "learning_rate": 2.5180288461538465e-05, + "loss": 0.8013, + "step": 1753 + }, + { + "epoch": 1.0231879830829809, + "grad_norm": 1.0398868322372437, + "learning_rate": 2.5165264423076924e-05, + "loss": 0.5803, + "step": 1754 + }, + { + "epoch": 1.0237713285693453, + "grad_norm": 1.372266173362732, + "learning_rate": 2.5150240384615386e-05, + "loss": 0.6542, + "step": 1755 + }, + { + "epoch": 1.0243546740557095, + "grad_norm": 1.2637945413589478, + "learning_rate": 2.5135216346153844e-05, + "loss": 1.0724, + "step": 1756 + }, + { + "epoch": 1.0249380195420739, + "grad_norm": 1.0931401252746582, + "learning_rate": 2.512019230769231e-05, + "loss": 0.734, + "step": 1757 + }, + { + "epoch": 1.025521365028438, + "grad_norm": 1.0705105066299438, + "learning_rate": 2.510516826923077e-05, + "loss": 0.8975, + "step": 1758 + }, + { + "epoch": 1.0261047105148025, + "grad_norm": 1.0411278009414673, + "learning_rate": 2.509014423076923e-05, + "loss": 0.829, + "step": 1759 + }, + { + "epoch": 1.0266880560011666, + "grad_norm": 1.1690353155136108, + "learning_rate": 2.5075120192307695e-05, + "loss": 0.8418, + "step": 1760 + }, + { + "epoch": 1.027271401487531, + "grad_norm": 1.1422929763793945, + "learning_rate": 2.5060096153846157e-05, + "loss": 1.003, + "step": 1761 + }, + { + "epoch": 1.0278547469738952, + "grad_norm": 1.2721543312072754, + "learning_rate": 2.5045072115384616e-05, + "loss": 0.8079, + "step": 1762 + }, + { + "epoch": 1.0284380924602596, + "grad_norm": 1.3369412422180176, + "learning_rate": 2.503004807692308e-05, + "loss": 0.9004, + "step": 1763 + }, + { + "epoch": 1.0290214379466238, + "grad_norm": 1.6825288534164429, + "learning_rate": 2.5015024038461536e-05, + "loss": 0.9225, + "step": 1764 + }, + { + "epoch": 1.0296047834329882, + "grad_norm": 1.25969660282135, + "learning_rate": 2.5e-05, + "loss": 0.7035, + "step": 1765 + }, + { + "epoch": 1.0301881289193524, + "grad_norm": 1.099972128868103, + "learning_rate": 2.4984975961538463e-05, + "loss": 0.7718, + "step": 1766 + }, + { + "epoch": 1.0307714744057168, + "grad_norm": 1.1386865377426147, + "learning_rate": 2.496995192307692e-05, + "loss": 0.7239, + "step": 1767 + }, + { + "epoch": 1.031354819892081, + "grad_norm": 1.2388840913772583, + "learning_rate": 2.4954927884615387e-05, + "loss": 0.7997, + "step": 1768 + }, + { + "epoch": 1.0319381653784454, + "grad_norm": 1.159300446510315, + "learning_rate": 2.493990384615385e-05, + "loss": 1.0244, + "step": 1769 + }, + { + "epoch": 1.0325215108648096, + "grad_norm": 1.186693549156189, + "learning_rate": 2.4924879807692307e-05, + "loss": 0.7362, + "step": 1770 + }, + { + "epoch": 1.033104856351174, + "grad_norm": 1.0532599687576294, + "learning_rate": 2.490985576923077e-05, + "loss": 0.6688, + "step": 1771 + }, + { + "epoch": 1.0336882018375382, + "grad_norm": 1.267077088356018, + "learning_rate": 2.4894831730769234e-05, + "loss": 0.7482, + "step": 1772 + }, + { + "epoch": 1.0342715473239026, + "grad_norm": 1.1788893938064575, + "learning_rate": 2.4879807692307693e-05, + "loss": 0.6433, + "step": 1773 + }, + { + "epoch": 1.0348548928102668, + "grad_norm": 1.3695346117019653, + "learning_rate": 2.4864783653846155e-05, + "loss": 1.0139, + "step": 1774 + }, + { + "epoch": 1.0354382382966312, + "grad_norm": 1.364996075630188, + "learning_rate": 2.4849759615384617e-05, + "loss": 0.9402, + "step": 1775 + }, + { + "epoch": 1.0360215837829956, + "grad_norm": 1.1488473415374756, + "learning_rate": 2.483473557692308e-05, + "loss": 0.6775, + "step": 1776 + }, + { + "epoch": 1.0366049292693598, + "grad_norm": 1.1622604131698608, + "learning_rate": 2.481971153846154e-05, + "loss": 0.9029, + "step": 1777 + }, + { + "epoch": 1.0371882747557242, + "grad_norm": 1.3263182640075684, + "learning_rate": 2.4804687500000002e-05, + "loss": 0.7835, + "step": 1778 + }, + { + "epoch": 1.0377716202420884, + "grad_norm": 1.031337022781372, + "learning_rate": 2.478966346153846e-05, + "loss": 0.7646, + "step": 1779 + }, + { + "epoch": 1.0383549657284528, + "grad_norm": 1.2709075212478638, + "learning_rate": 2.4774639423076923e-05, + "loss": 0.7746, + "step": 1780 + }, + { + "epoch": 1.038938311214817, + "grad_norm": 1.0792698860168457, + "learning_rate": 2.4759615384615388e-05, + "loss": 0.4939, + "step": 1781 + }, + { + "epoch": 1.0395216567011814, + "grad_norm": 1.0642223358154297, + "learning_rate": 2.4744591346153847e-05, + "loss": 0.7588, + "step": 1782 + }, + { + "epoch": 1.0401050021875455, + "grad_norm": 1.2650682926177979, + "learning_rate": 2.472956730769231e-05, + "loss": 0.866, + "step": 1783 + }, + { + "epoch": 1.04068834767391, + "grad_norm": 1.2211084365844727, + "learning_rate": 2.471454326923077e-05, + "loss": 0.8201, + "step": 1784 + }, + { + "epoch": 1.0412716931602741, + "grad_norm": 1.2228623628616333, + "learning_rate": 2.4699519230769232e-05, + "loss": 0.625, + "step": 1785 + }, + { + "epoch": 1.0418550386466385, + "grad_norm": 1.2318840026855469, + "learning_rate": 2.4684495192307694e-05, + "loss": 0.796, + "step": 1786 + }, + { + "epoch": 1.0424383841330027, + "grad_norm": 1.1144096851348877, + "learning_rate": 2.4669471153846156e-05, + "loss": 0.9259, + "step": 1787 + }, + { + "epoch": 1.0430217296193671, + "grad_norm": 1.1762030124664307, + "learning_rate": 2.4654447115384615e-05, + "loss": 0.8342, + "step": 1788 + }, + { + "epoch": 1.0436050751057313, + "grad_norm": 1.142924427986145, + "learning_rate": 2.463942307692308e-05, + "loss": 0.8825, + "step": 1789 + }, + { + "epoch": 1.0441884205920957, + "grad_norm": 1.3172521591186523, + "learning_rate": 2.462439903846154e-05, + "loss": 0.8078, + "step": 1790 + }, + { + "epoch": 1.04477176607846, + "grad_norm": 1.0718015432357788, + "learning_rate": 2.4609375e-05, + "loss": 0.8854, + "step": 1791 + }, + { + "epoch": 1.0453551115648243, + "grad_norm": 1.0188226699829102, + "learning_rate": 2.4594350961538462e-05, + "loss": 1.0314, + "step": 1792 + }, + { + "epoch": 1.0459384570511885, + "grad_norm": 1.0908758640289307, + "learning_rate": 2.4579326923076924e-05, + "loss": 0.6848, + "step": 1793 + }, + { + "epoch": 1.046521802537553, + "grad_norm": 1.072546362876892, + "learning_rate": 2.4564302884615386e-05, + "loss": 0.657, + "step": 1794 + }, + { + "epoch": 1.047105148023917, + "grad_norm": 1.0566478967666626, + "learning_rate": 2.4549278846153848e-05, + "loss": 0.9628, + "step": 1795 + }, + { + "epoch": 1.0476884935102815, + "grad_norm": 1.16886305809021, + "learning_rate": 2.453425480769231e-05, + "loss": 0.7434, + "step": 1796 + }, + { + "epoch": 1.0482718389966457, + "grad_norm": 1.1381522417068481, + "learning_rate": 2.4519230769230768e-05, + "loss": 0.559, + "step": 1797 + }, + { + "epoch": 1.04885518448301, + "grad_norm": 1.2783828973770142, + "learning_rate": 2.4504206730769233e-05, + "loss": 0.9, + "step": 1798 + }, + { + "epoch": 1.0494385299693743, + "grad_norm": 1.0993632078170776, + "learning_rate": 2.4489182692307695e-05, + "loss": 0.7602, + "step": 1799 + }, + { + "epoch": 1.0500218754557387, + "grad_norm": 0.9995693564414978, + "learning_rate": 2.4474158653846154e-05, + "loss": 0.8363, + "step": 1800 + }, + { + "epoch": 1.0500218754557387, + "eval_loss_squad": 0.8367310870531947, + "eval_perplexity": 8.296506422781007, + "eval_perplexity_reconstruct": 1.9039826490870362, + "step": 1800 + }, + { + "epoch": 1.0506052209421028, + "grad_norm": 1.3584226369857788, + "learning_rate": 2.4459134615384616e-05, + "loss": 0.7513, + "step": 1801 + }, + { + "epoch": 1.0511885664284673, + "grad_norm": 1.1026893854141235, + "learning_rate": 2.444411057692308e-05, + "loss": 0.8734, + "step": 1802 + }, + { + "epoch": 1.0517719119148317, + "grad_norm": 0.9750449061393738, + "learning_rate": 2.442908653846154e-05, + "loss": 0.7949, + "step": 1803 + }, + { + "epoch": 1.0523552574011958, + "grad_norm": 1.3138856887817383, + "learning_rate": 2.44140625e-05, + "loss": 0.7326, + "step": 1804 + }, + { + "epoch": 1.0529386028875602, + "grad_norm": 0.9921886920928955, + "learning_rate": 2.4399038461538463e-05, + "loss": 0.8876, + "step": 1805 + }, + { + "epoch": 1.0535219483739244, + "grad_norm": 1.2363988161087036, + "learning_rate": 2.4384014423076922e-05, + "loss": 0.8668, + "step": 1806 + }, + { + "epoch": 1.0541052938602888, + "grad_norm": 1.2714601755142212, + "learning_rate": 2.4368990384615387e-05, + "loss": 0.8749, + "step": 1807 + }, + { + "epoch": 1.054688639346653, + "grad_norm": 1.1158849000930786, + "learning_rate": 2.435396634615385e-05, + "loss": 1.065, + "step": 1808 + }, + { + "epoch": 1.0552719848330174, + "grad_norm": 1.125258207321167, + "learning_rate": 2.4338942307692307e-05, + "loss": 0.8081, + "step": 1809 + }, + { + "epoch": 1.0558553303193816, + "grad_norm": 1.039055585861206, + "learning_rate": 2.432391826923077e-05, + "loss": 0.8932, + "step": 1810 + }, + { + "epoch": 1.056438675805746, + "grad_norm": 1.0936514139175415, + "learning_rate": 2.4308894230769235e-05, + "loss": 0.9564, + "step": 1811 + }, + { + "epoch": 1.0570220212921102, + "grad_norm": 1.334757924079895, + "learning_rate": 2.4293870192307693e-05, + "loss": 0.8967, + "step": 1812 + }, + { + "epoch": 1.0576053667784746, + "grad_norm": 1.4477955102920532, + "learning_rate": 2.4278846153846155e-05, + "loss": 0.8336, + "step": 1813 + }, + { + "epoch": 1.0581887122648388, + "grad_norm": 1.319165825843811, + "learning_rate": 2.4263822115384617e-05, + "loss": 0.7005, + "step": 1814 + }, + { + "epoch": 1.0587720577512032, + "grad_norm": 1.1051411628723145, + "learning_rate": 2.424879807692308e-05, + "loss": 0.7516, + "step": 1815 + }, + { + "epoch": 1.0593554032375674, + "grad_norm": 1.2956829071044922, + "learning_rate": 2.423377403846154e-05, + "loss": 0.7716, + "step": 1816 + }, + { + "epoch": 1.0599387487239318, + "grad_norm": 1.199265956878662, + "learning_rate": 2.4218750000000003e-05, + "loss": 0.9904, + "step": 1817 + }, + { + "epoch": 1.060522094210296, + "grad_norm": 1.0964492559432983, + "learning_rate": 2.420372596153846e-05, + "loss": 0.7208, + "step": 1818 + }, + { + "epoch": 1.0611054396966604, + "grad_norm": 1.264791488647461, + "learning_rate": 2.4188701923076923e-05, + "loss": 0.7618, + "step": 1819 + }, + { + "epoch": 1.0616887851830246, + "grad_norm": 1.1390454769134521, + "learning_rate": 2.4173677884615388e-05, + "loss": 0.7634, + "step": 1820 + }, + { + "epoch": 1.062272130669389, + "grad_norm": 1.1044617891311646, + "learning_rate": 2.4158653846153847e-05, + "loss": 0.6295, + "step": 1821 + }, + { + "epoch": 1.0628554761557532, + "grad_norm": 1.4096314907073975, + "learning_rate": 2.414362980769231e-05, + "loss": 0.8327, + "step": 1822 + }, + { + "epoch": 1.0634388216421176, + "grad_norm": 1.2174628973007202, + "learning_rate": 2.412860576923077e-05, + "loss": 0.985, + "step": 1823 + }, + { + "epoch": 1.0640221671284817, + "grad_norm": 0.912973165512085, + "learning_rate": 2.4113581730769232e-05, + "loss": 1.0427, + "step": 1824 + }, + { + "epoch": 1.0646055126148462, + "grad_norm": 1.1908468008041382, + "learning_rate": 2.4098557692307694e-05, + "loss": 0.7616, + "step": 1825 + }, + { + "epoch": 1.0651888581012106, + "grad_norm": 1.260453462600708, + "learning_rate": 2.4083533653846156e-05, + "loss": 0.8113, + "step": 1826 + }, + { + "epoch": 1.0657722035875747, + "grad_norm": 1.2202224731445312, + "learning_rate": 2.4068509615384615e-05, + "loss": 0.7171, + "step": 1827 + }, + { + "epoch": 1.066355549073939, + "grad_norm": 0.8979166746139526, + "learning_rate": 2.405348557692308e-05, + "loss": 0.7942, + "step": 1828 + }, + { + "epoch": 1.0669388945603033, + "grad_norm": 1.1224358081817627, + "learning_rate": 2.4038461538461542e-05, + "loss": 0.9005, + "step": 1829 + }, + { + "epoch": 1.0675222400466677, + "grad_norm": 1.1409502029418945, + "learning_rate": 2.40234375e-05, + "loss": 0.7607, + "step": 1830 + }, + { + "epoch": 1.068105585533032, + "grad_norm": 1.1402530670166016, + "learning_rate": 2.4008413461538462e-05, + "loss": 1.0247, + "step": 1831 + }, + { + "epoch": 1.0686889310193963, + "grad_norm": 1.0991151332855225, + "learning_rate": 2.3993389423076924e-05, + "loss": 0.7754, + "step": 1832 + }, + { + "epoch": 1.0692722765057605, + "grad_norm": 1.229304552078247, + "learning_rate": 2.3978365384615386e-05, + "loss": 0.8271, + "step": 1833 + }, + { + "epoch": 1.069855621992125, + "grad_norm": 1.2119697332382202, + "learning_rate": 2.3963341346153848e-05, + "loss": 0.8626, + "step": 1834 + }, + { + "epoch": 1.070438967478489, + "grad_norm": 1.4418648481369019, + "learning_rate": 2.394831730769231e-05, + "loss": 0.8454, + "step": 1835 + }, + { + "epoch": 1.0710223129648535, + "grad_norm": 1.1836721897125244, + "learning_rate": 2.3933293269230768e-05, + "loss": 0.8298, + "step": 1836 + }, + { + "epoch": 1.0716056584512177, + "grad_norm": 1.2164535522460938, + "learning_rate": 2.3918269230769234e-05, + "loss": 0.8123, + "step": 1837 + }, + { + "epoch": 1.072189003937582, + "grad_norm": 1.3282859325408936, + "learning_rate": 2.3903245192307695e-05, + "loss": 1.004, + "step": 1838 + }, + { + "epoch": 1.0727723494239463, + "grad_norm": 1.1000369787216187, + "learning_rate": 2.3888221153846154e-05, + "loss": 0.7878, + "step": 1839 + }, + { + "epoch": 1.0733556949103107, + "grad_norm": 1.3156583309173584, + "learning_rate": 2.3873197115384616e-05, + "loss": 0.6691, + "step": 1840 + }, + { + "epoch": 1.0739390403966749, + "grad_norm": 1.264693021774292, + "learning_rate": 2.3858173076923078e-05, + "loss": 0.8582, + "step": 1841 + }, + { + "epoch": 1.0745223858830393, + "grad_norm": 1.1336017847061157, + "learning_rate": 2.384314903846154e-05, + "loss": 0.704, + "step": 1842 + }, + { + "epoch": 1.0751057313694035, + "grad_norm": 0.8796018362045288, + "learning_rate": 2.3828125e-05, + "loss": 0.7031, + "step": 1843 + }, + { + "epoch": 1.0756890768557679, + "grad_norm": 1.2418160438537598, + "learning_rate": 2.3813100961538463e-05, + "loss": 0.7327, + "step": 1844 + }, + { + "epoch": 1.076272422342132, + "grad_norm": 1.1050810813903809, + "learning_rate": 2.3798076923076922e-05, + "loss": 0.8006, + "step": 1845 + }, + { + "epoch": 1.0768557678284965, + "grad_norm": 1.2827439308166504, + "learning_rate": 2.3783052884615387e-05, + "loss": 0.777, + "step": 1846 + }, + { + "epoch": 1.0774391133148606, + "grad_norm": 1.3954315185546875, + "learning_rate": 2.376802884615385e-05, + "loss": 0.7289, + "step": 1847 + }, + { + "epoch": 1.078022458801225, + "grad_norm": 1.2399247884750366, + "learning_rate": 2.3753004807692308e-05, + "loss": 0.7638, + "step": 1848 + }, + { + "epoch": 1.0786058042875892, + "grad_norm": 1.351985216140747, + "learning_rate": 2.373798076923077e-05, + "loss": 0.7568, + "step": 1849 + }, + { + "epoch": 1.0791891497739536, + "grad_norm": 1.1881484985351562, + "learning_rate": 2.372295673076923e-05, + "loss": 0.8417, + "step": 1850 + }, + { + "epoch": 1.0797724952603178, + "grad_norm": 1.6460144519805908, + "learning_rate": 2.3707932692307693e-05, + "loss": 0.8467, + "step": 1851 + }, + { + "epoch": 1.0803558407466822, + "grad_norm": 1.1815186738967896, + "learning_rate": 2.3692908653846155e-05, + "loss": 0.7675, + "step": 1852 + }, + { + "epoch": 1.0809391862330466, + "grad_norm": 1.2811179161071777, + "learning_rate": 2.3677884615384617e-05, + "loss": 0.7745, + "step": 1853 + }, + { + "epoch": 1.0815225317194108, + "grad_norm": 1.1434037685394287, + "learning_rate": 2.366286057692308e-05, + "loss": 0.8752, + "step": 1854 + }, + { + "epoch": 1.0821058772057752, + "grad_norm": 1.2398650646209717, + "learning_rate": 2.364783653846154e-05, + "loss": 0.8398, + "step": 1855 + }, + { + "epoch": 1.0826892226921394, + "grad_norm": 1.2694295644760132, + "learning_rate": 2.3632812500000003e-05, + "loss": 0.8714, + "step": 1856 + }, + { + "epoch": 1.0832725681785038, + "grad_norm": 1.1594481468200684, + "learning_rate": 2.361778846153846e-05, + "loss": 0.6383, + "step": 1857 + }, + { + "epoch": 1.083855913664868, + "grad_norm": 1.2221319675445557, + "learning_rate": 2.3602764423076923e-05, + "loss": 0.9375, + "step": 1858 + }, + { + "epoch": 1.0844392591512324, + "grad_norm": 1.183512568473816, + "learning_rate": 2.3587740384615385e-05, + "loss": 0.8724, + "step": 1859 + }, + { + "epoch": 1.0850226046375966, + "grad_norm": 1.3200178146362305, + "learning_rate": 2.3572716346153847e-05, + "loss": 0.8969, + "step": 1860 + }, + { + "epoch": 1.085605950123961, + "grad_norm": 1.3450496196746826, + "learning_rate": 2.355769230769231e-05, + "loss": 0.822, + "step": 1861 + }, + { + "epoch": 1.0861892956103252, + "grad_norm": 1.2601107358932495, + "learning_rate": 2.354266826923077e-05, + "loss": 0.7901, + "step": 1862 + }, + { + "epoch": 1.0867726410966896, + "grad_norm": 1.089453101158142, + "learning_rate": 2.3527644230769233e-05, + "loss": 0.8132, + "step": 1863 + }, + { + "epoch": 1.0873559865830538, + "grad_norm": 0.8895475268363953, + "learning_rate": 2.3512620192307694e-05, + "loss": 0.6964, + "step": 1864 + }, + { + "epoch": 1.0879393320694182, + "grad_norm": 1.3046108484268188, + "learning_rate": 2.3497596153846156e-05, + "loss": 0.9721, + "step": 1865 + }, + { + "epoch": 1.0885226775557824, + "grad_norm": 1.102406620979309, + "learning_rate": 2.3482572115384615e-05, + "loss": 0.8803, + "step": 1866 + }, + { + "epoch": 1.0891060230421468, + "grad_norm": 1.1453616619110107, + "learning_rate": 2.346754807692308e-05, + "loss": 0.8508, + "step": 1867 + }, + { + "epoch": 1.089689368528511, + "grad_norm": 1.1119948625564575, + "learning_rate": 2.345252403846154e-05, + "loss": 0.6988, + "step": 1868 + }, + { + "epoch": 1.0902727140148754, + "grad_norm": 1.3853925466537476, + "learning_rate": 2.34375e-05, + "loss": 0.8331, + "step": 1869 + }, + { + "epoch": 1.0908560595012395, + "grad_norm": 1.315190315246582, + "learning_rate": 2.3422475961538462e-05, + "loss": 0.7391, + "step": 1870 + }, + { + "epoch": 1.091439404987604, + "grad_norm": 1.070307970046997, + "learning_rate": 2.3407451923076924e-05, + "loss": 0.8388, + "step": 1871 + }, + { + "epoch": 1.0920227504739681, + "grad_norm": 0.9866098165512085, + "learning_rate": 2.3392427884615386e-05, + "loss": 1.0789, + "step": 1872 + }, + { + "epoch": 1.0926060959603325, + "grad_norm": 1.2839645147323608, + "learning_rate": 2.3377403846153848e-05, + "loss": 0.7442, + "step": 1873 + }, + { + "epoch": 1.0931894414466967, + "grad_norm": 1.2288687229156494, + "learning_rate": 2.336237980769231e-05, + "loss": 0.7923, + "step": 1874 + }, + { + "epoch": 1.0937727869330611, + "grad_norm": 1.0700854063034058, + "learning_rate": 2.334735576923077e-05, + "loss": 0.9763, + "step": 1875 + }, + { + "epoch": 1.0943561324194253, + "grad_norm": 0.9807063937187195, + "learning_rate": 2.3332331730769234e-05, + "loss": 0.8784, + "step": 1876 + }, + { + "epoch": 1.0949394779057897, + "grad_norm": 1.1770750284194946, + "learning_rate": 2.3317307692307692e-05, + "loss": 0.7672, + "step": 1877 + }, + { + "epoch": 1.095522823392154, + "grad_norm": 1.1473841667175293, + "learning_rate": 2.3302283653846154e-05, + "loss": 0.7583, + "step": 1878 + }, + { + "epoch": 1.0961061688785183, + "grad_norm": 1.2041727304458618, + "learning_rate": 2.3287259615384616e-05, + "loss": 0.8327, + "step": 1879 + }, + { + "epoch": 1.0966895143648827, + "grad_norm": 1.1696784496307373, + "learning_rate": 2.3272235576923078e-05, + "loss": 0.853, + "step": 1880 + }, + { + "epoch": 1.097272859851247, + "grad_norm": 1.1404637098312378, + "learning_rate": 2.325721153846154e-05, + "loss": 0.8551, + "step": 1881 + }, + { + "epoch": 1.0978562053376113, + "grad_norm": 1.082005262374878, + "learning_rate": 2.32421875e-05, + "loss": 0.7462, + "step": 1882 + }, + { + "epoch": 1.0984395508239755, + "grad_norm": 1.3037984371185303, + "learning_rate": 2.3227163461538464e-05, + "loss": 1.0494, + "step": 1883 + }, + { + "epoch": 1.09902289631034, + "grad_norm": 1.3747504949569702, + "learning_rate": 2.3212139423076922e-05, + "loss": 0.9479, + "step": 1884 + }, + { + "epoch": 1.099606241796704, + "grad_norm": 1.2729436159133911, + "learning_rate": 2.3197115384615387e-05, + "loss": 0.8058, + "step": 1885 + }, + { + "epoch": 1.1001895872830685, + "grad_norm": 0.951274037361145, + "learning_rate": 2.3182091346153846e-05, + "loss": 0.7261, + "step": 1886 + }, + { + "epoch": 1.1007729327694327, + "grad_norm": 1.114943027496338, + "learning_rate": 2.3167067307692308e-05, + "loss": 0.8833, + "step": 1887 + }, + { + "epoch": 1.101356278255797, + "grad_norm": 1.2294602394104004, + "learning_rate": 2.315204326923077e-05, + "loss": 0.8286, + "step": 1888 + }, + { + "epoch": 1.1019396237421613, + "grad_norm": 1.0512102842330933, + "learning_rate": 2.313701923076923e-05, + "loss": 0.8782, + "step": 1889 + }, + { + "epoch": 1.1025229692285257, + "grad_norm": 1.2631170749664307, + "learning_rate": 2.3121995192307693e-05, + "loss": 0.7911, + "step": 1890 + }, + { + "epoch": 1.1031063147148898, + "grad_norm": 1.3144844770431519, + "learning_rate": 2.3106971153846155e-05, + "loss": 0.7812, + "step": 1891 + }, + { + "epoch": 1.1036896602012543, + "grad_norm": 1.0786577463150024, + "learning_rate": 2.3091947115384617e-05, + "loss": 0.7804, + "step": 1892 + }, + { + "epoch": 1.1042730056876184, + "grad_norm": 1.3957914113998413, + "learning_rate": 2.307692307692308e-05, + "loss": 0.9104, + "step": 1893 + }, + { + "epoch": 1.1048563511739828, + "grad_norm": 1.103511929512024, + "learning_rate": 2.306189903846154e-05, + "loss": 0.6835, + "step": 1894 + }, + { + "epoch": 1.105439696660347, + "grad_norm": 1.3668882846832275, + "learning_rate": 2.3046875e-05, + "loss": 0.8694, + "step": 1895 + }, + { + "epoch": 1.1060230421467114, + "grad_norm": 1.075162649154663, + "learning_rate": 2.303185096153846e-05, + "loss": 0.8763, + "step": 1896 + }, + { + "epoch": 1.1066063876330756, + "grad_norm": 1.2901614904403687, + "learning_rate": 2.3016826923076923e-05, + "loss": 0.8694, + "step": 1897 + }, + { + "epoch": 1.10718973311944, + "grad_norm": 1.0675395727157593, + "learning_rate": 2.3001802884615385e-05, + "loss": 0.7391, + "step": 1898 + }, + { + "epoch": 1.1077730786058042, + "grad_norm": 1.200480341911316, + "learning_rate": 2.2986778846153847e-05, + "loss": 0.868, + "step": 1899 + }, + { + "epoch": 1.1083564240921686, + "grad_norm": 1.254665732383728, + "learning_rate": 2.297175480769231e-05, + "loss": 0.7587, + "step": 1900 + }, + { + "epoch": 1.1089397695785328, + "grad_norm": 1.047951102256775, + "learning_rate": 2.295673076923077e-05, + "loss": 0.8484, + "step": 1901 + }, + { + "epoch": 1.1095231150648972, + "grad_norm": 1.0804102420806885, + "learning_rate": 2.2941706730769233e-05, + "loss": 0.7814, + "step": 1902 + }, + { + "epoch": 1.1101064605512614, + "grad_norm": 1.1867598295211792, + "learning_rate": 2.2926682692307695e-05, + "loss": 0.75, + "step": 1903 + }, + { + "epoch": 1.1106898060376258, + "grad_norm": 1.148829460144043, + "learning_rate": 2.2911658653846153e-05, + "loss": 0.8877, + "step": 1904 + }, + { + "epoch": 1.11127315152399, + "grad_norm": 1.0924962759017944, + "learning_rate": 2.2896634615384615e-05, + "loss": 0.7225, + "step": 1905 + }, + { + "epoch": 1.1118564970103544, + "grad_norm": 1.1306710243225098, + "learning_rate": 2.288161057692308e-05, + "loss": 0.8202, + "step": 1906 + }, + { + "epoch": 1.1124398424967188, + "grad_norm": 1.1604644060134888, + "learning_rate": 2.286658653846154e-05, + "loss": 0.8849, + "step": 1907 + }, + { + "epoch": 1.113023187983083, + "grad_norm": 1.2594935894012451, + "learning_rate": 2.28515625e-05, + "loss": 0.8069, + "step": 1908 + }, + { + "epoch": 1.1136065334694474, + "grad_norm": 1.1446539163589478, + "learning_rate": 2.2836538461538463e-05, + "loss": 0.6342, + "step": 1909 + }, + { + "epoch": 1.1141898789558116, + "grad_norm": 1.317840337753296, + "learning_rate": 2.2821514423076924e-05, + "loss": 0.8211, + "step": 1910 + }, + { + "epoch": 1.114773224442176, + "grad_norm": 1.0653162002563477, + "learning_rate": 2.2806490384615386e-05, + "loss": 0.8454, + "step": 1911 + }, + { + "epoch": 1.1153565699285402, + "grad_norm": 0.9791072607040405, + "learning_rate": 2.2791466346153848e-05, + "loss": 0.7433, + "step": 1912 + }, + { + "epoch": 1.1159399154149046, + "grad_norm": 1.1724233627319336, + "learning_rate": 2.2776442307692307e-05, + "loss": 0.7882, + "step": 1913 + }, + { + "epoch": 1.1165232609012687, + "grad_norm": 1.2540687322616577, + "learning_rate": 2.276141826923077e-05, + "loss": 0.7061, + "step": 1914 + }, + { + "epoch": 1.1171066063876331, + "grad_norm": 1.0810496807098389, + "learning_rate": 2.2746394230769234e-05, + "loss": 0.973, + "step": 1915 + }, + { + "epoch": 1.1176899518739973, + "grad_norm": 1.2553794384002686, + "learning_rate": 2.2731370192307692e-05, + "loss": 0.6909, + "step": 1916 + }, + { + "epoch": 1.1182732973603617, + "grad_norm": 1.2660142183303833, + "learning_rate": 2.2716346153846154e-05, + "loss": 0.8742, + "step": 1917 + }, + { + "epoch": 1.118856642846726, + "grad_norm": 1.4024698734283447, + "learning_rate": 2.2701322115384616e-05, + "loss": 0.7558, + "step": 1918 + }, + { + "epoch": 1.1194399883330903, + "grad_norm": 1.2419300079345703, + "learning_rate": 2.2686298076923078e-05, + "loss": 0.8231, + "step": 1919 + }, + { + "epoch": 1.1200233338194545, + "grad_norm": 1.2142603397369385, + "learning_rate": 2.267127403846154e-05, + "loss": 0.8333, + "step": 1920 + }, + { + "epoch": 1.120606679305819, + "grad_norm": 1.1328840255737305, + "learning_rate": 2.2656250000000002e-05, + "loss": 0.7284, + "step": 1921 + }, + { + "epoch": 1.121190024792183, + "grad_norm": 1.256993055343628, + "learning_rate": 2.264122596153846e-05, + "loss": 0.8874, + "step": 1922 + }, + { + "epoch": 1.1217733702785475, + "grad_norm": 0.9864529967308044, + "learning_rate": 2.2626201923076922e-05, + "loss": 0.7252, + "step": 1923 + }, + { + "epoch": 1.1223567157649117, + "grad_norm": 1.1391963958740234, + "learning_rate": 2.2611177884615387e-05, + "loss": 0.7308, + "step": 1924 + }, + { + "epoch": 1.122940061251276, + "grad_norm": 1.3632563352584839, + "learning_rate": 2.2596153846153846e-05, + "loss": 0.7668, + "step": 1925 + }, + { + "epoch": 1.1235234067376403, + "grad_norm": 0.9523732662200928, + "learning_rate": 2.2581129807692308e-05, + "loss": 0.6653, + "step": 1926 + }, + { + "epoch": 1.1241067522240047, + "grad_norm": 1.2235891819000244, + "learning_rate": 2.256610576923077e-05, + "loss": 0.7031, + "step": 1927 + }, + { + "epoch": 1.1246900977103689, + "grad_norm": 1.1225990056991577, + "learning_rate": 2.255108173076923e-05, + "loss": 0.7132, + "step": 1928 + }, + { + "epoch": 1.1252734431967333, + "grad_norm": 1.2185968160629272, + "learning_rate": 2.2536057692307694e-05, + "loss": 0.7747, + "step": 1929 + }, + { + "epoch": 1.1258567886830977, + "grad_norm": 1.2075011730194092, + "learning_rate": 2.2521033653846155e-05, + "loss": 0.7532, + "step": 1930 + }, + { + "epoch": 1.1264401341694619, + "grad_norm": 1.0588568449020386, + "learning_rate": 2.2506009615384614e-05, + "loss": 0.6433, + "step": 1931 + }, + { + "epoch": 1.127023479655826, + "grad_norm": 1.154813528060913, + "learning_rate": 2.249098557692308e-05, + "loss": 0.8853, + "step": 1932 + }, + { + "epoch": 1.1276068251421905, + "grad_norm": 1.280246376991272, + "learning_rate": 2.247596153846154e-05, + "loss": 0.8618, + "step": 1933 + }, + { + "epoch": 1.1281901706285549, + "grad_norm": 1.2844151258468628, + "learning_rate": 2.24609375e-05, + "loss": 0.7397, + "step": 1934 + }, + { + "epoch": 1.128773516114919, + "grad_norm": 1.1625947952270508, + "learning_rate": 2.244591346153846e-05, + "loss": 0.8301, + "step": 1935 + }, + { + "epoch": 1.1293568616012835, + "grad_norm": 1.3182225227355957, + "learning_rate": 2.2430889423076923e-05, + "loss": 0.7165, + "step": 1936 + }, + { + "epoch": 1.1299402070876476, + "grad_norm": 1.4218560457229614, + "learning_rate": 2.2415865384615385e-05, + "loss": 0.7256, + "step": 1937 + }, + { + "epoch": 1.130523552574012, + "grad_norm": 1.061953067779541, + "learning_rate": 2.2400841346153847e-05, + "loss": 0.8491, + "step": 1938 + }, + { + "epoch": 1.1311068980603762, + "grad_norm": 1.1568974256515503, + "learning_rate": 2.238581730769231e-05, + "loss": 0.7943, + "step": 1939 + }, + { + "epoch": 1.1316902435467406, + "grad_norm": 1.0611028671264648, + "learning_rate": 2.2370793269230768e-05, + "loss": 0.8708, + "step": 1940 + }, + { + "epoch": 1.1322735890331048, + "grad_norm": 1.1520451307296753, + "learning_rate": 2.2355769230769233e-05, + "loss": 0.7339, + "step": 1941 + }, + { + "epoch": 1.1328569345194692, + "grad_norm": 1.2494333982467651, + "learning_rate": 2.2340745192307695e-05, + "loss": 0.7952, + "step": 1942 + }, + { + "epoch": 1.1334402800058334, + "grad_norm": 1.1733430624008179, + "learning_rate": 2.2325721153846153e-05, + "loss": 0.7348, + "step": 1943 + }, + { + "epoch": 1.1340236254921978, + "grad_norm": 1.1535006761550903, + "learning_rate": 2.2310697115384615e-05, + "loss": 0.8333, + "step": 1944 + }, + { + "epoch": 1.134606970978562, + "grad_norm": 1.1618400812149048, + "learning_rate": 2.229567307692308e-05, + "loss": 0.8816, + "step": 1945 + }, + { + "epoch": 1.1351903164649264, + "grad_norm": 1.43650221824646, + "learning_rate": 2.228064903846154e-05, + "loss": 0.9497, + "step": 1946 + }, + { + "epoch": 1.1357736619512906, + "grad_norm": 1.2452301979064941, + "learning_rate": 2.2265625e-05, + "loss": 0.6446, + "step": 1947 + }, + { + "epoch": 1.136357007437655, + "grad_norm": 1.2092143297195435, + "learning_rate": 2.2250600961538463e-05, + "loss": 0.8083, + "step": 1948 + }, + { + "epoch": 1.1369403529240192, + "grad_norm": 1.0551011562347412, + "learning_rate": 2.223557692307692e-05, + "loss": 0.9274, + "step": 1949 + }, + { + "epoch": 1.1375236984103836, + "grad_norm": 1.204288125038147, + "learning_rate": 2.2220552884615386e-05, + "loss": 1.043, + "step": 1950 + }, + { + "epoch": 1.1381070438967478, + "grad_norm": 1.4167194366455078, + "learning_rate": 2.220552884615385e-05, + "loss": 0.8985, + "step": 1951 + }, + { + "epoch": 1.1386903893831122, + "grad_norm": 1.2125877141952515, + "learning_rate": 2.2190504807692307e-05, + "loss": 0.7084, + "step": 1952 + }, + { + "epoch": 1.1392737348694764, + "grad_norm": 1.3489576578140259, + "learning_rate": 2.217548076923077e-05, + "loss": 0.7183, + "step": 1953 + }, + { + "epoch": 1.1398570803558408, + "grad_norm": 1.4352302551269531, + "learning_rate": 2.2160456730769234e-05, + "loss": 0.9226, + "step": 1954 + }, + { + "epoch": 1.140440425842205, + "grad_norm": 1.2473138570785522, + "learning_rate": 2.2145432692307693e-05, + "loss": 0.8703, + "step": 1955 + }, + { + "epoch": 1.1410237713285694, + "grad_norm": 1.1649906635284424, + "learning_rate": 2.2130408653846154e-05, + "loss": 0.7713, + "step": 1956 + }, + { + "epoch": 1.1416071168149338, + "grad_norm": 1.1831884384155273, + "learning_rate": 2.2115384615384616e-05, + "loss": 0.9852, + "step": 1957 + }, + { + "epoch": 1.142190462301298, + "grad_norm": 1.1909598112106323, + "learning_rate": 2.2100360576923078e-05, + "loss": 0.7232, + "step": 1958 + }, + { + "epoch": 1.1427738077876621, + "grad_norm": 1.0720024108886719, + "learning_rate": 2.208533653846154e-05, + "loss": 0.8207, + "step": 1959 + }, + { + "epoch": 1.1433571532740265, + "grad_norm": 1.0868524312973022, + "learning_rate": 2.2070312500000002e-05, + "loss": 0.7564, + "step": 1960 + }, + { + "epoch": 1.143940498760391, + "grad_norm": 1.2819868326187134, + "learning_rate": 2.205528846153846e-05, + "loss": 0.8171, + "step": 1961 + }, + { + "epoch": 1.1445238442467551, + "grad_norm": 1.2320795059204102, + "learning_rate": 2.2040264423076922e-05, + "loss": 0.6759, + "step": 1962 + }, + { + "epoch": 1.1451071897331195, + "grad_norm": 1.2730618715286255, + "learning_rate": 2.2025240384615388e-05, + "loss": 0.8357, + "step": 1963 + }, + { + "epoch": 1.1456905352194837, + "grad_norm": 1.3447294235229492, + "learning_rate": 2.2010216346153846e-05, + "loss": 0.7348, + "step": 1964 + }, + { + "epoch": 1.1462738807058481, + "grad_norm": 1.215040683746338, + "learning_rate": 2.1995192307692308e-05, + "loss": 0.9677, + "step": 1965 + }, + { + "epoch": 1.1468572261922123, + "grad_norm": 0.992956280708313, + "learning_rate": 2.198016826923077e-05, + "loss": 0.7393, + "step": 1966 + }, + { + "epoch": 1.1474405716785767, + "grad_norm": 1.204768419265747, + "learning_rate": 2.1965144230769232e-05, + "loss": 0.7813, + "step": 1967 + }, + { + "epoch": 1.148023917164941, + "grad_norm": 1.2408292293548584, + "learning_rate": 2.1950120192307694e-05, + "loss": 0.9303, + "step": 1968 + }, + { + "epoch": 1.1486072626513053, + "grad_norm": 1.1849360466003418, + "learning_rate": 2.1935096153846156e-05, + "loss": 0.836, + "step": 1969 + }, + { + "epoch": 1.1491906081376695, + "grad_norm": 1.2159719467163086, + "learning_rate": 2.1920072115384614e-05, + "loss": 0.6705, + "step": 1970 + }, + { + "epoch": 1.149773953624034, + "grad_norm": 1.2858052253723145, + "learning_rate": 2.190504807692308e-05, + "loss": 0.8785, + "step": 1971 + }, + { + "epoch": 1.150357299110398, + "grad_norm": 1.2040108442306519, + "learning_rate": 2.189002403846154e-05, + "loss": 0.7998, + "step": 1972 + }, + { + "epoch": 1.1509406445967625, + "grad_norm": 1.2969449758529663, + "learning_rate": 2.1875e-05, + "loss": 0.7393, + "step": 1973 + }, + { + "epoch": 1.1515239900831267, + "grad_norm": 1.1521106958389282, + "learning_rate": 2.185997596153846e-05, + "loss": 0.9285, + "step": 1974 + }, + { + "epoch": 1.152107335569491, + "grad_norm": 1.139011025428772, + "learning_rate": 2.1844951923076924e-05, + "loss": 0.8499, + "step": 1975 + }, + { + "epoch": 1.1526906810558553, + "grad_norm": 1.3250030279159546, + "learning_rate": 2.1829927884615385e-05, + "loss": 0.7346, + "step": 1976 + }, + { + "epoch": 1.1532740265422197, + "grad_norm": 1.0421650409698486, + "learning_rate": 2.1814903846153847e-05, + "loss": 0.739, + "step": 1977 + }, + { + "epoch": 1.1538573720285838, + "grad_norm": 1.3398250341415405, + "learning_rate": 2.179987980769231e-05, + "loss": 0.644, + "step": 1978 + }, + { + "epoch": 1.1544407175149483, + "grad_norm": 1.0961638689041138, + "learning_rate": 2.1784855769230768e-05, + "loss": 0.8046, + "step": 1979 + }, + { + "epoch": 1.1550240630013124, + "grad_norm": 1.2048981189727783, + "learning_rate": 2.1769831730769233e-05, + "loss": 0.6226, + "step": 1980 + }, + { + "epoch": 1.1556074084876768, + "grad_norm": 1.395005702972412, + "learning_rate": 2.1754807692307695e-05, + "loss": 0.749, + "step": 1981 + }, + { + "epoch": 1.156190753974041, + "grad_norm": 1.2888487577438354, + "learning_rate": 2.1739783653846153e-05, + "loss": 0.8828, + "step": 1982 + }, + { + "epoch": 1.1567740994604054, + "grad_norm": 1.237052083015442, + "learning_rate": 2.1724759615384615e-05, + "loss": 0.742, + "step": 1983 + }, + { + "epoch": 1.1573574449467698, + "grad_norm": 1.3370431661605835, + "learning_rate": 2.170973557692308e-05, + "loss": 0.6737, + "step": 1984 + }, + { + "epoch": 1.157940790433134, + "grad_norm": 1.3525289297103882, + "learning_rate": 2.169471153846154e-05, + "loss": 0.6262, + "step": 1985 + }, + { + "epoch": 1.1585241359194982, + "grad_norm": 1.2294694185256958, + "learning_rate": 2.16796875e-05, + "loss": 0.6299, + "step": 1986 + }, + { + "epoch": 1.1591074814058626, + "grad_norm": 1.3560107946395874, + "learning_rate": 2.1664663461538463e-05, + "loss": 0.8025, + "step": 1987 + }, + { + "epoch": 1.159690826892227, + "grad_norm": 1.2413029670715332, + "learning_rate": 2.164963942307692e-05, + "loss": 0.7526, + "step": 1988 + }, + { + "epoch": 1.1602741723785912, + "grad_norm": 1.2291741371154785, + "learning_rate": 2.1634615384615387e-05, + "loss": 0.9964, + "step": 1989 + }, + { + "epoch": 1.1608575178649556, + "grad_norm": 1.1054651737213135, + "learning_rate": 2.161959134615385e-05, + "loss": 0.8606, + "step": 1990 + }, + { + "epoch": 1.1614408633513198, + "grad_norm": 1.2737064361572266, + "learning_rate": 2.1604567307692307e-05, + "loss": 0.7849, + "step": 1991 + }, + { + "epoch": 1.1620242088376842, + "grad_norm": 1.2561743259429932, + "learning_rate": 2.158954326923077e-05, + "loss": 0.914, + "step": 1992 + }, + { + "epoch": 1.1626075543240484, + "grad_norm": 1.4079277515411377, + "learning_rate": 2.1574519230769234e-05, + "loss": 0.8546, + "step": 1993 + }, + { + "epoch": 1.1631908998104128, + "grad_norm": 1.1819322109222412, + "learning_rate": 2.1559495192307693e-05, + "loss": 0.7467, + "step": 1994 + }, + { + "epoch": 1.163774245296777, + "grad_norm": 1.2287219762802124, + "learning_rate": 2.1544471153846155e-05, + "loss": 0.7146, + "step": 1995 + }, + { + "epoch": 1.1643575907831414, + "grad_norm": 1.2632535696029663, + "learning_rate": 2.1529447115384616e-05, + "loss": 0.7355, + "step": 1996 + }, + { + "epoch": 1.1649409362695056, + "grad_norm": 1.3165422677993774, + "learning_rate": 2.151442307692308e-05, + "loss": 0.7853, + "step": 1997 + }, + { + "epoch": 1.16552428175587, + "grad_norm": 1.056222915649414, + "learning_rate": 2.149939903846154e-05, + "loss": 0.7185, + "step": 1998 + }, + { + "epoch": 1.1661076272422342, + "grad_norm": 1.2883901596069336, + "learning_rate": 2.1484375000000002e-05, + "loss": 0.7979, + "step": 1999 + }, + { + "epoch": 1.1666909727285986, + "grad_norm": 1.367472767829895, + "learning_rate": 2.146935096153846e-05, + "loss": 0.8075, + "step": 2000 + }, + { + "epoch": 1.1666909727285986, + "eval_loss_squad": 0.833221665751189, + "eval_perplexity": 8.233223012549466, + "eval_perplexity_reconstruct": 1.9100281333491649, + "step": 2000 + }, + { + "epoch": 1.1672743182149627, + "grad_norm": 1.1449155807495117, + "learning_rate": 2.1454326923076923e-05, + "loss": 0.865, + "step": 2001 + }, + { + "epoch": 1.1678576637013272, + "grad_norm": 1.1930732727050781, + "learning_rate": 2.1439302884615388e-05, + "loss": 0.5906, + "step": 2002 + }, + { + "epoch": 1.1684410091876913, + "grad_norm": 1.4782718420028687, + "learning_rate": 2.1424278846153846e-05, + "loss": 0.9741, + "step": 2003 + }, + { + "epoch": 1.1690243546740557, + "grad_norm": 1.2961211204528809, + "learning_rate": 2.1409254807692308e-05, + "loss": 0.713, + "step": 2004 + }, + { + "epoch": 1.16960770016042, + "grad_norm": 1.250820279121399, + "learning_rate": 2.139423076923077e-05, + "loss": 0.8144, + "step": 2005 + }, + { + "epoch": 1.1701910456467843, + "grad_norm": 1.1511048078536987, + "learning_rate": 2.1379206730769232e-05, + "loss": 0.822, + "step": 2006 + }, + { + "epoch": 1.1707743911331485, + "grad_norm": 1.1328845024108887, + "learning_rate": 2.1364182692307694e-05, + "loss": 0.8251, + "step": 2007 + }, + { + "epoch": 1.171357736619513, + "grad_norm": 1.0141953229904175, + "learning_rate": 2.1349158653846156e-05, + "loss": 0.5794, + "step": 2008 + }, + { + "epoch": 1.171941082105877, + "grad_norm": 1.3566126823425293, + "learning_rate": 2.1334134615384614e-05, + "loss": 0.9965, + "step": 2009 + }, + { + "epoch": 1.1725244275922415, + "grad_norm": 1.246524691581726, + "learning_rate": 2.131911057692308e-05, + "loss": 0.7536, + "step": 2010 + }, + { + "epoch": 1.173107773078606, + "grad_norm": 1.3714301586151123, + "learning_rate": 2.130408653846154e-05, + "loss": 0.7692, + "step": 2011 + }, + { + "epoch": 1.17369111856497, + "grad_norm": 1.1998827457427979, + "learning_rate": 2.12890625e-05, + "loss": 0.8805, + "step": 2012 + }, + { + "epoch": 1.1742744640513343, + "grad_norm": 1.1431297063827515, + "learning_rate": 2.1274038461538462e-05, + "loss": 0.9395, + "step": 2013 + }, + { + "epoch": 1.1748578095376987, + "grad_norm": 1.1474422216415405, + "learning_rate": 2.1259014423076924e-05, + "loss": 0.5988, + "step": 2014 + }, + { + "epoch": 1.175441155024063, + "grad_norm": 1.1838253736495972, + "learning_rate": 2.1243990384615386e-05, + "loss": 1.0735, + "step": 2015 + }, + { + "epoch": 1.1760245005104273, + "grad_norm": 1.1664282083511353, + "learning_rate": 2.1228966346153847e-05, + "loss": 0.9661, + "step": 2016 + }, + { + "epoch": 1.1766078459967917, + "grad_norm": 1.2454452514648438, + "learning_rate": 2.121394230769231e-05, + "loss": 0.8219, + "step": 2017 + }, + { + "epoch": 1.1771911914831559, + "grad_norm": 1.3535860776901245, + "learning_rate": 2.1198918269230768e-05, + "loss": 0.855, + "step": 2018 + }, + { + "epoch": 1.1777745369695203, + "grad_norm": 1.3076273202896118, + "learning_rate": 2.1183894230769233e-05, + "loss": 0.9618, + "step": 2019 + }, + { + "epoch": 1.1783578824558845, + "grad_norm": 1.2071937322616577, + "learning_rate": 2.1168870192307695e-05, + "loss": 0.7426, + "step": 2020 + }, + { + "epoch": 1.1789412279422489, + "grad_norm": 1.273971676826477, + "learning_rate": 2.1153846153846154e-05, + "loss": 0.8335, + "step": 2021 + }, + { + "epoch": 1.179524573428613, + "grad_norm": 1.1485315561294556, + "learning_rate": 2.1138822115384615e-05, + "loss": 0.9572, + "step": 2022 + }, + { + "epoch": 1.1801079189149775, + "grad_norm": 1.3041000366210938, + "learning_rate": 2.112379807692308e-05, + "loss": 0.6271, + "step": 2023 + }, + { + "epoch": 1.1806912644013416, + "grad_norm": 1.3196794986724854, + "learning_rate": 2.110877403846154e-05, + "loss": 0.7458, + "step": 2024 + }, + { + "epoch": 1.181274609887706, + "grad_norm": 1.1841623783111572, + "learning_rate": 2.109375e-05, + "loss": 0.7578, + "step": 2025 + }, + { + "epoch": 1.1818579553740702, + "grad_norm": 1.098708152770996, + "learning_rate": 2.1078725961538463e-05, + "loss": 0.7583, + "step": 2026 + }, + { + "epoch": 1.1824413008604346, + "grad_norm": 1.4319161176681519, + "learning_rate": 2.106370192307692e-05, + "loss": 0.6168, + "step": 2027 + }, + { + "epoch": 1.1830246463467988, + "grad_norm": 1.0580462217330933, + "learning_rate": 2.1048677884615387e-05, + "loss": 0.9887, + "step": 2028 + }, + { + "epoch": 1.1836079918331632, + "grad_norm": 0.9840408563613892, + "learning_rate": 2.103365384615385e-05, + "loss": 0.8125, + "step": 2029 + }, + { + "epoch": 1.1841913373195274, + "grad_norm": 1.2513033151626587, + "learning_rate": 2.1018629807692307e-05, + "loss": 0.8072, + "step": 2030 + }, + { + "epoch": 1.1847746828058918, + "grad_norm": 1.3638144731521606, + "learning_rate": 2.100360576923077e-05, + "loss": 0.7152, + "step": 2031 + }, + { + "epoch": 1.185358028292256, + "grad_norm": 1.3127323389053345, + "learning_rate": 2.0988581730769234e-05, + "loss": 0.8506, + "step": 2032 + }, + { + "epoch": 1.1859413737786204, + "grad_norm": 1.028713345527649, + "learning_rate": 2.0973557692307693e-05, + "loss": 0.7184, + "step": 2033 + }, + { + "epoch": 1.1865247192649846, + "grad_norm": 1.3238139152526855, + "learning_rate": 2.0958533653846155e-05, + "loss": 0.9182, + "step": 2034 + }, + { + "epoch": 1.187108064751349, + "grad_norm": 1.1423969268798828, + "learning_rate": 2.0943509615384617e-05, + "loss": 0.6711, + "step": 2035 + }, + { + "epoch": 1.1876914102377132, + "grad_norm": 1.0575798749923706, + "learning_rate": 2.092848557692308e-05, + "loss": 0.6561, + "step": 2036 + }, + { + "epoch": 1.1882747557240776, + "grad_norm": 1.18056058883667, + "learning_rate": 2.091346153846154e-05, + "loss": 0.9018, + "step": 2037 + }, + { + "epoch": 1.188858101210442, + "grad_norm": 1.0494967699050903, + "learning_rate": 2.0898437500000002e-05, + "loss": 0.7882, + "step": 2038 + }, + { + "epoch": 1.1894414466968062, + "grad_norm": 1.2525426149368286, + "learning_rate": 2.088341346153846e-05, + "loss": 0.7677, + "step": 2039 + }, + { + "epoch": 1.1900247921831704, + "grad_norm": 1.2103064060211182, + "learning_rate": 2.0868389423076923e-05, + "loss": 0.7308, + "step": 2040 + }, + { + "epoch": 1.1906081376695348, + "grad_norm": 1.238573670387268, + "learning_rate": 2.0853365384615388e-05, + "loss": 0.8663, + "step": 2041 + }, + { + "epoch": 1.1911914831558992, + "grad_norm": 1.1449916362762451, + "learning_rate": 2.0838341346153846e-05, + "loss": 0.9087, + "step": 2042 + }, + { + "epoch": 1.1917748286422634, + "grad_norm": 0.9556616544723511, + "learning_rate": 2.082331730769231e-05, + "loss": 0.8262, + "step": 2043 + }, + { + "epoch": 1.1923581741286278, + "grad_norm": 1.0977288484573364, + "learning_rate": 2.080829326923077e-05, + "loss": 0.7063, + "step": 2044 + }, + { + "epoch": 1.192941519614992, + "grad_norm": 1.09841787815094, + "learning_rate": 2.0793269230769232e-05, + "loss": 0.8165, + "step": 2045 + }, + { + "epoch": 1.1935248651013564, + "grad_norm": 1.3817111253738403, + "learning_rate": 2.0778245192307694e-05, + "loss": 0.8761, + "step": 2046 + }, + { + "epoch": 1.1941082105877205, + "grad_norm": 1.6460163593292236, + "learning_rate": 2.0763221153846156e-05, + "loss": 0.9336, + "step": 2047 + }, + { + "epoch": 1.194691556074085, + "grad_norm": 1.1768983602523804, + "learning_rate": 2.0748197115384614e-05, + "loss": 0.7355, + "step": 2048 + }, + { + "epoch": 1.1952749015604491, + "grad_norm": 1.1434657573699951, + "learning_rate": 2.073317307692308e-05, + "loss": 0.8538, + "step": 2049 + }, + { + "epoch": 1.1958582470468135, + "grad_norm": 1.3496836423873901, + "learning_rate": 2.071814903846154e-05, + "loss": 0.6867, + "step": 2050 + }, + { + "epoch": 1.1964415925331777, + "grad_norm": 1.1143194437026978, + "learning_rate": 2.0703125e-05, + "loss": 0.4993, + "step": 2051 + }, + { + "epoch": 1.1970249380195421, + "grad_norm": 1.1078752279281616, + "learning_rate": 2.0688100961538462e-05, + "loss": 0.8276, + "step": 2052 + }, + { + "epoch": 1.1976082835059063, + "grad_norm": 1.3760533332824707, + "learning_rate": 2.0673076923076924e-05, + "loss": 0.7688, + "step": 2053 + }, + { + "epoch": 1.1981916289922707, + "grad_norm": 1.4191945791244507, + "learning_rate": 2.0658052884615386e-05, + "loss": 0.7183, + "step": 2054 + }, + { + "epoch": 1.198774974478635, + "grad_norm": 3.6387343406677246, + "learning_rate": 2.0643028846153848e-05, + "loss": 0.9276, + "step": 2055 + }, + { + "epoch": 1.1993583199649993, + "grad_norm": 1.3375924825668335, + "learning_rate": 2.062800480769231e-05, + "loss": 0.8397, + "step": 2056 + }, + { + "epoch": 1.1999416654513635, + "grad_norm": 1.2695611715316772, + "learning_rate": 2.0612980769230768e-05, + "loss": 1.0456, + "step": 2057 + }, + { + "epoch": 1.200525010937728, + "grad_norm": 1.278247594833374, + "learning_rate": 2.0597956730769233e-05, + "loss": 0.8718, + "step": 2058 + }, + { + "epoch": 1.201108356424092, + "grad_norm": 1.2946563959121704, + "learning_rate": 2.0582932692307695e-05, + "loss": 0.7343, + "step": 2059 + }, + { + "epoch": 1.2016917019104565, + "grad_norm": 1.5262877941131592, + "learning_rate": 2.0567908653846154e-05, + "loss": 0.8788, + "step": 2060 + }, + { + "epoch": 1.2022750473968207, + "grad_norm": 1.3010066747665405, + "learning_rate": 2.0552884615384616e-05, + "loss": 0.7022, + "step": 2061 + }, + { + "epoch": 1.202858392883185, + "grad_norm": 1.112301230430603, + "learning_rate": 2.053786057692308e-05, + "loss": 0.8348, + "step": 2062 + }, + { + "epoch": 1.2034417383695493, + "grad_norm": 1.1436960697174072, + "learning_rate": 2.052283653846154e-05, + "loss": 0.7332, + "step": 2063 + }, + { + "epoch": 1.2040250838559137, + "grad_norm": 1.1401253938674927, + "learning_rate": 2.05078125e-05, + "loss": 0.7297, + "step": 2064 + }, + { + "epoch": 1.204608429342278, + "grad_norm": 1.187334418296814, + "learning_rate": 2.0492788461538463e-05, + "loss": 0.8428, + "step": 2065 + }, + { + "epoch": 1.2051917748286423, + "grad_norm": 1.499940037727356, + "learning_rate": 2.047776442307692e-05, + "loss": 0.8032, + "step": 2066 + }, + { + "epoch": 1.2057751203150064, + "grad_norm": 1.4192659854888916, + "learning_rate": 2.0462740384615387e-05, + "loss": 0.7676, + "step": 2067 + }, + { + "epoch": 1.2063584658013708, + "grad_norm": 1.6800339221954346, + "learning_rate": 2.044771634615385e-05, + "loss": 0.7352, + "step": 2068 + }, + { + "epoch": 1.2069418112877353, + "grad_norm": 1.1487371921539307, + "learning_rate": 2.0432692307692307e-05, + "loss": 0.7481, + "step": 2069 + }, + { + "epoch": 1.2075251567740994, + "grad_norm": 1.2796707153320312, + "learning_rate": 2.041766826923077e-05, + "loss": 0.7011, + "step": 2070 + }, + { + "epoch": 1.2081085022604638, + "grad_norm": 2.365525722503662, + "learning_rate": 2.0402644230769235e-05, + "loss": 0.8039, + "step": 2071 + }, + { + "epoch": 1.208691847746828, + "grad_norm": 1.1125097274780273, + "learning_rate": 2.0387620192307693e-05, + "loss": 0.7027, + "step": 2072 + }, + { + "epoch": 1.2092751932331924, + "grad_norm": 1.3944780826568604, + "learning_rate": 2.0372596153846155e-05, + "loss": 0.8972, + "step": 2073 + }, + { + "epoch": 1.2098585387195566, + "grad_norm": 1.1701585054397583, + "learning_rate": 2.0357572115384617e-05, + "loss": 0.6956, + "step": 2074 + }, + { + "epoch": 1.210441884205921, + "grad_norm": 1.17959463596344, + "learning_rate": 2.034254807692308e-05, + "loss": 0.7278, + "step": 2075 + }, + { + "epoch": 1.2110252296922852, + "grad_norm": 1.3046574592590332, + "learning_rate": 2.032752403846154e-05, + "loss": 0.7054, + "step": 2076 + }, + { + "epoch": 1.2116085751786496, + "grad_norm": 1.1868085861206055, + "learning_rate": 2.0312500000000002e-05, + "loss": 0.76, + "step": 2077 + }, + { + "epoch": 1.2121919206650138, + "grad_norm": 1.4176987409591675, + "learning_rate": 2.029747596153846e-05, + "loss": 0.8469, + "step": 2078 + }, + { + "epoch": 1.2127752661513782, + "grad_norm": 1.0783299207687378, + "learning_rate": 2.0282451923076923e-05, + "loss": 0.8074, + "step": 2079 + }, + { + "epoch": 1.2133586116377424, + "grad_norm": 1.113829493522644, + "learning_rate": 2.0267427884615388e-05, + "loss": 0.7359, + "step": 2080 + }, + { + "epoch": 1.2139419571241068, + "grad_norm": 1.3299684524536133, + "learning_rate": 2.0252403846153847e-05, + "loss": 0.8881, + "step": 2081 + }, + { + "epoch": 1.214525302610471, + "grad_norm": 1.1996777057647705, + "learning_rate": 2.023737980769231e-05, + "loss": 0.686, + "step": 2082 + }, + { + "epoch": 1.2151086480968354, + "grad_norm": 1.1687122583389282, + "learning_rate": 2.022235576923077e-05, + "loss": 0.8864, + "step": 2083 + }, + { + "epoch": 1.2156919935831996, + "grad_norm": 1.3415162563323975, + "learning_rate": 2.0207331730769232e-05, + "loss": 0.8052, + "step": 2084 + }, + { + "epoch": 1.216275339069564, + "grad_norm": 1.184653878211975, + "learning_rate": 2.0192307692307694e-05, + "loss": 0.7937, + "step": 2085 + }, + { + "epoch": 1.2168586845559282, + "grad_norm": 1.2197047472000122, + "learning_rate": 2.0177283653846156e-05, + "loss": 0.7549, + "step": 2086 + }, + { + "epoch": 1.2174420300422926, + "grad_norm": 1.2201251983642578, + "learning_rate": 2.0162259615384615e-05, + "loss": 0.818, + "step": 2087 + }, + { + "epoch": 1.2180253755286568, + "grad_norm": 1.0660450458526611, + "learning_rate": 2.014723557692308e-05, + "loss": 0.7244, + "step": 2088 + }, + { + "epoch": 1.2186087210150212, + "grad_norm": 1.112344741821289, + "learning_rate": 2.0132211538461542e-05, + "loss": 0.8623, + "step": 2089 + }, + { + "epoch": 1.2191920665013853, + "grad_norm": 1.066853404045105, + "learning_rate": 2.01171875e-05, + "loss": 0.7755, + "step": 2090 + }, + { + "epoch": 1.2197754119877497, + "grad_norm": 1.4612683057785034, + "learning_rate": 2.0102163461538462e-05, + "loss": 0.7503, + "step": 2091 + }, + { + "epoch": 1.2203587574741142, + "grad_norm": 1.3045361042022705, + "learning_rate": 2.0087139423076924e-05, + "loss": 1.054, + "step": 2092 + }, + { + "epoch": 1.2209421029604783, + "grad_norm": 1.891230821609497, + "learning_rate": 2.0072115384615386e-05, + "loss": 0.801, + "step": 2093 + }, + { + "epoch": 1.2215254484468425, + "grad_norm": 1.2401351928710938, + "learning_rate": 2.0057091346153848e-05, + "loss": 1.0192, + "step": 2094 + }, + { + "epoch": 1.222108793933207, + "grad_norm": 1.1924711465835571, + "learning_rate": 2.004206730769231e-05, + "loss": 0.7175, + "step": 2095 + }, + { + "epoch": 1.2226921394195713, + "grad_norm": 1.4150327444076538, + "learning_rate": 2.0027043269230768e-05, + "loss": 0.8763, + "step": 2096 + }, + { + "epoch": 1.2232754849059355, + "grad_norm": 1.2163945436477661, + "learning_rate": 2.0012019230769233e-05, + "loss": 0.678, + "step": 2097 + }, + { + "epoch": 1.2238588303923, + "grad_norm": 1.3845294713974, + "learning_rate": 1.9996995192307695e-05, + "loss": 0.658, + "step": 2098 + }, + { + "epoch": 1.224442175878664, + "grad_norm": 1.308927297592163, + "learning_rate": 1.9981971153846154e-05, + "loss": 0.6827, + "step": 2099 + }, + { + "epoch": 1.2250255213650285, + "grad_norm": 1.386710524559021, + "learning_rate": 1.9966947115384616e-05, + "loss": 0.6608, + "step": 2100 + }, + { + "epoch": 1.2256088668513927, + "grad_norm": 2.081791877746582, + "learning_rate": 1.9951923076923078e-05, + "loss": 0.8755, + "step": 2101 + }, + { + "epoch": 1.226192212337757, + "grad_norm": 1.240175724029541, + "learning_rate": 1.993689903846154e-05, + "loss": 0.7445, + "step": 2102 + }, + { + "epoch": 1.2267755578241213, + "grad_norm": 1.1266789436340332, + "learning_rate": 1.9921875e-05, + "loss": 0.816, + "step": 2103 + }, + { + "epoch": 1.2273589033104857, + "grad_norm": 1.2244000434875488, + "learning_rate": 1.9906850961538463e-05, + "loss": 0.8795, + "step": 2104 + }, + { + "epoch": 1.2279422487968499, + "grad_norm": 1.420829176902771, + "learning_rate": 1.9891826923076922e-05, + "loss": 0.909, + "step": 2105 + }, + { + "epoch": 1.2285255942832143, + "grad_norm": 1.031460165977478, + "learning_rate": 1.9876802884615387e-05, + "loss": 0.557, + "step": 2106 + }, + { + "epoch": 1.2291089397695785, + "grad_norm": 1.2154114246368408, + "learning_rate": 1.986177884615385e-05, + "loss": 0.7688, + "step": 2107 + }, + { + "epoch": 1.2296922852559429, + "grad_norm": 1.0314477682113647, + "learning_rate": 1.9846754807692307e-05, + "loss": 0.834, + "step": 2108 + }, + { + "epoch": 1.230275630742307, + "grad_norm": 1.227577805519104, + "learning_rate": 1.983173076923077e-05, + "loss": 0.7475, + "step": 2109 + }, + { + "epoch": 1.2308589762286715, + "grad_norm": 1.3141686916351318, + "learning_rate": 1.981670673076923e-05, + "loss": 0.8031, + "step": 2110 + }, + { + "epoch": 1.2314423217150356, + "grad_norm": 1.0540612936019897, + "learning_rate": 1.9801682692307693e-05, + "loss": 0.7732, + "step": 2111 + }, + { + "epoch": 1.2320256672014, + "grad_norm": 1.198327898979187, + "learning_rate": 1.9786658653846155e-05, + "loss": 0.8973, + "step": 2112 + }, + { + "epoch": 1.2326090126877642, + "grad_norm": 1.286871314048767, + "learning_rate": 1.9771634615384617e-05, + "loss": 0.6474, + "step": 2113 + }, + { + "epoch": 1.2331923581741286, + "grad_norm": 1.4305251836776733, + "learning_rate": 1.975661057692308e-05, + "loss": 1.0454, + "step": 2114 + }, + { + "epoch": 1.233775703660493, + "grad_norm": 1.2102510929107666, + "learning_rate": 1.974158653846154e-05, + "loss": 0.5195, + "step": 2115 + }, + { + "epoch": 1.2343590491468572, + "grad_norm": 1.3697700500488281, + "learning_rate": 1.9726562500000003e-05, + "loss": 0.7984, + "step": 2116 + }, + { + "epoch": 1.2349423946332214, + "grad_norm": 1.121392846107483, + "learning_rate": 1.971153846153846e-05, + "loss": 0.9984, + "step": 2117 + }, + { + "epoch": 1.2355257401195858, + "grad_norm": 1.2788634300231934, + "learning_rate": 1.9696514423076923e-05, + "loss": 0.7598, + "step": 2118 + }, + { + "epoch": 1.2361090856059502, + "grad_norm": 1.1477173566818237, + "learning_rate": 1.9681490384615385e-05, + "loss": 0.7552, + "step": 2119 + }, + { + "epoch": 1.2366924310923144, + "grad_norm": 1.1774203777313232, + "learning_rate": 1.9666466346153847e-05, + "loss": 0.8124, + "step": 2120 + }, + { + "epoch": 1.2372757765786786, + "grad_norm": 1.0521777868270874, + "learning_rate": 1.965144230769231e-05, + "loss": 0.7731, + "step": 2121 + }, + { + "epoch": 1.237859122065043, + "grad_norm": 1.3230030536651611, + "learning_rate": 1.963641826923077e-05, + "loss": 0.8122, + "step": 2122 + }, + { + "epoch": 1.2384424675514074, + "grad_norm": 1.1762146949768066, + "learning_rate": 1.9621394230769232e-05, + "loss": 0.7926, + "step": 2123 + }, + { + "epoch": 1.2390258130377716, + "grad_norm": 1.3138278722763062, + "learning_rate": 1.9606370192307694e-05, + "loss": 0.8361, + "step": 2124 + }, + { + "epoch": 1.239609158524136, + "grad_norm": 1.3810380697250366, + "learning_rate": 1.9591346153846156e-05, + "loss": 0.8379, + "step": 2125 + }, + { + "epoch": 1.2401925040105002, + "grad_norm": 1.2080543041229248, + "learning_rate": 1.9576322115384615e-05, + "loss": 1.0409, + "step": 2126 + }, + { + "epoch": 1.2407758494968646, + "grad_norm": 1.35194730758667, + "learning_rate": 1.956129807692308e-05, + "loss": 0.5343, + "step": 2127 + }, + { + "epoch": 1.2413591949832288, + "grad_norm": 1.287524700164795, + "learning_rate": 1.954627403846154e-05, + "loss": 0.9522, + "step": 2128 + }, + { + "epoch": 1.2419425404695932, + "grad_norm": 1.3742008209228516, + "learning_rate": 1.953125e-05, + "loss": 0.6388, + "step": 2129 + }, + { + "epoch": 1.2425258859559574, + "grad_norm": 1.1241064071655273, + "learning_rate": 1.9516225961538462e-05, + "loss": 0.6185, + "step": 2130 + }, + { + "epoch": 1.2431092314423218, + "grad_norm": 1.279675006866455, + "learning_rate": 1.9501201923076924e-05, + "loss": 0.7721, + "step": 2131 + }, + { + "epoch": 1.243692576928686, + "grad_norm": 1.381134033203125, + "learning_rate": 1.9486177884615386e-05, + "loss": 0.7895, + "step": 2132 + }, + { + "epoch": 1.2442759224150504, + "grad_norm": 1.276642918586731, + "learning_rate": 1.9471153846153848e-05, + "loss": 0.855, + "step": 2133 + }, + { + "epoch": 1.2448592679014145, + "grad_norm": 1.0444273948669434, + "learning_rate": 1.945612980769231e-05, + "loss": 1.0147, + "step": 2134 + }, + { + "epoch": 1.245442613387779, + "grad_norm": 1.4284225702285767, + "learning_rate": 1.944110576923077e-05, + "loss": 0.8998, + "step": 2135 + }, + { + "epoch": 1.2460259588741431, + "grad_norm": 1.3835419416427612, + "learning_rate": 1.9426081730769234e-05, + "loss": 0.7556, + "step": 2136 + }, + { + "epoch": 1.2466093043605075, + "grad_norm": 1.3676860332489014, + "learning_rate": 1.9411057692307692e-05, + "loss": 0.7672, + "step": 2137 + }, + { + "epoch": 1.2471926498468717, + "grad_norm": 1.2197344303131104, + "learning_rate": 1.9396033653846154e-05, + "loss": 0.8391, + "step": 2138 + }, + { + "epoch": 1.2477759953332361, + "grad_norm": 1.1958162784576416, + "learning_rate": 1.9381009615384616e-05, + "loss": 0.7733, + "step": 2139 + }, + { + "epoch": 1.2483593408196003, + "grad_norm": 1.3922827243804932, + "learning_rate": 1.9365985576923078e-05, + "loss": 0.6837, + "step": 2140 + }, + { + "epoch": 1.2489426863059647, + "grad_norm": 1.1373469829559326, + "learning_rate": 1.935096153846154e-05, + "loss": 0.8817, + "step": 2141 + }, + { + "epoch": 1.2495260317923291, + "grad_norm": 1.2826591730117798, + "learning_rate": 1.93359375e-05, + "loss": 0.8214, + "step": 2142 + }, + { + "epoch": 1.2501093772786933, + "grad_norm": 1.4706931114196777, + "learning_rate": 1.9320913461538463e-05, + "loss": 0.8828, + "step": 2143 + }, + { + "epoch": 1.2506927227650575, + "grad_norm": 1.1995126008987427, + "learning_rate": 1.9305889423076922e-05, + "loss": 0.8192, + "step": 2144 + }, + { + "epoch": 1.251276068251422, + "grad_norm": 1.321846842765808, + "learning_rate": 1.9290865384615387e-05, + "loss": 0.7632, + "step": 2145 + }, + { + "epoch": 1.2518594137377863, + "grad_norm": 1.2701880931854248, + "learning_rate": 1.9275841346153846e-05, + "loss": 0.8689, + "step": 2146 + }, + { + "epoch": 1.2524427592241505, + "grad_norm": 1.2398470640182495, + "learning_rate": 1.9260817307692308e-05, + "loss": 1.0235, + "step": 2147 + }, + { + "epoch": 1.2530261047105147, + "grad_norm": 1.2439467906951904, + "learning_rate": 1.924579326923077e-05, + "loss": 0.6753, + "step": 2148 + }, + { + "epoch": 1.253609450196879, + "grad_norm": 1.207457184791565, + "learning_rate": 1.923076923076923e-05, + "loss": 0.9007, + "step": 2149 + }, + { + "epoch": 1.2541927956832435, + "grad_norm": 1.1856311559677124, + "learning_rate": 1.9215745192307693e-05, + "loss": 0.6977, + "step": 2150 + }, + { + "epoch": 1.2547761411696077, + "grad_norm": 1.2477092742919922, + "learning_rate": 1.9200721153846155e-05, + "loss": 0.7035, + "step": 2151 + }, + { + "epoch": 1.255359486655972, + "grad_norm": 1.0819847583770752, + "learning_rate": 1.9185697115384617e-05, + "loss": 0.7668, + "step": 2152 + }, + { + "epoch": 1.2559428321423363, + "grad_norm": 1.4272301197052002, + "learning_rate": 1.917067307692308e-05, + "loss": 0.8473, + "step": 2153 + }, + { + "epoch": 1.2565261776287007, + "grad_norm": 1.478842854499817, + "learning_rate": 1.915564903846154e-05, + "loss": 0.8842, + "step": 2154 + }, + { + "epoch": 1.2571095231150649, + "grad_norm": 1.500758409500122, + "learning_rate": 1.9140625e-05, + "loss": 0.7502, + "step": 2155 + }, + { + "epoch": 1.2576928686014293, + "grad_norm": 1.173345685005188, + "learning_rate": 1.912560096153846e-05, + "loss": 0.7785, + "step": 2156 + }, + { + "epoch": 1.2582762140877934, + "grad_norm": 1.1746881008148193, + "learning_rate": 1.9110576923076923e-05, + "loss": 0.83, + "step": 2157 + }, + { + "epoch": 1.2588595595741578, + "grad_norm": 1.263342261314392, + "learning_rate": 1.9095552884615385e-05, + "loss": 0.896, + "step": 2158 + }, + { + "epoch": 1.259442905060522, + "grad_norm": 1.374733328819275, + "learning_rate": 1.9080528846153847e-05, + "loss": 0.8734, + "step": 2159 + }, + { + "epoch": 1.2600262505468864, + "grad_norm": 1.1060365438461304, + "learning_rate": 1.906550480769231e-05, + "loss": 0.684, + "step": 2160 + }, + { + "epoch": 1.2606095960332506, + "grad_norm": 1.289867639541626, + "learning_rate": 1.905048076923077e-05, + "loss": 0.8807, + "step": 2161 + }, + { + "epoch": 1.261192941519615, + "grad_norm": 0.9511016607284546, + "learning_rate": 1.9035456730769233e-05, + "loss": 0.9252, + "step": 2162 + }, + { + "epoch": 1.2617762870059792, + "grad_norm": 1.3257215023040771, + "learning_rate": 1.9020432692307695e-05, + "loss": 0.6726, + "step": 2163 + }, + { + "epoch": 1.2623596324923436, + "grad_norm": 1.180518627166748, + "learning_rate": 1.9005408653846153e-05, + "loss": 0.6201, + "step": 2164 + }, + { + "epoch": 1.262942977978708, + "grad_norm": 1.2338030338287354, + "learning_rate": 1.8990384615384615e-05, + "loss": 0.8228, + "step": 2165 + }, + { + "epoch": 1.2635263234650722, + "grad_norm": 0.9147897362709045, + "learning_rate": 1.897536057692308e-05, + "loss": 0.9485, + "step": 2166 + }, + { + "epoch": 1.2641096689514364, + "grad_norm": 1.209277629852295, + "learning_rate": 1.896033653846154e-05, + "loss": 0.9845, + "step": 2167 + }, + { + "epoch": 1.2646930144378008, + "grad_norm": 1.3521689176559448, + "learning_rate": 1.89453125e-05, + "loss": 0.8049, + "step": 2168 + }, + { + "epoch": 1.2652763599241652, + "grad_norm": 1.1422154903411865, + "learning_rate": 1.8930288461538462e-05, + "loss": 0.8085, + "step": 2169 + }, + { + "epoch": 1.2658597054105294, + "grad_norm": 1.3813396692276, + "learning_rate": 1.8915264423076924e-05, + "loss": 0.8315, + "step": 2170 + }, + { + "epoch": 1.2664430508968936, + "grad_norm": 1.1161137819290161, + "learning_rate": 1.8900240384615386e-05, + "loss": 0.8324, + "step": 2171 + }, + { + "epoch": 1.267026396383258, + "grad_norm": 1.3338303565979004, + "learning_rate": 1.8885216346153848e-05, + "loss": 0.8148, + "step": 2172 + }, + { + "epoch": 1.2676097418696224, + "grad_norm": 1.677724838256836, + "learning_rate": 1.8870192307692307e-05, + "loss": 0.8382, + "step": 2173 + }, + { + "epoch": 1.2681930873559866, + "grad_norm": 1.2323002815246582, + "learning_rate": 1.885516826923077e-05, + "loss": 0.8921, + "step": 2174 + }, + { + "epoch": 1.2687764328423508, + "grad_norm": 1.3112934827804565, + "learning_rate": 1.8840144230769234e-05, + "loss": 0.782, + "step": 2175 + }, + { + "epoch": 1.2693597783287152, + "grad_norm": 1.1889418363571167, + "learning_rate": 1.8825120192307692e-05, + "loss": 0.6976, + "step": 2176 + }, + { + "epoch": 1.2699431238150796, + "grad_norm": 1.2303264141082764, + "learning_rate": 1.8810096153846154e-05, + "loss": 0.7932, + "step": 2177 + }, + { + "epoch": 1.2705264693014438, + "grad_norm": 1.306663990020752, + "learning_rate": 1.8795072115384616e-05, + "loss": 0.8867, + "step": 2178 + }, + { + "epoch": 1.2711098147878082, + "grad_norm": 1.3073201179504395, + "learning_rate": 1.8780048076923078e-05, + "loss": 0.7185, + "step": 2179 + }, + { + "epoch": 1.2716931602741723, + "grad_norm": 1.2337119579315186, + "learning_rate": 1.876502403846154e-05, + "loss": 0.8584, + "step": 2180 + }, + { + "epoch": 1.2722765057605367, + "grad_norm": 1.0608634948730469, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.7698, + "step": 2181 + }, + { + "epoch": 1.272859851246901, + "grad_norm": 1.7613379955291748, + "learning_rate": 1.873497596153846e-05, + "loss": 0.8036, + "step": 2182 + }, + { + "epoch": 1.2734431967332653, + "grad_norm": 1.2020319700241089, + "learning_rate": 1.8719951923076922e-05, + "loss": 0.9223, + "step": 2183 + }, + { + "epoch": 1.2740265422196295, + "grad_norm": 1.1997487545013428, + "learning_rate": 1.8704927884615387e-05, + "loss": 0.8401, + "step": 2184 + }, + { + "epoch": 1.274609887705994, + "grad_norm": 1.2920103073120117, + "learning_rate": 1.8689903846153846e-05, + "loss": 0.7484, + "step": 2185 + }, + { + "epoch": 1.275193233192358, + "grad_norm": 1.2046105861663818, + "learning_rate": 1.8674879807692308e-05, + "loss": 0.6554, + "step": 2186 + }, + { + "epoch": 1.2757765786787225, + "grad_norm": 1.1587508916854858, + "learning_rate": 1.865985576923077e-05, + "loss": 0.9453, + "step": 2187 + }, + { + "epoch": 1.2763599241650867, + "grad_norm": 1.2004345655441284, + "learning_rate": 1.864483173076923e-05, + "loss": 0.7673, + "step": 2188 + }, + { + "epoch": 1.276943269651451, + "grad_norm": 1.2180094718933105, + "learning_rate": 1.8629807692307693e-05, + "loss": 0.7902, + "step": 2189 + }, + { + "epoch": 1.2775266151378153, + "grad_norm": 1.1839169263839722, + "learning_rate": 1.8614783653846155e-05, + "loss": 0.9672, + "step": 2190 + }, + { + "epoch": 1.2781099606241797, + "grad_norm": 1.2951889038085938, + "learning_rate": 1.8599759615384614e-05, + "loss": 0.8781, + "step": 2191 + }, + { + "epoch": 1.278693306110544, + "grad_norm": 1.199181318283081, + "learning_rate": 1.858473557692308e-05, + "loss": 0.8252, + "step": 2192 + }, + { + "epoch": 1.2792766515969083, + "grad_norm": 1.3422390222549438, + "learning_rate": 1.856971153846154e-05, + "loss": 0.8287, + "step": 2193 + }, + { + "epoch": 1.2798599970832725, + "grad_norm": 1.1554558277130127, + "learning_rate": 1.85546875e-05, + "loss": 0.8332, + "step": 2194 + }, + { + "epoch": 1.2804433425696369, + "grad_norm": 1.2082306146621704, + "learning_rate": 1.853966346153846e-05, + "loss": 0.8973, + "step": 2195 + }, + { + "epoch": 1.2810266880560013, + "grad_norm": 1.3980644941329956, + "learning_rate": 1.8524639423076923e-05, + "loss": 0.7318, + "step": 2196 + }, + { + "epoch": 1.2816100335423655, + "grad_norm": 1.290248155593872, + "learning_rate": 1.8509615384615385e-05, + "loss": 0.9601, + "step": 2197 + }, + { + "epoch": 1.2821933790287297, + "grad_norm": 1.2903095483779907, + "learning_rate": 1.8494591346153847e-05, + "loss": 0.853, + "step": 2198 + }, + { + "epoch": 1.282776724515094, + "grad_norm": 1.1215404272079468, + "learning_rate": 1.847956730769231e-05, + "loss": 0.8574, + "step": 2199 + }, + { + "epoch": 1.2833600700014585, + "grad_norm": 1.4814391136169434, + "learning_rate": 1.8464543269230767e-05, + "loss": 0.7185, + "step": 2200 + }, + { + "epoch": 1.2833600700014585, + "eval_loss_squad": 0.8695367491571233, + "eval_perplexity": 8.314321659478429, + "eval_perplexity_reconstruct": 1.905092611405054, + "step": 2200 + }, + { + "epoch": 1.2839434154878226, + "grad_norm": 1.5173953771591187, + "learning_rate": 1.8449519230769233e-05, + "loss": 0.7604, + "step": 2201 + }, + { + "epoch": 1.2845267609741868, + "grad_norm": 1.3316093683242798, + "learning_rate": 1.8434495192307695e-05, + "loss": 0.7515, + "step": 2202 + }, + { + "epoch": 1.2851101064605512, + "grad_norm": 1.444630742073059, + "learning_rate": 1.8419471153846153e-05, + "loss": 1.0531, + "step": 2203 + }, + { + "epoch": 1.2856934519469156, + "grad_norm": 1.2808470726013184, + "learning_rate": 1.8404447115384615e-05, + "loss": 0.6771, + "step": 2204 + }, + { + "epoch": 1.2862767974332798, + "grad_norm": 1.2151483297348022, + "learning_rate": 1.838942307692308e-05, + "loss": 0.8662, + "step": 2205 + }, + { + "epoch": 1.2868601429196442, + "grad_norm": 1.46257483959198, + "learning_rate": 1.837439903846154e-05, + "loss": 0.8319, + "step": 2206 + }, + { + "epoch": 1.2874434884060084, + "grad_norm": 1.3388731479644775, + "learning_rate": 1.8359375e-05, + "loss": 0.6788, + "step": 2207 + }, + { + "epoch": 1.2880268338923728, + "grad_norm": 1.563617467880249, + "learning_rate": 1.8344350961538463e-05, + "loss": 0.5617, + "step": 2208 + }, + { + "epoch": 1.288610179378737, + "grad_norm": 1.1539713144302368, + "learning_rate": 1.832932692307692e-05, + "loss": 0.7174, + "step": 2209 + }, + { + "epoch": 1.2891935248651014, + "grad_norm": 1.2138261795043945, + "learning_rate": 1.8314302884615386e-05, + "loss": 0.9121, + "step": 2210 + }, + { + "epoch": 1.2897768703514656, + "grad_norm": 1.2265892028808594, + "learning_rate": 1.8299278846153848e-05, + "loss": 0.7816, + "step": 2211 + }, + { + "epoch": 1.29036021583783, + "grad_norm": 1.1945728063583374, + "learning_rate": 1.8284254807692307e-05, + "loss": 0.8718, + "step": 2212 + }, + { + "epoch": 1.2909435613241942, + "grad_norm": 1.3123586177825928, + "learning_rate": 1.826923076923077e-05, + "loss": 0.7532, + "step": 2213 + }, + { + "epoch": 1.2915269068105586, + "grad_norm": 1.2278257608413696, + "learning_rate": 1.8254206730769234e-05, + "loss": 0.9726, + "step": 2214 + }, + { + "epoch": 1.2921102522969228, + "grad_norm": 1.100456714630127, + "learning_rate": 1.8239182692307692e-05, + "loss": 0.5921, + "step": 2215 + }, + { + "epoch": 1.2926935977832872, + "grad_norm": 1.1185805797576904, + "learning_rate": 1.8224158653846154e-05, + "loss": 0.7453, + "step": 2216 + }, + { + "epoch": 1.2932769432696514, + "grad_norm": 1.2159292697906494, + "learning_rate": 1.8209134615384616e-05, + "loss": 0.6872, + "step": 2217 + }, + { + "epoch": 1.2938602887560158, + "grad_norm": 1.1445262432098389, + "learning_rate": 1.8194110576923078e-05, + "loss": 0.7672, + "step": 2218 + }, + { + "epoch": 1.2944436342423802, + "grad_norm": 1.1859655380249023, + "learning_rate": 1.817908653846154e-05, + "loss": 0.6995, + "step": 2219 + }, + { + "epoch": 1.2950269797287444, + "grad_norm": 1.2108757495880127, + "learning_rate": 1.8164062500000002e-05, + "loss": 1.1017, + "step": 2220 + }, + { + "epoch": 1.2956103252151085, + "grad_norm": 1.0840204954147339, + "learning_rate": 1.814903846153846e-05, + "loss": 0.9441, + "step": 2221 + }, + { + "epoch": 1.296193670701473, + "grad_norm": 1.4668989181518555, + "learning_rate": 1.8134014423076922e-05, + "loss": 0.8163, + "step": 2222 + }, + { + "epoch": 1.2967770161878374, + "grad_norm": 1.3289495706558228, + "learning_rate": 1.8118990384615388e-05, + "loss": 0.7978, + "step": 2223 + }, + { + "epoch": 1.2973603616742015, + "grad_norm": 1.1220710277557373, + "learning_rate": 1.8103966346153846e-05, + "loss": 0.8322, + "step": 2224 + }, + { + "epoch": 1.2979437071605657, + "grad_norm": 1.1510266065597534, + "learning_rate": 1.8088942307692308e-05, + "loss": 0.735, + "step": 2225 + }, + { + "epoch": 1.2985270526469301, + "grad_norm": 1.145638108253479, + "learning_rate": 1.807391826923077e-05, + "loss": 0.8109, + "step": 2226 + }, + { + "epoch": 1.2991103981332945, + "grad_norm": 1.2273057699203491, + "learning_rate": 1.8058894230769232e-05, + "loss": 0.8697, + "step": 2227 + }, + { + "epoch": 1.2996937436196587, + "grad_norm": 0.9667252898216248, + "learning_rate": 1.8043870192307694e-05, + "loss": 0.7946, + "step": 2228 + }, + { + "epoch": 1.300277089106023, + "grad_norm": 1.183985710144043, + "learning_rate": 1.8028846153846156e-05, + "loss": 0.9245, + "step": 2229 + }, + { + "epoch": 1.3008604345923873, + "grad_norm": 1.4120750427246094, + "learning_rate": 1.8013822115384614e-05, + "loss": 0.899, + "step": 2230 + }, + { + "epoch": 1.3014437800787517, + "grad_norm": 1.9396076202392578, + "learning_rate": 1.799879807692308e-05, + "loss": 0.816, + "step": 2231 + }, + { + "epoch": 1.302027125565116, + "grad_norm": 1.4099103212356567, + "learning_rate": 1.798377403846154e-05, + "loss": 0.6821, + "step": 2232 + }, + { + "epoch": 1.3026104710514803, + "grad_norm": 1.2785017490386963, + "learning_rate": 1.796875e-05, + "loss": 0.9443, + "step": 2233 + }, + { + "epoch": 1.3031938165378445, + "grad_norm": 1.3573774099349976, + "learning_rate": 1.795372596153846e-05, + "loss": 0.7205, + "step": 2234 + }, + { + "epoch": 1.303777162024209, + "grad_norm": 1.1221401691436768, + "learning_rate": 1.7938701923076923e-05, + "loss": 0.7908, + "step": 2235 + }, + { + "epoch": 1.304360507510573, + "grad_norm": 1.1731091737747192, + "learning_rate": 1.7923677884615385e-05, + "loss": 0.6417, + "step": 2236 + }, + { + "epoch": 1.3049438529969375, + "grad_norm": 1.3627444505691528, + "learning_rate": 1.7908653846153847e-05, + "loss": 0.7929, + "step": 2237 + }, + { + "epoch": 1.3055271984833017, + "grad_norm": 1.4136035442352295, + "learning_rate": 1.789362980769231e-05, + "loss": 0.9224, + "step": 2238 + }, + { + "epoch": 1.306110543969666, + "grad_norm": 1.138240933418274, + "learning_rate": 1.7878605769230768e-05, + "loss": 0.8356, + "step": 2239 + }, + { + "epoch": 1.3066938894560303, + "grad_norm": 1.424336552619934, + "learning_rate": 1.7863581730769233e-05, + "loss": 0.7738, + "step": 2240 + }, + { + "epoch": 1.3072772349423947, + "grad_norm": 1.2529820203781128, + "learning_rate": 1.7848557692307695e-05, + "loss": 0.6549, + "step": 2241 + }, + { + "epoch": 1.3078605804287589, + "grad_norm": 1.7925548553466797, + "learning_rate": 1.7833533653846153e-05, + "loss": 0.648, + "step": 2242 + }, + { + "epoch": 1.3084439259151233, + "grad_norm": 1.3370999097824097, + "learning_rate": 1.7818509615384615e-05, + "loss": 0.8967, + "step": 2243 + }, + { + "epoch": 1.3090272714014874, + "grad_norm": 1.1147828102111816, + "learning_rate": 1.780348557692308e-05, + "loss": 0.8724, + "step": 2244 + }, + { + "epoch": 1.3096106168878519, + "grad_norm": 1.2542976140975952, + "learning_rate": 1.778846153846154e-05, + "loss": 0.8384, + "step": 2245 + }, + { + "epoch": 1.3101939623742163, + "grad_norm": 1.251609444618225, + "learning_rate": 1.77734375e-05, + "loss": 0.8426, + "step": 2246 + }, + { + "epoch": 1.3107773078605804, + "grad_norm": 1.2808489799499512, + "learning_rate": 1.7758413461538463e-05, + "loss": 0.7978, + "step": 2247 + }, + { + "epoch": 1.3113606533469446, + "grad_norm": 1.2761458158493042, + "learning_rate": 1.774338942307692e-05, + "loss": 0.9413, + "step": 2248 + }, + { + "epoch": 1.311943998833309, + "grad_norm": 1.263123631477356, + "learning_rate": 1.7728365384615387e-05, + "loss": 0.7363, + "step": 2249 + }, + { + "epoch": 1.3125273443196734, + "grad_norm": 1.2832683324813843, + "learning_rate": 1.771334134615385e-05, + "loss": 0.5699, + "step": 2250 + }, + { + "epoch": 1.3131106898060376, + "grad_norm": 1.2383707761764526, + "learning_rate": 1.7698317307692307e-05, + "loss": 0.9112, + "step": 2251 + }, + { + "epoch": 1.3136940352924018, + "grad_norm": 1.2074283361434937, + "learning_rate": 1.768329326923077e-05, + "loss": 0.7454, + "step": 2252 + }, + { + "epoch": 1.3142773807787662, + "grad_norm": 1.1046134233474731, + "learning_rate": 1.7668269230769234e-05, + "loss": 0.9412, + "step": 2253 + }, + { + "epoch": 1.3148607262651306, + "grad_norm": 1.1459115743637085, + "learning_rate": 1.7653245192307693e-05, + "loss": 0.7046, + "step": 2254 + }, + { + "epoch": 1.3154440717514948, + "grad_norm": 1.2635064125061035, + "learning_rate": 1.7638221153846155e-05, + "loss": 0.8815, + "step": 2255 + }, + { + "epoch": 1.316027417237859, + "grad_norm": 1.1063247919082642, + "learning_rate": 1.7623197115384616e-05, + "loss": 0.8305, + "step": 2256 + }, + { + "epoch": 1.3166107627242234, + "grad_norm": 2.1608684062957764, + "learning_rate": 1.7608173076923078e-05, + "loss": 0.9767, + "step": 2257 + }, + { + "epoch": 1.3171941082105878, + "grad_norm": 1.1712557077407837, + "learning_rate": 1.759314903846154e-05, + "loss": 1.0557, + "step": 2258 + }, + { + "epoch": 1.317777453696952, + "grad_norm": 1.1504344940185547, + "learning_rate": 1.7578125000000002e-05, + "loss": 1.0544, + "step": 2259 + }, + { + "epoch": 1.3183607991833164, + "grad_norm": 1.2168277502059937, + "learning_rate": 1.756310096153846e-05, + "loss": 0.8735, + "step": 2260 + }, + { + "epoch": 1.3189441446696806, + "grad_norm": 1.2982547283172607, + "learning_rate": 1.7548076923076922e-05, + "loss": 0.9633, + "step": 2261 + }, + { + "epoch": 1.319527490156045, + "grad_norm": 1.2747186422348022, + "learning_rate": 1.7533052884615388e-05, + "loss": 0.6536, + "step": 2262 + }, + { + "epoch": 1.3201108356424092, + "grad_norm": 1.1642917394638062, + "learning_rate": 1.7518028846153846e-05, + "loss": 0.7892, + "step": 2263 + }, + { + "epoch": 1.3206941811287736, + "grad_norm": 1.22393798828125, + "learning_rate": 1.7503004807692308e-05, + "loss": 0.8314, + "step": 2264 + }, + { + "epoch": 1.3212775266151378, + "grad_norm": 1.3126612901687622, + "learning_rate": 1.748798076923077e-05, + "loss": 0.695, + "step": 2265 + }, + { + "epoch": 1.3218608721015022, + "grad_norm": 1.1996201276779175, + "learning_rate": 1.7472956730769232e-05, + "loss": 0.7914, + "step": 2266 + }, + { + "epoch": 1.3224442175878663, + "grad_norm": 1.2702114582061768, + "learning_rate": 1.7457932692307694e-05, + "loss": 0.835, + "step": 2267 + }, + { + "epoch": 1.3230275630742308, + "grad_norm": 1.1210219860076904, + "learning_rate": 1.7442908653846156e-05, + "loss": 0.6297, + "step": 2268 + }, + { + "epoch": 1.323610908560595, + "grad_norm": 1.0922166109085083, + "learning_rate": 1.7427884615384614e-05, + "loss": 0.735, + "step": 2269 + }, + { + "epoch": 1.3241942540469593, + "grad_norm": 1.3747941255569458, + "learning_rate": 1.741286057692308e-05, + "loss": 0.9677, + "step": 2270 + }, + { + "epoch": 1.3247775995333235, + "grad_norm": 1.3291850090026855, + "learning_rate": 1.739783653846154e-05, + "loss": 0.9007, + "step": 2271 + }, + { + "epoch": 1.325360945019688, + "grad_norm": 1.1687302589416504, + "learning_rate": 1.73828125e-05, + "loss": 0.7095, + "step": 2272 + }, + { + "epoch": 1.3259442905060523, + "grad_norm": 1.1394537687301636, + "learning_rate": 1.7367788461538462e-05, + "loss": 0.6055, + "step": 2273 + }, + { + "epoch": 1.3265276359924165, + "grad_norm": 1.1962412595748901, + "learning_rate": 1.7352764423076924e-05, + "loss": 0.7964, + "step": 2274 + }, + { + "epoch": 1.3271109814787807, + "grad_norm": 1.5585336685180664, + "learning_rate": 1.7337740384615386e-05, + "loss": 0.725, + "step": 2275 + }, + { + "epoch": 1.327694326965145, + "grad_norm": 1.1137040853500366, + "learning_rate": 1.7322716346153847e-05, + "loss": 0.8106, + "step": 2276 + }, + { + "epoch": 1.3282776724515095, + "grad_norm": 1.2830206155776978, + "learning_rate": 1.730769230769231e-05, + "loss": 0.8546, + "step": 2277 + }, + { + "epoch": 1.3288610179378737, + "grad_norm": 1.1548091173171997, + "learning_rate": 1.7292668269230768e-05, + "loss": 0.8009, + "step": 2278 + }, + { + "epoch": 1.3294443634242379, + "grad_norm": 1.5664095878601074, + "learning_rate": 1.7277644230769233e-05, + "loss": 0.7502, + "step": 2279 + }, + { + "epoch": 1.3300277089106023, + "grad_norm": 1.4026076793670654, + "learning_rate": 1.7262620192307695e-05, + "loss": 0.886, + "step": 2280 + }, + { + "epoch": 1.3306110543969667, + "grad_norm": 1.2917288541793823, + "learning_rate": 1.7247596153846153e-05, + "loss": 0.7834, + "step": 2281 + }, + { + "epoch": 1.3311943998833309, + "grad_norm": 1.4439992904663086, + "learning_rate": 1.7232572115384615e-05, + "loss": 0.8932, + "step": 2282 + }, + { + "epoch": 1.331777745369695, + "grad_norm": 1.2439894676208496, + "learning_rate": 1.721754807692308e-05, + "loss": 0.7965, + "step": 2283 + }, + { + "epoch": 1.3323610908560595, + "grad_norm": 1.2655996084213257, + "learning_rate": 1.720252403846154e-05, + "loss": 0.7999, + "step": 2284 + }, + { + "epoch": 1.3329444363424239, + "grad_norm": 1.1132214069366455, + "learning_rate": 1.71875e-05, + "loss": 0.8086, + "step": 2285 + }, + { + "epoch": 1.333527781828788, + "grad_norm": 1.3198105096817017, + "learning_rate": 1.7172475961538463e-05, + "loss": 0.824, + "step": 2286 + }, + { + "epoch": 1.3341111273151525, + "grad_norm": 1.1884894371032715, + "learning_rate": 1.715745192307692e-05, + "loss": 0.7186, + "step": 2287 + }, + { + "epoch": 1.3346944728015167, + "grad_norm": 1.179015040397644, + "learning_rate": 1.7142427884615387e-05, + "loss": 0.9886, + "step": 2288 + }, + { + "epoch": 1.335277818287881, + "grad_norm": 1.1946966648101807, + "learning_rate": 1.712740384615385e-05, + "loss": 0.7248, + "step": 2289 + }, + { + "epoch": 1.3358611637742452, + "grad_norm": 1.2960485219955444, + "learning_rate": 1.7112379807692307e-05, + "loss": 0.8273, + "step": 2290 + }, + { + "epoch": 1.3364445092606096, + "grad_norm": 1.2853033542633057, + "learning_rate": 1.709735576923077e-05, + "loss": 0.8596, + "step": 2291 + }, + { + "epoch": 1.3370278547469738, + "grad_norm": 1.0934782028198242, + "learning_rate": 1.7082331730769234e-05, + "loss": 0.71, + "step": 2292 + }, + { + "epoch": 1.3376112002333382, + "grad_norm": 1.2531418800354004, + "learning_rate": 1.7067307692307693e-05, + "loss": 0.8957, + "step": 2293 + }, + { + "epoch": 1.3381945457197024, + "grad_norm": 1.389156460762024, + "learning_rate": 1.7052283653846155e-05, + "loss": 0.8757, + "step": 2294 + }, + { + "epoch": 1.3387778912060668, + "grad_norm": 1.4648810625076294, + "learning_rate": 1.7037259615384617e-05, + "loss": 0.9267, + "step": 2295 + }, + { + "epoch": 1.339361236692431, + "grad_norm": 1.110659122467041, + "learning_rate": 1.702223557692308e-05, + "loss": 0.8775, + "step": 2296 + }, + { + "epoch": 1.3399445821787954, + "grad_norm": 1.2895351648330688, + "learning_rate": 1.700721153846154e-05, + "loss": 0.7839, + "step": 2297 + }, + { + "epoch": 1.3405279276651596, + "grad_norm": 1.207476258277893, + "learning_rate": 1.6992187500000002e-05, + "loss": 0.7952, + "step": 2298 + }, + { + "epoch": 1.341111273151524, + "grad_norm": 1.142844557762146, + "learning_rate": 1.697716346153846e-05, + "loss": 0.7482, + "step": 2299 + }, + { + "epoch": 1.3416946186378884, + "grad_norm": 1.2515149116516113, + "learning_rate": 1.6962139423076923e-05, + "loss": 0.9569, + "step": 2300 + }, + { + "epoch": 1.3422779641242526, + "grad_norm": 1.090381145477295, + "learning_rate": 1.6947115384615388e-05, + "loss": 0.8499, + "step": 2301 + }, + { + "epoch": 1.3428613096106168, + "grad_norm": 1.2688581943511963, + "learning_rate": 1.6932091346153846e-05, + "loss": 0.6979, + "step": 2302 + }, + { + "epoch": 1.3434446550969812, + "grad_norm": 1.2520463466644287, + "learning_rate": 1.6917067307692308e-05, + "loss": 0.7364, + "step": 2303 + }, + { + "epoch": 1.3440280005833456, + "grad_norm": 1.1990320682525635, + "learning_rate": 1.690204326923077e-05, + "loss": 0.652, + "step": 2304 + }, + { + "epoch": 1.3446113460697098, + "grad_norm": 1.142889380455017, + "learning_rate": 1.6887019230769232e-05, + "loss": 0.9194, + "step": 2305 + }, + { + "epoch": 1.345194691556074, + "grad_norm": 1.1990599632263184, + "learning_rate": 1.6871995192307694e-05, + "loss": 0.7939, + "step": 2306 + }, + { + "epoch": 1.3457780370424384, + "grad_norm": 1.3533287048339844, + "learning_rate": 1.6856971153846156e-05, + "loss": 0.7524, + "step": 2307 + }, + { + "epoch": 1.3463613825288028, + "grad_norm": 1.157096028327942, + "learning_rate": 1.6841947115384614e-05, + "loss": 0.5834, + "step": 2308 + }, + { + "epoch": 1.346944728015167, + "grad_norm": 1.2179913520812988, + "learning_rate": 1.682692307692308e-05, + "loss": 0.9018, + "step": 2309 + }, + { + "epoch": 1.3475280735015314, + "grad_norm": 1.2479956150054932, + "learning_rate": 1.681189903846154e-05, + "loss": 0.7633, + "step": 2310 + }, + { + "epoch": 1.3481114189878955, + "grad_norm": 1.1520054340362549, + "learning_rate": 1.6796875e-05, + "loss": 0.7101, + "step": 2311 + }, + { + "epoch": 1.34869476447426, + "grad_norm": 1.21807861328125, + "learning_rate": 1.6781850961538462e-05, + "loss": 0.7766, + "step": 2312 + }, + { + "epoch": 1.3492781099606241, + "grad_norm": 1.4313595294952393, + "learning_rate": 1.6766826923076924e-05, + "loss": 0.6487, + "step": 2313 + }, + { + "epoch": 1.3498614554469885, + "grad_norm": 1.3884174823760986, + "learning_rate": 1.6751802884615386e-05, + "loss": 0.7646, + "step": 2314 + }, + { + "epoch": 1.3504448009333527, + "grad_norm": 1.1427439451217651, + "learning_rate": 1.6736778846153848e-05, + "loss": 0.8483, + "step": 2315 + }, + { + "epoch": 1.3510281464197171, + "grad_norm": 1.2759617567062378, + "learning_rate": 1.672175480769231e-05, + "loss": 0.7245, + "step": 2316 + }, + { + "epoch": 1.3516114919060813, + "grad_norm": 1.3233177661895752, + "learning_rate": 1.6706730769230768e-05, + "loss": 1.0692, + "step": 2317 + }, + { + "epoch": 1.3521948373924457, + "grad_norm": 1.3573335409164429, + "learning_rate": 1.6691706730769233e-05, + "loss": 0.9137, + "step": 2318 + }, + { + "epoch": 1.35277818287881, + "grad_norm": 1.0537357330322266, + "learning_rate": 1.6676682692307695e-05, + "loss": 0.7244, + "step": 2319 + }, + { + "epoch": 1.3533615283651743, + "grad_norm": 1.126329779624939, + "learning_rate": 1.6661658653846154e-05, + "loss": 0.7374, + "step": 2320 + }, + { + "epoch": 1.3539448738515385, + "grad_norm": 1.3769999742507935, + "learning_rate": 1.6646634615384616e-05, + "loss": 0.7223, + "step": 2321 + }, + { + "epoch": 1.354528219337903, + "grad_norm": 1.307424545288086, + "learning_rate": 1.6631610576923077e-05, + "loss": 0.7789, + "step": 2322 + }, + { + "epoch": 1.355111564824267, + "grad_norm": 1.2053377628326416, + "learning_rate": 1.661658653846154e-05, + "loss": 0.9478, + "step": 2323 + }, + { + "epoch": 1.3556949103106315, + "grad_norm": 1.478121280670166, + "learning_rate": 1.66015625e-05, + "loss": 0.7229, + "step": 2324 + }, + { + "epoch": 1.3562782557969957, + "grad_norm": 1.2465181350708008, + "learning_rate": 1.6586538461538463e-05, + "loss": 0.8424, + "step": 2325 + }, + { + "epoch": 1.35686160128336, + "grad_norm": 1.366040825843811, + "learning_rate": 1.657151442307692e-05, + "loss": 0.8107, + "step": 2326 + }, + { + "epoch": 1.3574449467697245, + "grad_norm": 1.2248352766036987, + "learning_rate": 1.6556490384615387e-05, + "loss": 0.7464, + "step": 2327 + }, + { + "epoch": 1.3580282922560887, + "grad_norm": 1.4650728702545166, + "learning_rate": 1.654146634615385e-05, + "loss": 0.7499, + "step": 2328 + }, + { + "epoch": 1.3586116377424529, + "grad_norm": 1.5269290208816528, + "learning_rate": 1.6526442307692307e-05, + "loss": 0.8989, + "step": 2329 + }, + { + "epoch": 1.3591949832288173, + "grad_norm": 1.1337950229644775, + "learning_rate": 1.651141826923077e-05, + "loss": 0.6656, + "step": 2330 + }, + { + "epoch": 1.3597783287151817, + "grad_norm": 1.1645163297653198, + "learning_rate": 1.649639423076923e-05, + "loss": 0.7783, + "step": 2331 + }, + { + "epoch": 1.3603616742015459, + "grad_norm": 1.1996797323226929, + "learning_rate": 1.6481370192307693e-05, + "loss": 0.9066, + "step": 2332 + }, + { + "epoch": 1.36094501968791, + "grad_norm": 1.2484889030456543, + "learning_rate": 1.6466346153846155e-05, + "loss": 0.761, + "step": 2333 + }, + { + "epoch": 1.3615283651742744, + "grad_norm": 1.1876178979873657, + "learning_rate": 1.6451322115384617e-05, + "loss": 0.7001, + "step": 2334 + }, + { + "epoch": 1.3621117106606389, + "grad_norm": 1.1717162132263184, + "learning_rate": 1.643629807692308e-05, + "loss": 0.7374, + "step": 2335 + }, + { + "epoch": 1.362695056147003, + "grad_norm": 1.0891270637512207, + "learning_rate": 1.642127403846154e-05, + "loss": 0.9183, + "step": 2336 + }, + { + "epoch": 1.3632784016333674, + "grad_norm": 1.3006951808929443, + "learning_rate": 1.6406250000000002e-05, + "loss": 0.7246, + "step": 2337 + }, + { + "epoch": 1.3638617471197316, + "grad_norm": 1.3394348621368408, + "learning_rate": 1.639122596153846e-05, + "loss": 0.7199, + "step": 2338 + }, + { + "epoch": 1.364445092606096, + "grad_norm": 1.2449558973312378, + "learning_rate": 1.6376201923076923e-05, + "loss": 0.8916, + "step": 2339 + }, + { + "epoch": 1.3650284380924602, + "grad_norm": 1.0279120206832886, + "learning_rate": 1.6361177884615385e-05, + "loss": 0.9012, + "step": 2340 + }, + { + "epoch": 1.3656117835788246, + "grad_norm": 1.218778133392334, + "learning_rate": 1.6346153846153847e-05, + "loss": 0.7377, + "step": 2341 + }, + { + "epoch": 1.3661951290651888, + "grad_norm": 1.2190182209014893, + "learning_rate": 1.633112980769231e-05, + "loss": 0.963, + "step": 2342 + }, + { + "epoch": 1.3667784745515532, + "grad_norm": 1.2658116817474365, + "learning_rate": 1.631610576923077e-05, + "loss": 0.8298, + "step": 2343 + }, + { + "epoch": 1.3673618200379174, + "grad_norm": 1.2883538007736206, + "learning_rate": 1.6301081730769232e-05, + "loss": 0.8406, + "step": 2344 + }, + { + "epoch": 1.3679451655242818, + "grad_norm": 1.2415398359298706, + "learning_rate": 1.6286057692307694e-05, + "loss": 0.7271, + "step": 2345 + }, + { + "epoch": 1.368528511010646, + "grad_norm": 1.2686741352081299, + "learning_rate": 1.6271033653846156e-05, + "loss": 0.8293, + "step": 2346 + }, + { + "epoch": 1.3691118564970104, + "grad_norm": 1.3187519311904907, + "learning_rate": 1.6256009615384614e-05, + "loss": 0.7959, + "step": 2347 + }, + { + "epoch": 1.3696952019833746, + "grad_norm": 1.2809123992919922, + "learning_rate": 1.624098557692308e-05, + "loss": 0.657, + "step": 2348 + }, + { + "epoch": 1.370278547469739, + "grad_norm": 1.2265044450759888, + "learning_rate": 1.6225961538461538e-05, + "loss": 0.8477, + "step": 2349 + }, + { + "epoch": 1.3708618929561034, + "grad_norm": 1.0081802606582642, + "learning_rate": 1.62109375e-05, + "loss": 0.8687, + "step": 2350 + }, + { + "epoch": 1.3714452384424676, + "grad_norm": 1.3187026977539062, + "learning_rate": 1.6195913461538462e-05, + "loss": 0.9389, + "step": 2351 + }, + { + "epoch": 1.3720285839288318, + "grad_norm": 1.283357858657837, + "learning_rate": 1.6180889423076924e-05, + "loss": 0.7913, + "step": 2352 + }, + { + "epoch": 1.3726119294151962, + "grad_norm": 1.126367211341858, + "learning_rate": 1.6165865384615386e-05, + "loss": 0.7502, + "step": 2353 + }, + { + "epoch": 1.3731952749015606, + "grad_norm": 1.1398643255233765, + "learning_rate": 1.6150841346153848e-05, + "loss": 0.8787, + "step": 2354 + }, + { + "epoch": 1.3737786203879248, + "grad_norm": 1.1188660860061646, + "learning_rate": 1.613581730769231e-05, + "loss": 0.5851, + "step": 2355 + }, + { + "epoch": 1.374361965874289, + "grad_norm": 1.200810194015503, + "learning_rate": 1.6120793269230768e-05, + "loss": 0.7128, + "step": 2356 + }, + { + "epoch": 1.3749453113606533, + "grad_norm": 1.1014255285263062, + "learning_rate": 1.6105769230769233e-05, + "loss": 0.8273, + "step": 2357 + }, + { + "epoch": 1.3755286568470177, + "grad_norm": 1.1966328620910645, + "learning_rate": 1.6090745192307692e-05, + "loss": 0.9342, + "step": 2358 + }, + { + "epoch": 1.376112002333382, + "grad_norm": 1.3458975553512573, + "learning_rate": 1.6075721153846154e-05, + "loss": 0.8317, + "step": 2359 + }, + { + "epoch": 1.3766953478197461, + "grad_norm": 1.364256501197815, + "learning_rate": 1.6060697115384616e-05, + "loss": 0.7107, + "step": 2360 + }, + { + "epoch": 1.3772786933061105, + "grad_norm": 1.2403984069824219, + "learning_rate": 1.6045673076923078e-05, + "loss": 0.6139, + "step": 2361 + }, + { + "epoch": 1.377862038792475, + "grad_norm": 1.3325952291488647, + "learning_rate": 1.603064903846154e-05, + "loss": 0.8053, + "step": 2362 + }, + { + "epoch": 1.3784453842788391, + "grad_norm": 1.0815925598144531, + "learning_rate": 1.6015625e-05, + "loss": 0.8323, + "step": 2363 + }, + { + "epoch": 1.3790287297652035, + "grad_norm": 1.0733479261398315, + "learning_rate": 1.6000600961538463e-05, + "loss": 0.7921, + "step": 2364 + }, + { + "epoch": 1.3796120752515677, + "grad_norm": 1.1240204572677612, + "learning_rate": 1.5985576923076922e-05, + "loss": 0.861, + "step": 2365 + }, + { + "epoch": 1.380195420737932, + "grad_norm": 1.2654609680175781, + "learning_rate": 1.5970552884615387e-05, + "loss": 0.6925, + "step": 2366 + }, + { + "epoch": 1.3807787662242963, + "grad_norm": 1.7558287382125854, + "learning_rate": 1.5955528846153846e-05, + "loss": 0.7423, + "step": 2367 + }, + { + "epoch": 1.3813621117106607, + "grad_norm": 1.3726961612701416, + "learning_rate": 1.5940504807692307e-05, + "loss": 0.7108, + "step": 2368 + }, + { + "epoch": 1.3819454571970249, + "grad_norm": 1.279613971710205, + "learning_rate": 1.592548076923077e-05, + "loss": 0.8721, + "step": 2369 + }, + { + "epoch": 1.3825288026833893, + "grad_norm": 1.324630856513977, + "learning_rate": 1.591045673076923e-05, + "loss": 0.6835, + "step": 2370 + }, + { + "epoch": 1.3831121481697535, + "grad_norm": 1.2332587242126465, + "learning_rate": 1.5895432692307693e-05, + "loss": 0.8593, + "step": 2371 + }, + { + "epoch": 1.3836954936561179, + "grad_norm": 1.0641486644744873, + "learning_rate": 1.5880408653846155e-05, + "loss": 0.8916, + "step": 2372 + }, + { + "epoch": 1.384278839142482, + "grad_norm": 1.2237672805786133, + "learning_rate": 1.5865384615384617e-05, + "loss": 0.8585, + "step": 2373 + }, + { + "epoch": 1.3848621846288465, + "grad_norm": 1.3848849534988403, + "learning_rate": 1.585036057692308e-05, + "loss": 0.8661, + "step": 2374 + }, + { + "epoch": 1.3854455301152107, + "grad_norm": 1.1622084379196167, + "learning_rate": 1.583533653846154e-05, + "loss": 0.8062, + "step": 2375 + }, + { + "epoch": 1.386028875601575, + "grad_norm": 1.1375772953033447, + "learning_rate": 1.58203125e-05, + "loss": 0.8455, + "step": 2376 + }, + { + "epoch": 1.3866122210879395, + "grad_norm": 1.1407866477966309, + "learning_rate": 1.580528846153846e-05, + "loss": 0.7599, + "step": 2377 + }, + { + "epoch": 1.3871955665743037, + "grad_norm": 1.3291012048721313, + "learning_rate": 1.5790264423076923e-05, + "loss": 0.7574, + "step": 2378 + }, + { + "epoch": 1.3877789120606678, + "grad_norm": 1.0793458223342896, + "learning_rate": 1.5775240384615385e-05, + "loss": 0.8483, + "step": 2379 + }, + { + "epoch": 1.3883622575470322, + "grad_norm": 1.419547438621521, + "learning_rate": 1.5760216346153847e-05, + "loss": 0.8905, + "step": 2380 + }, + { + "epoch": 1.3889456030333966, + "grad_norm": 1.1597005128860474, + "learning_rate": 1.574519230769231e-05, + "loss": 0.7328, + "step": 2381 + }, + { + "epoch": 1.3895289485197608, + "grad_norm": 1.3240809440612793, + "learning_rate": 1.573016826923077e-05, + "loss": 0.9296, + "step": 2382 + }, + { + "epoch": 1.390112294006125, + "grad_norm": 1.1506272554397583, + "learning_rate": 1.5715144230769232e-05, + "loss": 0.693, + "step": 2383 + }, + { + "epoch": 1.3906956394924894, + "grad_norm": 1.0295346975326538, + "learning_rate": 1.5700120192307694e-05, + "loss": 0.8175, + "step": 2384 + }, + { + "epoch": 1.3912789849788538, + "grad_norm": 1.1327847242355347, + "learning_rate": 1.5685096153846153e-05, + "loss": 0.8946, + "step": 2385 + }, + { + "epoch": 1.391862330465218, + "grad_norm": 1.3098478317260742, + "learning_rate": 1.5670072115384615e-05, + "loss": 0.7245, + "step": 2386 + }, + { + "epoch": 1.3924456759515822, + "grad_norm": 1.2984569072723389, + "learning_rate": 1.565504807692308e-05, + "loss": 1.0087, + "step": 2387 + }, + { + "epoch": 1.3930290214379466, + "grad_norm": 1.20249605178833, + "learning_rate": 1.564002403846154e-05, + "loss": 0.9404, + "step": 2388 + }, + { + "epoch": 1.393612366924311, + "grad_norm": 1.116599678993225, + "learning_rate": 1.5625e-05, + "loss": 0.8229, + "step": 2389 + }, + { + "epoch": 1.3941957124106752, + "grad_norm": 1.4463697671890259, + "learning_rate": 1.5609975961538462e-05, + "loss": 0.8928, + "step": 2390 + }, + { + "epoch": 1.3947790578970396, + "grad_norm": 1.3660231828689575, + "learning_rate": 1.5594951923076924e-05, + "loss": 0.7133, + "step": 2391 + }, + { + "epoch": 1.3953624033834038, + "grad_norm": 1.2797685861587524, + "learning_rate": 1.5579927884615386e-05, + "loss": 0.8965, + "step": 2392 + }, + { + "epoch": 1.3959457488697682, + "grad_norm": 1.3361741304397583, + "learning_rate": 1.5564903846153848e-05, + "loss": 0.7193, + "step": 2393 + }, + { + "epoch": 1.3965290943561324, + "grad_norm": 1.1247235536575317, + "learning_rate": 1.5549879807692306e-05, + "loss": 0.7304, + "step": 2394 + }, + { + "epoch": 1.3971124398424968, + "grad_norm": 1.4382926225662231, + "learning_rate": 1.5534855769230768e-05, + "loss": 0.8633, + "step": 2395 + }, + { + "epoch": 1.397695785328861, + "grad_norm": 1.2390516996383667, + "learning_rate": 1.5519831730769234e-05, + "loss": 0.8239, + "step": 2396 + }, + { + "epoch": 1.3982791308152254, + "grad_norm": 1.3460718393325806, + "learning_rate": 1.5504807692307692e-05, + "loss": 1.0886, + "step": 2397 + }, + { + "epoch": 1.3988624763015896, + "grad_norm": 1.125791311264038, + "learning_rate": 1.5489783653846154e-05, + "loss": 0.8253, + "step": 2398 + }, + { + "epoch": 1.399445821787954, + "grad_norm": 1.2912181615829468, + "learning_rate": 1.5474759615384616e-05, + "loss": 0.7138, + "step": 2399 + }, + { + "epoch": 1.4000291672743181, + "grad_norm": 1.1510553359985352, + "learning_rate": 1.5459735576923078e-05, + "loss": 0.932, + "step": 2400 + }, + { + "epoch": 1.4000291672743181, + "eval_loss_squad": 0.8576014773617499, + "eval_perplexity": 8.10681335191388, + "eval_perplexity_reconstruct": 1.909424481734322, + "step": 2400 + }, + { + "epoch": 1.4006125127606825, + "grad_norm": 1.429721474647522, + "learning_rate": 1.544471153846154e-05, + "loss": 0.7134, + "step": 2401 + }, + { + "epoch": 1.4011958582470467, + "grad_norm": 1.3556239604949951, + "learning_rate": 1.54296875e-05, + "loss": 0.8164, + "step": 2402 + }, + { + "epoch": 1.4017792037334111, + "grad_norm": 1.2837475538253784, + "learning_rate": 1.541466346153846e-05, + "loss": 0.678, + "step": 2403 + }, + { + "epoch": 1.4023625492197755, + "grad_norm": 1.1232627630233765, + "learning_rate": 1.5399639423076922e-05, + "loss": 0.8262, + "step": 2404 + }, + { + "epoch": 1.4029458947061397, + "grad_norm": 1.3286436796188354, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.6756, + "step": 2405 + }, + { + "epoch": 1.403529240192504, + "grad_norm": 1.2372446060180664, + "learning_rate": 1.5369591346153846e-05, + "loss": 0.7997, + "step": 2406 + }, + { + "epoch": 1.4041125856788683, + "grad_norm": 1.1915537118911743, + "learning_rate": 1.5354567307692308e-05, + "loss": 0.7118, + "step": 2407 + }, + { + "epoch": 1.4046959311652327, + "grad_norm": 1.241876244544983, + "learning_rate": 1.533954326923077e-05, + "loss": 0.7073, + "step": 2408 + }, + { + "epoch": 1.405279276651597, + "grad_norm": 1.4120349884033203, + "learning_rate": 1.532451923076923e-05, + "loss": 0.8801, + "step": 2409 + }, + { + "epoch": 1.405862622137961, + "grad_norm": 1.0920681953430176, + "learning_rate": 1.5309495192307693e-05, + "loss": 0.8118, + "step": 2410 + }, + { + "epoch": 1.4064459676243255, + "grad_norm": 1.3178508281707764, + "learning_rate": 1.5294471153846155e-05, + "loss": 0.8296, + "step": 2411 + }, + { + "epoch": 1.40702931311069, + "grad_norm": 1.307503342628479, + "learning_rate": 1.5279447115384614e-05, + "loss": 0.8255, + "step": 2412 + }, + { + "epoch": 1.407612658597054, + "grad_norm": 1.4173907041549683, + "learning_rate": 1.526442307692308e-05, + "loss": 0.7455, + "step": 2413 + }, + { + "epoch": 1.4081960040834183, + "grad_norm": 1.2470276355743408, + "learning_rate": 1.5249399038461539e-05, + "loss": 0.7058, + "step": 2414 + }, + { + "epoch": 1.4087793495697827, + "grad_norm": 1.1090408563613892, + "learning_rate": 1.5234375000000001e-05, + "loss": 0.7262, + "step": 2415 + }, + { + "epoch": 1.409362695056147, + "grad_norm": 1.0376850366592407, + "learning_rate": 1.5219350961538461e-05, + "loss": 0.6755, + "step": 2416 + }, + { + "epoch": 1.4099460405425113, + "grad_norm": 1.1430914402008057, + "learning_rate": 1.5204326923076923e-05, + "loss": 0.7457, + "step": 2417 + }, + { + "epoch": 1.4105293860288757, + "grad_norm": 1.2482012510299683, + "learning_rate": 1.5189302884615387e-05, + "loss": 0.6143, + "step": 2418 + }, + { + "epoch": 1.4111127315152399, + "grad_norm": 1.3330813646316528, + "learning_rate": 1.5174278846153847e-05, + "loss": 0.6651, + "step": 2419 + }, + { + "epoch": 1.4116960770016043, + "grad_norm": 1.3887958526611328, + "learning_rate": 1.5159254807692309e-05, + "loss": 0.9114, + "step": 2420 + }, + { + "epoch": 1.4122794224879684, + "grad_norm": 1.328403353691101, + "learning_rate": 1.5144230769230769e-05, + "loss": 0.8242, + "step": 2421 + }, + { + "epoch": 1.4128627679743329, + "grad_norm": 1.185078740119934, + "learning_rate": 1.5129206730769233e-05, + "loss": 0.6619, + "step": 2422 + }, + { + "epoch": 1.413446113460697, + "grad_norm": 1.2361003160476685, + "learning_rate": 1.5114182692307693e-05, + "loss": 0.7416, + "step": 2423 + }, + { + "epoch": 1.4140294589470614, + "grad_norm": 1.1306205987930298, + "learning_rate": 1.5099158653846155e-05, + "loss": 0.8077, + "step": 2424 + }, + { + "epoch": 1.4146128044334256, + "grad_norm": 0.9908915162086487, + "learning_rate": 1.5084134615384615e-05, + "loss": 0.8779, + "step": 2425 + }, + { + "epoch": 1.41519614991979, + "grad_norm": 1.1640621423721313, + "learning_rate": 1.5069110576923078e-05, + "loss": 0.8623, + "step": 2426 + }, + { + "epoch": 1.4157794954061542, + "grad_norm": 2.4493496417999268, + "learning_rate": 1.505408653846154e-05, + "loss": 0.9976, + "step": 2427 + }, + { + "epoch": 1.4163628408925186, + "grad_norm": 1.1745401620864868, + "learning_rate": 1.50390625e-05, + "loss": 0.7252, + "step": 2428 + }, + { + "epoch": 1.4169461863788828, + "grad_norm": 1.2570523023605347, + "learning_rate": 1.5024038461538462e-05, + "loss": 0.7836, + "step": 2429 + }, + { + "epoch": 1.4175295318652472, + "grad_norm": 1.1713680028915405, + "learning_rate": 1.5009014423076923e-05, + "loss": 0.7318, + "step": 2430 + }, + { + "epoch": 1.4181128773516116, + "grad_norm": 1.224394679069519, + "learning_rate": 1.4993990384615386e-05, + "loss": 0.619, + "step": 2431 + }, + { + "epoch": 1.4186962228379758, + "grad_norm": 1.089404821395874, + "learning_rate": 1.4978966346153846e-05, + "loss": 0.712, + "step": 2432 + }, + { + "epoch": 1.41927956832434, + "grad_norm": 1.172507643699646, + "learning_rate": 1.4963942307692308e-05, + "loss": 0.6563, + "step": 2433 + }, + { + "epoch": 1.4198629138107044, + "grad_norm": 1.3817616701126099, + "learning_rate": 1.4948918269230768e-05, + "loss": 0.6572, + "step": 2434 + }, + { + "epoch": 1.4204462592970688, + "grad_norm": 1.1678885221481323, + "learning_rate": 1.4933894230769232e-05, + "loss": 0.9326, + "step": 2435 + }, + { + "epoch": 1.421029604783433, + "grad_norm": 1.2956814765930176, + "learning_rate": 1.4918870192307694e-05, + "loss": 0.8578, + "step": 2436 + }, + { + "epoch": 1.4216129502697972, + "grad_norm": 0.7640461325645447, + "learning_rate": 1.4903846153846154e-05, + "loss": 0.8301, + "step": 2437 + }, + { + "epoch": 1.4221962957561616, + "grad_norm": 1.1869699954986572, + "learning_rate": 1.4888822115384616e-05, + "loss": 0.8132, + "step": 2438 + }, + { + "epoch": 1.422779641242526, + "grad_norm": 1.1306724548339844, + "learning_rate": 1.487379807692308e-05, + "loss": 0.7028, + "step": 2439 + }, + { + "epoch": 1.4233629867288902, + "grad_norm": 1.2320770025253296, + "learning_rate": 1.485877403846154e-05, + "loss": 0.6797, + "step": 2440 + }, + { + "epoch": 1.4239463322152544, + "grad_norm": 1.3738675117492676, + "learning_rate": 1.484375e-05, + "loss": 0.752, + "step": 2441 + }, + { + "epoch": 1.4245296777016188, + "grad_norm": 1.2884392738342285, + "learning_rate": 1.4828725961538462e-05, + "loss": 0.7435, + "step": 2442 + }, + { + "epoch": 1.4251130231879832, + "grad_norm": 1.3447365760803223, + "learning_rate": 1.4813701923076922e-05, + "loss": 0.6959, + "step": 2443 + }, + { + "epoch": 1.4256963686743473, + "grad_norm": 1.270371913909912, + "learning_rate": 1.4798677884615386e-05, + "loss": 0.8662, + "step": 2444 + }, + { + "epoch": 1.4262797141607118, + "grad_norm": 1.3903402090072632, + "learning_rate": 1.4783653846153848e-05, + "loss": 0.7844, + "step": 2445 + }, + { + "epoch": 1.426863059647076, + "grad_norm": 1.2270903587341309, + "learning_rate": 1.4768629807692308e-05, + "loss": 0.6552, + "step": 2446 + }, + { + "epoch": 1.4274464051334403, + "grad_norm": 1.170554280281067, + "learning_rate": 1.475360576923077e-05, + "loss": 0.9123, + "step": 2447 + }, + { + "epoch": 1.4280297506198045, + "grad_norm": 0.9950125217437744, + "learning_rate": 1.4738581730769233e-05, + "loss": 0.7512, + "step": 2448 + }, + { + "epoch": 1.428613096106169, + "grad_norm": 1.3289663791656494, + "learning_rate": 1.4723557692307693e-05, + "loss": 0.7338, + "step": 2449 + }, + { + "epoch": 1.4291964415925331, + "grad_norm": 1.298034429550171, + "learning_rate": 1.4708533653846154e-05, + "loss": 0.7775, + "step": 2450 + }, + { + "epoch": 1.4297797870788975, + "grad_norm": 1.3974965810775757, + "learning_rate": 1.4693509615384615e-05, + "loss": 0.7119, + "step": 2451 + }, + { + "epoch": 1.4303631325652617, + "grad_norm": 1.3868650197982788, + "learning_rate": 1.4678485576923079e-05, + "loss": 0.8351, + "step": 2452 + }, + { + "epoch": 1.4309464780516261, + "grad_norm": 1.093425989151001, + "learning_rate": 1.466346153846154e-05, + "loss": 0.505, + "step": 2453 + }, + { + "epoch": 1.4315298235379903, + "grad_norm": 1.075796127319336, + "learning_rate": 1.4648437500000001e-05, + "loss": 1.0527, + "step": 2454 + }, + { + "epoch": 1.4321131690243547, + "grad_norm": 1.2513554096221924, + "learning_rate": 1.4633413461538461e-05, + "loss": 0.6941, + "step": 2455 + }, + { + "epoch": 1.4326965145107189, + "grad_norm": 1.2571288347244263, + "learning_rate": 1.4618389423076923e-05, + "loss": 0.8531, + "step": 2456 + }, + { + "epoch": 1.4332798599970833, + "grad_norm": 1.139385461807251, + "learning_rate": 1.4603365384615387e-05, + "loss": 0.648, + "step": 2457 + }, + { + "epoch": 1.4338632054834477, + "grad_norm": 1.1437668800354004, + "learning_rate": 1.4588341346153847e-05, + "loss": 0.7422, + "step": 2458 + }, + { + "epoch": 1.4344465509698119, + "grad_norm": 1.3354110717773438, + "learning_rate": 1.4573317307692307e-05, + "loss": 0.8899, + "step": 2459 + }, + { + "epoch": 1.435029896456176, + "grad_norm": 1.1407440900802612, + "learning_rate": 1.4558293269230769e-05, + "loss": 0.7084, + "step": 2460 + }, + { + "epoch": 1.4356132419425405, + "grad_norm": 1.2988560199737549, + "learning_rate": 1.4543269230769233e-05, + "loss": 0.803, + "step": 2461 + }, + { + "epoch": 1.4361965874289049, + "grad_norm": 1.2152442932128906, + "learning_rate": 1.4528245192307693e-05, + "loss": 0.5639, + "step": 2462 + }, + { + "epoch": 1.436779932915269, + "grad_norm": 1.1522806882858276, + "learning_rate": 1.4513221153846155e-05, + "loss": 0.8283, + "step": 2463 + }, + { + "epoch": 1.4373632784016332, + "grad_norm": 1.3547604084014893, + "learning_rate": 1.4498197115384615e-05, + "loss": 0.9728, + "step": 2464 + }, + { + "epoch": 1.4379466238879977, + "grad_norm": 1.0888975858688354, + "learning_rate": 1.4483173076923079e-05, + "loss": 0.9095, + "step": 2465 + }, + { + "epoch": 1.438529969374362, + "grad_norm": 1.2343162298202515, + "learning_rate": 1.446814903846154e-05, + "loss": 0.8971, + "step": 2466 + }, + { + "epoch": 1.4391133148607262, + "grad_norm": 1.2880823612213135, + "learning_rate": 1.4453125e-05, + "loss": 0.758, + "step": 2467 + }, + { + "epoch": 1.4396966603470904, + "grad_norm": 1.273984670639038, + "learning_rate": 1.443810096153846e-05, + "loss": 0.8011, + "step": 2468 + }, + { + "epoch": 1.4402800058334548, + "grad_norm": 1.221825122833252, + "learning_rate": 1.4423076923076923e-05, + "loss": 0.7709, + "step": 2469 + }, + { + "epoch": 1.4408633513198192, + "grad_norm": 1.1602380275726318, + "learning_rate": 1.4408052884615386e-05, + "loss": 0.8517, + "step": 2470 + }, + { + "epoch": 1.4414466968061834, + "grad_norm": 1.3913099765777588, + "learning_rate": 1.4393028846153847e-05, + "loss": 0.8339, + "step": 2471 + }, + { + "epoch": 1.4420300422925478, + "grad_norm": 1.3142465353012085, + "learning_rate": 1.4378004807692308e-05, + "loss": 0.7022, + "step": 2472 + }, + { + "epoch": 1.442613387778912, + "grad_norm": 1.2736009359359741, + "learning_rate": 1.4362980769230769e-05, + "loss": 0.8779, + "step": 2473 + }, + { + "epoch": 1.4431967332652764, + "grad_norm": 1.3056821823120117, + "learning_rate": 1.4347956730769232e-05, + "loss": 0.8236, + "step": 2474 + }, + { + "epoch": 1.4437800787516406, + "grad_norm": 1.3933846950531006, + "learning_rate": 1.4332932692307694e-05, + "loss": 0.9686, + "step": 2475 + }, + { + "epoch": 1.444363424238005, + "grad_norm": 1.3702456951141357, + "learning_rate": 1.4317908653846154e-05, + "loss": 0.8735, + "step": 2476 + }, + { + "epoch": 1.4449467697243692, + "grad_norm": 1.1643601655960083, + "learning_rate": 1.4302884615384614e-05, + "loss": 0.6383, + "step": 2477 + }, + { + "epoch": 1.4455301152107336, + "grad_norm": 1.2481948137283325, + "learning_rate": 1.428786057692308e-05, + "loss": 0.8129, + "step": 2478 + }, + { + "epoch": 1.4461134606970978, + "grad_norm": 1.904402494430542, + "learning_rate": 1.427283653846154e-05, + "loss": 0.6686, + "step": 2479 + }, + { + "epoch": 1.4466968061834622, + "grad_norm": 1.0331618785858154, + "learning_rate": 1.42578125e-05, + "loss": 0.6951, + "step": 2480 + }, + { + "epoch": 1.4472801516698264, + "grad_norm": 1.3622822761535645, + "learning_rate": 1.4242788461538462e-05, + "loss": 0.8933, + "step": 2481 + }, + { + "epoch": 1.4478634971561908, + "grad_norm": 1.3097389936447144, + "learning_rate": 1.4227764423076922e-05, + "loss": 0.8643, + "step": 2482 + }, + { + "epoch": 1.448446842642555, + "grad_norm": 1.1944518089294434, + "learning_rate": 1.4212740384615386e-05, + "loss": 0.7313, + "step": 2483 + }, + { + "epoch": 1.4490301881289194, + "grad_norm": 1.163017749786377, + "learning_rate": 1.4197716346153848e-05, + "loss": 0.8936, + "step": 2484 + }, + { + "epoch": 1.4496135336152838, + "grad_norm": 1.350412130355835, + "learning_rate": 1.4182692307692308e-05, + "loss": 0.6471, + "step": 2485 + }, + { + "epoch": 1.450196879101648, + "grad_norm": 1.1196433305740356, + "learning_rate": 1.4167668269230768e-05, + "loss": 0.9139, + "step": 2486 + }, + { + "epoch": 1.4507802245880121, + "grad_norm": 1.0108141899108887, + "learning_rate": 1.4152644230769233e-05, + "loss": 0.7733, + "step": 2487 + }, + { + "epoch": 1.4513635700743766, + "grad_norm": 1.1067391633987427, + "learning_rate": 1.4137620192307694e-05, + "loss": 0.5762, + "step": 2488 + }, + { + "epoch": 1.451946915560741, + "grad_norm": 1.3588923215866089, + "learning_rate": 1.4122596153846154e-05, + "loss": 0.9662, + "step": 2489 + }, + { + "epoch": 1.4525302610471051, + "grad_norm": 1.15555739402771, + "learning_rate": 1.4107572115384616e-05, + "loss": 0.7992, + "step": 2490 + }, + { + "epoch": 1.4531136065334693, + "grad_norm": 1.3854424953460693, + "learning_rate": 1.409254807692308e-05, + "loss": 0.7705, + "step": 2491 + }, + { + "epoch": 1.4536969520198337, + "grad_norm": 1.2606521844863892, + "learning_rate": 1.407752403846154e-05, + "loss": 0.7152, + "step": 2492 + }, + { + "epoch": 1.4542802975061981, + "grad_norm": 1.3292980194091797, + "learning_rate": 1.4062500000000001e-05, + "loss": 0.9197, + "step": 2493 + }, + { + "epoch": 1.4548636429925623, + "grad_norm": 1.5681556463241577, + "learning_rate": 1.4047475961538462e-05, + "loss": 0.811, + "step": 2494 + }, + { + "epoch": 1.4554469884789265, + "grad_norm": 1.1809593439102173, + "learning_rate": 1.4032451923076922e-05, + "loss": 1.0052, + "step": 2495 + }, + { + "epoch": 1.456030333965291, + "grad_norm": 1.1254372596740723, + "learning_rate": 1.4017427884615387e-05, + "loss": 0.7758, + "step": 2496 + }, + { + "epoch": 1.4566136794516553, + "grad_norm": 1.2921695709228516, + "learning_rate": 1.4002403846153847e-05, + "loss": 0.7628, + "step": 2497 + }, + { + "epoch": 1.4571970249380195, + "grad_norm": 1.3415536880493164, + "learning_rate": 1.3987379807692307e-05, + "loss": 0.6855, + "step": 2498 + }, + { + "epoch": 1.457780370424384, + "grad_norm": 1.3940538167953491, + "learning_rate": 1.397235576923077e-05, + "loss": 0.7203, + "step": 2499 + }, + { + "epoch": 1.458363715910748, + "grad_norm": 1.2635236978530884, + "learning_rate": 1.3957331730769233e-05, + "loss": 0.843, + "step": 2500 + }, + { + "epoch": 1.4589470613971125, + "grad_norm": 1.1825194358825684, + "learning_rate": 1.3942307692307693e-05, + "loss": 0.6945, + "step": 2501 + }, + { + "epoch": 1.4595304068834767, + "grad_norm": 1.2193362712860107, + "learning_rate": 1.3927283653846155e-05, + "loss": 0.787, + "step": 2502 + }, + { + "epoch": 1.460113752369841, + "grad_norm": 1.249801754951477, + "learning_rate": 1.3912259615384615e-05, + "loss": 0.856, + "step": 2503 + }, + { + "epoch": 1.4606970978562053, + "grad_norm": 1.1366738080978394, + "learning_rate": 1.3897235576923079e-05, + "loss": 0.696, + "step": 2504 + }, + { + "epoch": 1.4612804433425697, + "grad_norm": 1.3033206462860107, + "learning_rate": 1.388221153846154e-05, + "loss": 0.8147, + "step": 2505 + }, + { + "epoch": 1.4618637888289339, + "grad_norm": 1.3203999996185303, + "learning_rate": 1.38671875e-05, + "loss": 0.5789, + "step": 2506 + }, + { + "epoch": 1.4624471343152983, + "grad_norm": 1.3760713338851929, + "learning_rate": 1.3852163461538461e-05, + "loss": 0.7991, + "step": 2507 + }, + { + "epoch": 1.4630304798016625, + "grad_norm": 1.2778420448303223, + "learning_rate": 1.3837139423076923e-05, + "loss": 0.6555, + "step": 2508 + }, + { + "epoch": 1.4636138252880269, + "grad_norm": 1.1887472867965698, + "learning_rate": 1.3822115384615386e-05, + "loss": 0.8536, + "step": 2509 + }, + { + "epoch": 1.464197170774391, + "grad_norm": 1.0331052541732788, + "learning_rate": 1.3807091346153847e-05, + "loss": 0.6553, + "step": 2510 + }, + { + "epoch": 1.4647805162607554, + "grad_norm": 1.2358107566833496, + "learning_rate": 1.3792067307692309e-05, + "loss": 0.8954, + "step": 2511 + }, + { + "epoch": 1.4653638617471199, + "grad_norm": 1.4614425897598267, + "learning_rate": 1.3777043269230769e-05, + "loss": 0.7539, + "step": 2512 + }, + { + "epoch": 1.465947207233484, + "grad_norm": 1.2347095012664795, + "learning_rate": 1.3762019230769232e-05, + "loss": 0.8088, + "step": 2513 + }, + { + "epoch": 1.4665305527198482, + "grad_norm": 1.299397587776184, + "learning_rate": 1.3746995192307694e-05, + "loss": 0.5135, + "step": 2514 + }, + { + "epoch": 1.4671138982062126, + "grad_norm": 1.1845791339874268, + "learning_rate": 1.3731971153846154e-05, + "loss": 0.9701, + "step": 2515 + }, + { + "epoch": 1.467697243692577, + "grad_norm": 1.3026585578918457, + "learning_rate": 1.3716947115384615e-05, + "loss": 0.8863, + "step": 2516 + }, + { + "epoch": 1.4682805891789412, + "grad_norm": 1.1109139919281006, + "learning_rate": 1.3701923076923078e-05, + "loss": 0.703, + "step": 2517 + }, + { + "epoch": 1.4688639346653054, + "grad_norm": 1.1830486059188843, + "learning_rate": 1.368689903846154e-05, + "loss": 0.7957, + "step": 2518 + }, + { + "epoch": 1.4694472801516698, + "grad_norm": 1.1587508916854858, + "learning_rate": 1.3671875e-05, + "loss": 1.0695, + "step": 2519 + }, + { + "epoch": 1.4700306256380342, + "grad_norm": 1.0737591981887817, + "learning_rate": 1.3656850961538462e-05, + "loss": 0.7813, + "step": 2520 + }, + { + "epoch": 1.4706139711243984, + "grad_norm": 1.2657713890075684, + "learning_rate": 1.3641826923076922e-05, + "loss": 0.7455, + "step": 2521 + }, + { + "epoch": 1.4711973166107628, + "grad_norm": 1.3846890926361084, + "learning_rate": 1.3626802884615386e-05, + "loss": 0.7651, + "step": 2522 + }, + { + "epoch": 1.471780662097127, + "grad_norm": 1.0715590715408325, + "learning_rate": 1.3611778846153848e-05, + "loss": 0.8357, + "step": 2523 + }, + { + "epoch": 1.4723640075834914, + "grad_norm": 1.2787429094314575, + "learning_rate": 1.3596754807692308e-05, + "loss": 0.8817, + "step": 2524 + }, + { + "epoch": 1.4729473530698556, + "grad_norm": 1.324184536933899, + "learning_rate": 1.3581730769230768e-05, + "loss": 0.8724, + "step": 2525 + }, + { + "epoch": 1.47353069855622, + "grad_norm": 1.1958729028701782, + "learning_rate": 1.3566706730769232e-05, + "loss": 0.8143, + "step": 2526 + }, + { + "epoch": 1.4741140440425842, + "grad_norm": 0.9904715418815613, + "learning_rate": 1.3551682692307694e-05, + "loss": 0.8887, + "step": 2527 + }, + { + "epoch": 1.4746973895289486, + "grad_norm": 1.2931222915649414, + "learning_rate": 1.3536658653846154e-05, + "loss": 0.7175, + "step": 2528 + }, + { + "epoch": 1.4752807350153128, + "grad_norm": 1.3142303228378296, + "learning_rate": 1.3521634615384616e-05, + "loss": 0.7999, + "step": 2529 + }, + { + "epoch": 1.4758640805016772, + "grad_norm": 1.3524141311645508, + "learning_rate": 1.350661057692308e-05, + "loss": 0.7423, + "step": 2530 + }, + { + "epoch": 1.4764474259880414, + "grad_norm": 1.4455997943878174, + "learning_rate": 1.349158653846154e-05, + "loss": 0.5973, + "step": 2531 + }, + { + "epoch": 1.4770307714744058, + "grad_norm": 1.367997169494629, + "learning_rate": 1.3476562500000001e-05, + "loss": 0.7174, + "step": 2532 + }, + { + "epoch": 1.47761411696077, + "grad_norm": 1.2403534650802612, + "learning_rate": 1.3461538461538462e-05, + "loss": 0.8182, + "step": 2533 + }, + { + "epoch": 1.4781974624471343, + "grad_norm": 1.1000239849090576, + "learning_rate": 1.3446514423076922e-05, + "loss": 0.8016, + "step": 2534 + }, + { + "epoch": 1.4787808079334988, + "grad_norm": 1.4198569059371948, + "learning_rate": 1.3431490384615385e-05, + "loss": 0.6896, + "step": 2535 + }, + { + "epoch": 1.479364153419863, + "grad_norm": 1.2650169134140015, + "learning_rate": 1.3416466346153847e-05, + "loss": 0.7742, + "step": 2536 + }, + { + "epoch": 1.4799474989062271, + "grad_norm": 1.4280163049697876, + "learning_rate": 1.3401442307692308e-05, + "loss": 0.9482, + "step": 2537 + }, + { + "epoch": 1.4805308443925915, + "grad_norm": 1.2631523609161377, + "learning_rate": 1.338641826923077e-05, + "loss": 0.7142, + "step": 2538 + }, + { + "epoch": 1.481114189878956, + "grad_norm": 1.236832857131958, + "learning_rate": 1.3371394230769233e-05, + "loss": 0.6786, + "step": 2539 + }, + { + "epoch": 1.4816975353653201, + "grad_norm": 1.2481725215911865, + "learning_rate": 1.3356370192307693e-05, + "loss": 0.8536, + "step": 2540 + }, + { + "epoch": 1.4822808808516843, + "grad_norm": 1.3706049919128418, + "learning_rate": 1.3341346153846155e-05, + "loss": 0.725, + "step": 2541 + }, + { + "epoch": 1.4828642263380487, + "grad_norm": 1.2054919004440308, + "learning_rate": 1.3326322115384615e-05, + "loss": 0.9378, + "step": 2542 + }, + { + "epoch": 1.4834475718244131, + "grad_norm": 1.659885048866272, + "learning_rate": 1.3311298076923079e-05, + "loss": 0.7564, + "step": 2543 + }, + { + "epoch": 1.4840309173107773, + "grad_norm": 1.3692610263824463, + "learning_rate": 1.3296274038461539e-05, + "loss": 0.8413, + "step": 2544 + }, + { + "epoch": 1.4846142627971415, + "grad_norm": 1.287308931350708, + "learning_rate": 1.3281250000000001e-05, + "loss": 0.695, + "step": 2545 + }, + { + "epoch": 1.4851976082835059, + "grad_norm": 1.5530636310577393, + "learning_rate": 1.3266225961538461e-05, + "loss": 0.7356, + "step": 2546 + }, + { + "epoch": 1.4857809537698703, + "grad_norm": 1.2729878425598145, + "learning_rate": 1.3251201923076923e-05, + "loss": 0.7533, + "step": 2547 + }, + { + "epoch": 1.4863642992562345, + "grad_norm": 1.4389548301696777, + "learning_rate": 1.3236177884615387e-05, + "loss": 0.8441, + "step": 2548 + }, + { + "epoch": 1.4869476447425989, + "grad_norm": 1.0692722797393799, + "learning_rate": 1.3221153846153847e-05, + "loss": 0.8289, + "step": 2549 + }, + { + "epoch": 1.487530990228963, + "grad_norm": 1.1718000173568726, + "learning_rate": 1.3206129807692309e-05, + "loss": 0.7955, + "step": 2550 + }, + { + "epoch": 1.4881143357153275, + "grad_norm": 1.1499435901641846, + "learning_rate": 1.3191105769230769e-05, + "loss": 0.7256, + "step": 2551 + }, + { + "epoch": 1.4886976812016917, + "grad_norm": 1.250289797782898, + "learning_rate": 1.3176081730769232e-05, + "loss": 0.8081, + "step": 2552 + }, + { + "epoch": 1.489281026688056, + "grad_norm": 1.243996024131775, + "learning_rate": 1.3161057692307693e-05, + "loss": 0.6775, + "step": 2553 + }, + { + "epoch": 1.4898643721744202, + "grad_norm": 1.2744098901748657, + "learning_rate": 1.3146033653846155e-05, + "loss": 0.8471, + "step": 2554 + }, + { + "epoch": 1.4904477176607847, + "grad_norm": 1.2613190412521362, + "learning_rate": 1.3131009615384615e-05, + "loss": 0.7044, + "step": 2555 + }, + { + "epoch": 1.4910310631471488, + "grad_norm": 1.4642215967178345, + "learning_rate": 1.3115985576923078e-05, + "loss": 1.0189, + "step": 2556 + }, + { + "epoch": 1.4916144086335132, + "grad_norm": 1.1679459810256958, + "learning_rate": 1.310096153846154e-05, + "loss": 0.8816, + "step": 2557 + }, + { + "epoch": 1.4921977541198774, + "grad_norm": 1.1182615756988525, + "learning_rate": 1.30859375e-05, + "loss": 0.7353, + "step": 2558 + }, + { + "epoch": 1.4927810996062418, + "grad_norm": 1.2033658027648926, + "learning_rate": 1.3070913461538462e-05, + "loss": 0.7741, + "step": 2559 + }, + { + "epoch": 1.493364445092606, + "grad_norm": 1.1669368743896484, + "learning_rate": 1.3055889423076923e-05, + "loss": 0.8408, + "step": 2560 + }, + { + "epoch": 1.4939477905789704, + "grad_norm": 1.4733059406280518, + "learning_rate": 1.3040865384615386e-05, + "loss": 0.7797, + "step": 2561 + }, + { + "epoch": 1.4945311360653348, + "grad_norm": 1.431363582611084, + "learning_rate": 1.3025841346153846e-05, + "loss": 0.794, + "step": 2562 + }, + { + "epoch": 1.495114481551699, + "grad_norm": 0.9812197685241699, + "learning_rate": 1.3010817307692308e-05, + "loss": 0.6596, + "step": 2563 + }, + { + "epoch": 1.4956978270380632, + "grad_norm": 1.124457597732544, + "learning_rate": 1.2995793269230768e-05, + "loss": 0.906, + "step": 2564 + }, + { + "epoch": 1.4962811725244276, + "grad_norm": 1.3098465204238892, + "learning_rate": 1.2980769230769232e-05, + "loss": 1.0137, + "step": 2565 + }, + { + "epoch": 1.496864518010792, + "grad_norm": 1.0503084659576416, + "learning_rate": 1.2965745192307694e-05, + "loss": 0.7207, + "step": 2566 + }, + { + "epoch": 1.4974478634971562, + "grad_norm": 1.2493394613265991, + "learning_rate": 1.2950721153846154e-05, + "loss": 0.9324, + "step": 2567 + }, + { + "epoch": 1.4980312089835204, + "grad_norm": 1.1940001249313354, + "learning_rate": 1.2935697115384616e-05, + "loss": 0.7695, + "step": 2568 + }, + { + "epoch": 1.4986145544698848, + "grad_norm": 1.2989429235458374, + "learning_rate": 1.292067307692308e-05, + "loss": 0.5233, + "step": 2569 + }, + { + "epoch": 1.4991978999562492, + "grad_norm": 1.1896779537200928, + "learning_rate": 1.290564903846154e-05, + "loss": 0.8223, + "step": 2570 + }, + { + "epoch": 1.4997812454426134, + "grad_norm": 1.2201910018920898, + "learning_rate": 1.2890625e-05, + "loss": 0.7695, + "step": 2571 + }, + { + "epoch": 1.5003645909289776, + "grad_norm": 1.2268931865692139, + "learning_rate": 1.2875600961538462e-05, + "loss": 0.8114, + "step": 2572 + }, + { + "epoch": 1.500947936415342, + "grad_norm": 1.1898248195648193, + "learning_rate": 1.2860576923076922e-05, + "loss": 0.8477, + "step": 2573 + }, + { + "epoch": 1.5015312819017064, + "grad_norm": 1.4626970291137695, + "learning_rate": 1.2845552884615386e-05, + "loss": 0.6557, + "step": 2574 + }, + { + "epoch": 1.5021146273880706, + "grad_norm": 1.2246737480163574, + "learning_rate": 1.2830528846153847e-05, + "loss": 0.9006, + "step": 2575 + }, + { + "epoch": 1.5026979728744347, + "grad_norm": 1.2037559747695923, + "learning_rate": 1.2815504807692308e-05, + "loss": 0.8817, + "step": 2576 + }, + { + "epoch": 1.5032813183607991, + "grad_norm": 1.3660948276519775, + "learning_rate": 1.280048076923077e-05, + "loss": 0.6706, + "step": 2577 + }, + { + "epoch": 1.5038646638471636, + "grad_norm": 1.359358787536621, + "learning_rate": 1.2785456730769233e-05, + "loss": 0.8003, + "step": 2578 + }, + { + "epoch": 1.5044480093335277, + "grad_norm": 1.2577052116394043, + "learning_rate": 1.2770432692307693e-05, + "loss": 0.9274, + "step": 2579 + }, + { + "epoch": 1.505031354819892, + "grad_norm": 1.2104783058166504, + "learning_rate": 1.2755408653846154e-05, + "loss": 0.787, + "step": 2580 + }, + { + "epoch": 1.5056147003062563, + "grad_norm": 1.3401927947998047, + "learning_rate": 1.2740384615384615e-05, + "loss": 0.7528, + "step": 2581 + }, + { + "epoch": 1.5061980457926207, + "grad_norm": 1.3545399904251099, + "learning_rate": 1.2725360576923079e-05, + "loss": 0.7889, + "step": 2582 + }, + { + "epoch": 1.506781391278985, + "grad_norm": 1.1138622760772705, + "learning_rate": 1.271033653846154e-05, + "loss": 0.9997, + "step": 2583 + }, + { + "epoch": 1.5073647367653493, + "grad_norm": 1.2229743003845215, + "learning_rate": 1.2695312500000001e-05, + "loss": 0.7661, + "step": 2584 + }, + { + "epoch": 1.5079480822517137, + "grad_norm": 1.1078226566314697, + "learning_rate": 1.2680288461538461e-05, + "loss": 0.6252, + "step": 2585 + }, + { + "epoch": 1.508531427738078, + "grad_norm": 1.1622321605682373, + "learning_rate": 1.2665264423076923e-05, + "loss": 0.8268, + "step": 2586 + }, + { + "epoch": 1.509114773224442, + "grad_norm": 1.3086098432540894, + "learning_rate": 1.2650240384615387e-05, + "loss": 0.8047, + "step": 2587 + }, + { + "epoch": 1.5096981187108065, + "grad_norm": 1.4337576627731323, + "learning_rate": 1.2635216346153847e-05, + "loss": 0.8714, + "step": 2588 + }, + { + "epoch": 1.510281464197171, + "grad_norm": 1.358109951019287, + "learning_rate": 1.2620192307692307e-05, + "loss": 0.7723, + "step": 2589 + }, + { + "epoch": 1.510864809683535, + "grad_norm": 1.1631211042404175, + "learning_rate": 1.2605168269230769e-05, + "loss": 0.788, + "step": 2590 + }, + { + "epoch": 1.5114481551698993, + "grad_norm": 1.421904444694519, + "learning_rate": 1.2590144230769233e-05, + "loss": 0.9168, + "step": 2591 + }, + { + "epoch": 1.5120315006562637, + "grad_norm": 1.1729179620742798, + "learning_rate": 1.2575120192307693e-05, + "loss": 0.9585, + "step": 2592 + }, + { + "epoch": 1.512614846142628, + "grad_norm": 1.3910799026489258, + "learning_rate": 1.2560096153846155e-05, + "loss": 0.6919, + "step": 2593 + }, + { + "epoch": 1.5131981916289923, + "grad_norm": 1.3072963953018188, + "learning_rate": 1.2545072115384615e-05, + "loss": 0.7613, + "step": 2594 + }, + { + "epoch": 1.5137815371153565, + "grad_norm": 1.4264005422592163, + "learning_rate": 1.2530048076923079e-05, + "loss": 0.9186, + "step": 2595 + }, + { + "epoch": 1.5143648826017209, + "grad_norm": 1.0973105430603027, + "learning_rate": 1.251502403846154e-05, + "loss": 0.6772, + "step": 2596 + }, + { + "epoch": 1.5149482280880853, + "grad_norm": 1.4456596374511719, + "learning_rate": 1.25e-05, + "loss": 0.7738, + "step": 2597 + }, + { + "epoch": 1.5155315735744495, + "grad_norm": 1.246766209602356, + "learning_rate": 1.248497596153846e-05, + "loss": 0.9088, + "step": 2598 + }, + { + "epoch": 1.5161149190608136, + "grad_norm": 1.170255422592163, + "learning_rate": 1.2469951923076924e-05, + "loss": 0.8891, + "step": 2599 + }, + { + "epoch": 1.516698264547178, + "grad_norm": 1.2451422214508057, + "learning_rate": 1.2454927884615385e-05, + "loss": 0.8856, + "step": 2600 + }, + { + "epoch": 1.516698264547178, + "eval_loss_squad": 0.8741619079979137, + "eval_perplexity": 7.975801635804587, + "eval_perplexity_reconstruct": 1.8950837793713629, + "step": 2600 + }, + { + "epoch": 1.5172816100335424, + "grad_norm": 1.1058474779129028, + "learning_rate": 1.2439903846153846e-05, + "loss": 0.8623, + "step": 2601 + }, + { + "epoch": 1.5178649555199066, + "grad_norm": 1.1985957622528076, + "learning_rate": 1.2424879807692308e-05, + "loss": 0.5664, + "step": 2602 + }, + { + "epoch": 1.5184483010062708, + "grad_norm": 1.2489391565322876, + "learning_rate": 1.240985576923077e-05, + "loss": 0.9798, + "step": 2603 + }, + { + "epoch": 1.5190316464926352, + "grad_norm": 1.2178716659545898, + "learning_rate": 1.239483173076923e-05, + "loss": 0.8585, + "step": 2604 + }, + { + "epoch": 1.5196149919789996, + "grad_norm": 1.065341591835022, + "learning_rate": 1.2379807692307694e-05, + "loss": 0.6899, + "step": 2605 + }, + { + "epoch": 1.5201983374653638, + "grad_norm": 1.3119381666183472, + "learning_rate": 1.2364783653846154e-05, + "loss": 0.743, + "step": 2606 + }, + { + "epoch": 1.520781682951728, + "grad_norm": 1.1781443357467651, + "learning_rate": 1.2349759615384616e-05, + "loss": 0.7418, + "step": 2607 + }, + { + "epoch": 1.5213650284380926, + "grad_norm": 1.3264269828796387, + "learning_rate": 1.2334735576923078e-05, + "loss": 0.7401, + "step": 2608 + }, + { + "epoch": 1.5219483739244568, + "grad_norm": 1.2371203899383545, + "learning_rate": 1.231971153846154e-05, + "loss": 0.6457, + "step": 2609 + }, + { + "epoch": 1.522531719410821, + "grad_norm": 1.1963739395141602, + "learning_rate": 1.23046875e-05, + "loss": 0.871, + "step": 2610 + }, + { + "epoch": 1.5231150648971854, + "grad_norm": 1.4940956830978394, + "learning_rate": 1.2289663461538462e-05, + "loss": 0.6387, + "step": 2611 + }, + { + "epoch": 1.5236984103835498, + "grad_norm": 1.158118724822998, + "learning_rate": 1.2274639423076924e-05, + "loss": 0.8764, + "step": 2612 + }, + { + "epoch": 1.524281755869914, + "grad_norm": 1.2709519863128662, + "learning_rate": 1.2259615384615384e-05, + "loss": 0.795, + "step": 2613 + }, + { + "epoch": 1.5248651013562782, + "grad_norm": 1.1874806880950928, + "learning_rate": 1.2244591346153848e-05, + "loss": 0.6796, + "step": 2614 + }, + { + "epoch": 1.5254484468426426, + "grad_norm": 1.2645267248153687, + "learning_rate": 1.2229567307692308e-05, + "loss": 0.6134, + "step": 2615 + }, + { + "epoch": 1.526031792329007, + "grad_norm": 1.4414528608322144, + "learning_rate": 1.221454326923077e-05, + "loss": 0.661, + "step": 2616 + }, + { + "epoch": 1.5266151378153712, + "grad_norm": 1.1239551305770874, + "learning_rate": 1.2199519230769232e-05, + "loss": 0.6612, + "step": 2617 + }, + { + "epoch": 1.5271984833017354, + "grad_norm": 1.0646827220916748, + "learning_rate": 1.2184495192307694e-05, + "loss": 0.8923, + "step": 2618 + }, + { + "epoch": 1.5277818287880998, + "grad_norm": 1.2423840761184692, + "learning_rate": 1.2169471153846154e-05, + "loss": 1.1771, + "step": 2619 + }, + { + "epoch": 1.5283651742744642, + "grad_norm": 1.0845423936843872, + "learning_rate": 1.2154447115384617e-05, + "loss": 0.735, + "step": 2620 + }, + { + "epoch": 1.5289485197608284, + "grad_norm": 1.2939835786819458, + "learning_rate": 1.2139423076923077e-05, + "loss": 0.856, + "step": 2621 + }, + { + "epoch": 1.5295318652471925, + "grad_norm": 1.2328243255615234, + "learning_rate": 1.212439903846154e-05, + "loss": 0.7687, + "step": 2622 + }, + { + "epoch": 1.530115210733557, + "grad_norm": 1.2373398542404175, + "learning_rate": 1.2109375000000001e-05, + "loss": 0.7192, + "step": 2623 + }, + { + "epoch": 1.5306985562199213, + "grad_norm": 1.3078012466430664, + "learning_rate": 1.2094350961538461e-05, + "loss": 1.0764, + "step": 2624 + }, + { + "epoch": 1.5312819017062855, + "grad_norm": 1.34095299243927, + "learning_rate": 1.2079326923076923e-05, + "loss": 0.7552, + "step": 2625 + }, + { + "epoch": 1.5318652471926497, + "grad_norm": 1.2611044645309448, + "learning_rate": 1.2064302884615385e-05, + "loss": 0.5605, + "step": 2626 + }, + { + "epoch": 1.5324485926790141, + "grad_norm": 1.3855218887329102, + "learning_rate": 1.2049278846153847e-05, + "loss": 0.7463, + "step": 2627 + }, + { + "epoch": 1.5330319381653785, + "grad_norm": 1.3558884859085083, + "learning_rate": 1.2034254807692307e-05, + "loss": 0.8794, + "step": 2628 + }, + { + "epoch": 1.5336152836517427, + "grad_norm": 1.287604570388794, + "learning_rate": 1.2019230769230771e-05, + "loss": 0.8108, + "step": 2629 + }, + { + "epoch": 1.534198629138107, + "grad_norm": 1.35361909866333, + "learning_rate": 1.2004206730769231e-05, + "loss": 0.7766, + "step": 2630 + }, + { + "epoch": 1.5347819746244713, + "grad_norm": 1.1538270711898804, + "learning_rate": 1.1989182692307693e-05, + "loss": 0.8437, + "step": 2631 + }, + { + "epoch": 1.5353653201108357, + "grad_norm": 1.212172031402588, + "learning_rate": 1.1974158653846155e-05, + "loss": 0.9256, + "step": 2632 + }, + { + "epoch": 1.5359486655972, + "grad_norm": 1.2593048810958862, + "learning_rate": 1.1959134615384617e-05, + "loss": 0.8088, + "step": 2633 + }, + { + "epoch": 1.536532011083564, + "grad_norm": 1.1822506189346313, + "learning_rate": 1.1944110576923077e-05, + "loss": 1.1436, + "step": 2634 + }, + { + "epoch": 1.5371153565699287, + "grad_norm": 1.1885801553726196, + "learning_rate": 1.1929086538461539e-05, + "loss": 0.7486, + "step": 2635 + }, + { + "epoch": 1.5376987020562929, + "grad_norm": 1.276310920715332, + "learning_rate": 1.19140625e-05, + "loss": 0.8676, + "step": 2636 + }, + { + "epoch": 1.538282047542657, + "grad_norm": 1.1194807291030884, + "learning_rate": 1.1899038461538461e-05, + "loss": 0.8396, + "step": 2637 + }, + { + "epoch": 1.5388653930290215, + "grad_norm": 1.291012167930603, + "learning_rate": 1.1884014423076925e-05, + "loss": 0.9238, + "step": 2638 + }, + { + "epoch": 1.5394487385153859, + "grad_norm": 1.2062819004058838, + "learning_rate": 1.1868990384615385e-05, + "loss": 0.714, + "step": 2639 + }, + { + "epoch": 1.54003208400175, + "grad_norm": 1.2483253479003906, + "learning_rate": 1.1853966346153847e-05, + "loss": 0.7584, + "step": 2640 + }, + { + "epoch": 1.5406154294881143, + "grad_norm": 1.4191927909851074, + "learning_rate": 1.1838942307692309e-05, + "loss": 0.9147, + "step": 2641 + }, + { + "epoch": 1.5411987749744787, + "grad_norm": 1.292017936706543, + "learning_rate": 1.182391826923077e-05, + "loss": 0.783, + "step": 2642 + }, + { + "epoch": 1.541782120460843, + "grad_norm": 1.4348483085632324, + "learning_rate": 1.180889423076923e-05, + "loss": 0.8497, + "step": 2643 + }, + { + "epoch": 1.5423654659472072, + "grad_norm": 1.4889435768127441, + "learning_rate": 1.1793870192307692e-05, + "loss": 0.8401, + "step": 2644 + }, + { + "epoch": 1.5429488114335714, + "grad_norm": 1.2667125463485718, + "learning_rate": 1.1778846153846154e-05, + "loss": 0.8579, + "step": 2645 + }, + { + "epoch": 1.5435321569199358, + "grad_norm": 1.2864303588867188, + "learning_rate": 1.1763822115384616e-05, + "loss": 0.7385, + "step": 2646 + }, + { + "epoch": 1.5441155024063002, + "grad_norm": 1.079271912574768, + "learning_rate": 1.1748798076923078e-05, + "loss": 0.7407, + "step": 2647 + }, + { + "epoch": 1.5446988478926644, + "grad_norm": 1.0965354442596436, + "learning_rate": 1.173377403846154e-05, + "loss": 0.7776, + "step": 2648 + }, + { + "epoch": 1.5452821933790286, + "grad_norm": 1.260859489440918, + "learning_rate": 1.171875e-05, + "loss": 0.7992, + "step": 2649 + }, + { + "epoch": 1.545865538865393, + "grad_norm": 1.2462226152420044, + "learning_rate": 1.1703725961538462e-05, + "loss": 1.0434, + "step": 2650 + }, + { + "epoch": 1.5464488843517574, + "grad_norm": 1.1139503717422485, + "learning_rate": 1.1688701923076924e-05, + "loss": 0.6601, + "step": 2651 + }, + { + "epoch": 1.5470322298381216, + "grad_norm": 1.375958800315857, + "learning_rate": 1.1673677884615384e-05, + "loss": 0.8542, + "step": 2652 + }, + { + "epoch": 1.5476155753244858, + "grad_norm": 1.268680214881897, + "learning_rate": 1.1658653846153846e-05, + "loss": 0.9015, + "step": 2653 + }, + { + "epoch": 1.5481989208108502, + "grad_norm": 1.3285831212997437, + "learning_rate": 1.1643629807692308e-05, + "loss": 0.7092, + "step": 2654 + }, + { + "epoch": 1.5487822662972146, + "grad_norm": 1.4205631017684937, + "learning_rate": 1.162860576923077e-05, + "loss": 0.7721, + "step": 2655 + }, + { + "epoch": 1.5493656117835788, + "grad_norm": 1.3641010522842407, + "learning_rate": 1.1613581730769232e-05, + "loss": 0.6439, + "step": 2656 + }, + { + "epoch": 1.549948957269943, + "grad_norm": 1.1985429525375366, + "learning_rate": 1.1598557692307694e-05, + "loss": 0.7375, + "step": 2657 + }, + { + "epoch": 1.5505323027563074, + "grad_norm": 1.200697898864746, + "learning_rate": 1.1583533653846154e-05, + "loss": 0.7262, + "step": 2658 + }, + { + "epoch": 1.5511156482426718, + "grad_norm": 1.057250738143921, + "learning_rate": 1.1568509615384616e-05, + "loss": 0.9285, + "step": 2659 + }, + { + "epoch": 1.551698993729036, + "grad_norm": 1.2788975238800049, + "learning_rate": 1.1553485576923078e-05, + "loss": 0.7772, + "step": 2660 + }, + { + "epoch": 1.5522823392154004, + "grad_norm": 1.139838695526123, + "learning_rate": 1.153846153846154e-05, + "loss": 0.949, + "step": 2661 + }, + { + "epoch": 1.5528656847017648, + "grad_norm": 1.4383840560913086, + "learning_rate": 1.15234375e-05, + "loss": 0.7398, + "step": 2662 + }, + { + "epoch": 1.553449030188129, + "grad_norm": 1.2270270586013794, + "learning_rate": 1.1508413461538462e-05, + "loss": 0.8875, + "step": 2663 + }, + { + "epoch": 1.5540323756744931, + "grad_norm": 1.2396215200424194, + "learning_rate": 1.1493389423076924e-05, + "loss": 0.8724, + "step": 2664 + }, + { + "epoch": 1.5546157211608576, + "grad_norm": 1.183029294013977, + "learning_rate": 1.1478365384615385e-05, + "loss": 0.9306, + "step": 2665 + }, + { + "epoch": 1.555199066647222, + "grad_norm": 1.2103503942489624, + "learning_rate": 1.1463341346153847e-05, + "loss": 0.7269, + "step": 2666 + }, + { + "epoch": 1.5557824121335861, + "grad_norm": 1.5745123624801636, + "learning_rate": 1.1448317307692307e-05, + "loss": 0.5633, + "step": 2667 + }, + { + "epoch": 1.5563657576199503, + "grad_norm": 1.3425438404083252, + "learning_rate": 1.143329326923077e-05, + "loss": 0.6148, + "step": 2668 + }, + { + "epoch": 1.5569491031063147, + "grad_norm": 1.2065016031265259, + "learning_rate": 1.1418269230769231e-05, + "loss": 0.6517, + "step": 2669 + }, + { + "epoch": 1.5575324485926791, + "grad_norm": 1.1521174907684326, + "learning_rate": 1.1403245192307693e-05, + "loss": 0.7774, + "step": 2670 + }, + { + "epoch": 1.5581157940790433, + "grad_norm": 1.0685138702392578, + "learning_rate": 1.1388221153846153e-05, + "loss": 0.8086, + "step": 2671 + }, + { + "epoch": 1.5586991395654075, + "grad_norm": 1.131866216659546, + "learning_rate": 1.1373197115384617e-05, + "loss": 0.8954, + "step": 2672 + }, + { + "epoch": 1.559282485051772, + "grad_norm": 1.074537754058838, + "learning_rate": 1.1358173076923077e-05, + "loss": 0.8301, + "step": 2673 + }, + { + "epoch": 1.5598658305381363, + "grad_norm": 1.255470633506775, + "learning_rate": 1.1343149038461539e-05, + "loss": 0.7473, + "step": 2674 + }, + { + "epoch": 1.5604491760245005, + "grad_norm": 1.3070282936096191, + "learning_rate": 1.1328125000000001e-05, + "loss": 0.7232, + "step": 2675 + }, + { + "epoch": 1.5610325215108647, + "grad_norm": 1.4808948040008545, + "learning_rate": 1.1313100961538461e-05, + "loss": 1.0816, + "step": 2676 + }, + { + "epoch": 1.561615866997229, + "grad_norm": 1.2335243225097656, + "learning_rate": 1.1298076923076923e-05, + "loss": 0.9074, + "step": 2677 + }, + { + "epoch": 1.5621992124835935, + "grad_norm": 1.1924381256103516, + "learning_rate": 1.1283052884615385e-05, + "loss": 0.6063, + "step": 2678 + }, + { + "epoch": 1.5627825579699577, + "grad_norm": 1.2754387855529785, + "learning_rate": 1.1268028846153847e-05, + "loss": 0.8983, + "step": 2679 + }, + { + "epoch": 1.5633659034563219, + "grad_norm": 1.326870083808899, + "learning_rate": 1.1253004807692307e-05, + "loss": 0.6428, + "step": 2680 + }, + { + "epoch": 1.5639492489426863, + "grad_norm": 1.1194915771484375, + "learning_rate": 1.123798076923077e-05, + "loss": 0.6659, + "step": 2681 + }, + { + "epoch": 1.5645325944290507, + "grad_norm": 1.119391679763794, + "learning_rate": 1.122295673076923e-05, + "loss": 0.8714, + "step": 2682 + }, + { + "epoch": 1.5651159399154149, + "grad_norm": 1.299011468887329, + "learning_rate": 1.1207932692307693e-05, + "loss": 0.8289, + "step": 2683 + }, + { + "epoch": 1.565699285401779, + "grad_norm": 1.0118248462677002, + "learning_rate": 1.1192908653846155e-05, + "loss": 0.6258, + "step": 2684 + }, + { + "epoch": 1.5662826308881435, + "grad_norm": 1.1196413040161133, + "learning_rate": 1.1177884615384616e-05, + "loss": 0.7186, + "step": 2685 + }, + { + "epoch": 1.5668659763745079, + "grad_norm": 1.2307488918304443, + "learning_rate": 1.1162860576923077e-05, + "loss": 0.7569, + "step": 2686 + }, + { + "epoch": 1.567449321860872, + "grad_norm": 1.2402465343475342, + "learning_rate": 1.114783653846154e-05, + "loss": 0.8829, + "step": 2687 + }, + { + "epoch": 1.5680326673472365, + "grad_norm": 1.262913465499878, + "learning_rate": 1.11328125e-05, + "loss": 0.8594, + "step": 2688 + }, + { + "epoch": 1.5686160128336009, + "grad_norm": 1.1202564239501953, + "learning_rate": 1.111778846153846e-05, + "loss": 0.8337, + "step": 2689 + }, + { + "epoch": 1.569199358319965, + "grad_norm": 1.263912320137024, + "learning_rate": 1.1102764423076924e-05, + "loss": 0.875, + "step": 2690 + }, + { + "epoch": 1.5697827038063292, + "grad_norm": 1.3599956035614014, + "learning_rate": 1.1087740384615384e-05, + "loss": 0.7736, + "step": 2691 + }, + { + "epoch": 1.5703660492926936, + "grad_norm": 1.435336947441101, + "learning_rate": 1.1072716346153846e-05, + "loss": 0.5259, + "step": 2692 + }, + { + "epoch": 1.570949394779058, + "grad_norm": 1.3584843873977661, + "learning_rate": 1.1057692307692308e-05, + "loss": 0.6383, + "step": 2693 + }, + { + "epoch": 1.5715327402654222, + "grad_norm": 1.44851815700531, + "learning_rate": 1.104266826923077e-05, + "loss": 0.8313, + "step": 2694 + }, + { + "epoch": 1.5721160857517864, + "grad_norm": 1.316826343536377, + "learning_rate": 1.102764423076923e-05, + "loss": 0.6728, + "step": 2695 + }, + { + "epoch": 1.5726994312381508, + "grad_norm": 1.5356096029281616, + "learning_rate": 1.1012620192307694e-05, + "loss": 0.7044, + "step": 2696 + }, + { + "epoch": 1.5732827767245152, + "grad_norm": 1.1947566270828247, + "learning_rate": 1.0997596153846154e-05, + "loss": 0.6776, + "step": 2697 + }, + { + "epoch": 1.5738661222108794, + "grad_norm": 1.3364490270614624, + "learning_rate": 1.0982572115384616e-05, + "loss": 0.8211, + "step": 2698 + }, + { + "epoch": 1.5744494676972436, + "grad_norm": 1.2481436729431152, + "learning_rate": 1.0967548076923078e-05, + "loss": 0.9184, + "step": 2699 + }, + { + "epoch": 1.575032813183608, + "grad_norm": 1.224586009979248, + "learning_rate": 1.095252403846154e-05, + "loss": 0.6932, + "step": 2700 + }, + { + "epoch": 1.5756161586699724, + "grad_norm": 1.1394731998443604, + "learning_rate": 1.09375e-05, + "loss": 0.8501, + "step": 2701 + }, + { + "epoch": 1.5761995041563366, + "grad_norm": 1.1618777513504028, + "learning_rate": 1.0922475961538462e-05, + "loss": 0.9925, + "step": 2702 + }, + { + "epoch": 1.5767828496427008, + "grad_norm": 1.480178952217102, + "learning_rate": 1.0907451923076924e-05, + "loss": 0.8851, + "step": 2703 + }, + { + "epoch": 1.5773661951290652, + "grad_norm": 1.0637935400009155, + "learning_rate": 1.0892427884615384e-05, + "loss": 0.7666, + "step": 2704 + }, + { + "epoch": 1.5779495406154296, + "grad_norm": 1.2903245687484741, + "learning_rate": 1.0877403846153847e-05, + "loss": 0.7153, + "step": 2705 + }, + { + "epoch": 1.5785328861017938, + "grad_norm": 1.4703887701034546, + "learning_rate": 1.0862379807692308e-05, + "loss": 0.7709, + "step": 2706 + }, + { + "epoch": 1.579116231588158, + "grad_norm": 1.12290358543396, + "learning_rate": 1.084735576923077e-05, + "loss": 0.7594, + "step": 2707 + }, + { + "epoch": 1.5796995770745224, + "grad_norm": 1.511317253112793, + "learning_rate": 1.0832331730769231e-05, + "loss": 0.9694, + "step": 2708 + }, + { + "epoch": 1.5802829225608868, + "grad_norm": 1.799383282661438, + "learning_rate": 1.0817307692307693e-05, + "loss": 0.6799, + "step": 2709 + }, + { + "epoch": 1.580866268047251, + "grad_norm": 1.1799027919769287, + "learning_rate": 1.0802283653846154e-05, + "loss": 0.8357, + "step": 2710 + }, + { + "epoch": 1.5814496135336151, + "grad_norm": 1.2925686836242676, + "learning_rate": 1.0787259615384617e-05, + "loss": 0.6977, + "step": 2711 + }, + { + "epoch": 1.5820329590199795, + "grad_norm": 1.284294843673706, + "learning_rate": 1.0772235576923077e-05, + "loss": 0.8429, + "step": 2712 + }, + { + "epoch": 1.582616304506344, + "grad_norm": 1.1772273778915405, + "learning_rate": 1.075721153846154e-05, + "loss": 0.692, + "step": 2713 + }, + { + "epoch": 1.5831996499927081, + "grad_norm": 1.1449207067489624, + "learning_rate": 1.0742187500000001e-05, + "loss": 0.7098, + "step": 2714 + }, + { + "epoch": 1.5837829954790725, + "grad_norm": 1.4304105043411255, + "learning_rate": 1.0727163461538461e-05, + "loss": 0.8735, + "step": 2715 + }, + { + "epoch": 1.584366340965437, + "grad_norm": 1.024904727935791, + "learning_rate": 1.0712139423076923e-05, + "loss": 0.7593, + "step": 2716 + }, + { + "epoch": 1.5849496864518011, + "grad_norm": 1.2996480464935303, + "learning_rate": 1.0697115384615385e-05, + "loss": 0.7183, + "step": 2717 + }, + { + "epoch": 1.5855330319381653, + "grad_norm": 1.3686482906341553, + "learning_rate": 1.0682091346153847e-05, + "loss": 0.9928, + "step": 2718 + }, + { + "epoch": 1.5861163774245297, + "grad_norm": 1.2611160278320312, + "learning_rate": 1.0667067307692307e-05, + "loss": 0.937, + "step": 2719 + }, + { + "epoch": 1.5866997229108941, + "grad_norm": 1.0135339498519897, + "learning_rate": 1.065204326923077e-05, + "loss": 0.8119, + "step": 2720 + }, + { + "epoch": 1.5872830683972583, + "grad_norm": 1.1465262174606323, + "learning_rate": 1.0637019230769231e-05, + "loss": 0.788, + "step": 2721 + }, + { + "epoch": 1.5878664138836225, + "grad_norm": 1.3036527633666992, + "learning_rate": 1.0621995192307693e-05, + "loss": 0.746, + "step": 2722 + }, + { + "epoch": 1.588449759369987, + "grad_norm": 1.3011112213134766, + "learning_rate": 1.0606971153846155e-05, + "loss": 0.9224, + "step": 2723 + }, + { + "epoch": 1.5890331048563513, + "grad_norm": 1.1597706079483032, + "learning_rate": 1.0591947115384617e-05, + "loss": 0.9486, + "step": 2724 + }, + { + "epoch": 1.5896164503427155, + "grad_norm": 1.1415634155273438, + "learning_rate": 1.0576923076923077e-05, + "loss": 0.6527, + "step": 2725 + }, + { + "epoch": 1.5901997958290797, + "grad_norm": 1.1724604368209839, + "learning_rate": 1.056189903846154e-05, + "loss": 0.6936, + "step": 2726 + }, + { + "epoch": 1.590783141315444, + "grad_norm": 1.014355182647705, + "learning_rate": 1.0546875e-05, + "loss": 0.7157, + "step": 2727 + }, + { + "epoch": 1.5913664868018085, + "grad_norm": 1.2243266105651855, + "learning_rate": 1.053185096153846e-05, + "loss": 0.7737, + "step": 2728 + }, + { + "epoch": 1.5919498322881727, + "grad_norm": 1.347441554069519, + "learning_rate": 1.0516826923076924e-05, + "loss": 0.6544, + "step": 2729 + }, + { + "epoch": 1.5925331777745368, + "grad_norm": 1.1011338233947754, + "learning_rate": 1.0501802884615385e-05, + "loss": 0.6885, + "step": 2730 + }, + { + "epoch": 1.5931165232609013, + "grad_norm": 1.2773078680038452, + "learning_rate": 1.0486778846153846e-05, + "loss": 0.826, + "step": 2731 + }, + { + "epoch": 1.5936998687472657, + "grad_norm": 1.3734136819839478, + "learning_rate": 1.0471754807692308e-05, + "loss": 0.8921, + "step": 2732 + }, + { + "epoch": 1.5942832142336298, + "grad_norm": 1.4568568468093872, + "learning_rate": 1.045673076923077e-05, + "loss": 0.7519, + "step": 2733 + }, + { + "epoch": 1.594866559719994, + "grad_norm": 1.0718237161636353, + "learning_rate": 1.044170673076923e-05, + "loss": 0.7066, + "step": 2734 + }, + { + "epoch": 1.5954499052063584, + "grad_norm": 1.2433981895446777, + "learning_rate": 1.0426682692307694e-05, + "loss": 0.5966, + "step": 2735 + }, + { + "epoch": 1.5960332506927228, + "grad_norm": 1.313547134399414, + "learning_rate": 1.0411658653846154e-05, + "loss": 0.714, + "step": 2736 + }, + { + "epoch": 1.596616596179087, + "grad_norm": 1.2181950807571411, + "learning_rate": 1.0396634615384616e-05, + "loss": 0.8592, + "step": 2737 + }, + { + "epoch": 1.5971999416654512, + "grad_norm": 1.5490280389785767, + "learning_rate": 1.0381610576923078e-05, + "loss": 0.8276, + "step": 2738 + }, + { + "epoch": 1.5977832871518156, + "grad_norm": 1.3620082139968872, + "learning_rate": 1.036658653846154e-05, + "loss": 1.0517, + "step": 2739 + }, + { + "epoch": 1.59836663263818, + "grad_norm": 1.155208945274353, + "learning_rate": 1.03515625e-05, + "loss": 0.6895, + "step": 2740 + }, + { + "epoch": 1.5989499781245442, + "grad_norm": 1.2351773977279663, + "learning_rate": 1.0336538461538462e-05, + "loss": 0.5748, + "step": 2741 + }, + { + "epoch": 1.5995333236109086, + "grad_norm": 0.9607925415039062, + "learning_rate": 1.0321514423076924e-05, + "loss": 0.87, + "step": 2742 + }, + { + "epoch": 1.600116669097273, + "grad_norm": 1.342832326889038, + "learning_rate": 1.0306490384615384e-05, + "loss": 0.6786, + "step": 2743 + }, + { + "epoch": 1.6007000145836372, + "grad_norm": 1.3285024166107178, + "learning_rate": 1.0291466346153848e-05, + "loss": 0.6508, + "step": 2744 + }, + { + "epoch": 1.6012833600700014, + "grad_norm": 1.2747256755828857, + "learning_rate": 1.0276442307692308e-05, + "loss": 0.7721, + "step": 2745 + }, + { + "epoch": 1.6018667055563658, + "grad_norm": 1.2518932819366455, + "learning_rate": 1.026141826923077e-05, + "loss": 0.7242, + "step": 2746 + }, + { + "epoch": 1.6024500510427302, + "grad_norm": 1.4885451793670654, + "learning_rate": 1.0246394230769232e-05, + "loss": 0.831, + "step": 2747 + }, + { + "epoch": 1.6030333965290944, + "grad_norm": 1.324741005897522, + "learning_rate": 1.0231370192307693e-05, + "loss": 0.7975, + "step": 2748 + }, + { + "epoch": 1.6036167420154586, + "grad_norm": 1.2721138000488281, + "learning_rate": 1.0216346153846154e-05, + "loss": 0.8043, + "step": 2749 + }, + { + "epoch": 1.604200087501823, + "grad_norm": 1.3388422727584839, + "learning_rate": 1.0201322115384617e-05, + "loss": 0.8146, + "step": 2750 + }, + { + "epoch": 1.6047834329881874, + "grad_norm": 1.4431674480438232, + "learning_rate": 1.0186298076923077e-05, + "loss": 0.8857, + "step": 2751 + }, + { + "epoch": 1.6053667784745516, + "grad_norm": 1.3353809118270874, + "learning_rate": 1.017127403846154e-05, + "loss": 0.6763, + "step": 2752 + }, + { + "epoch": 1.6059501239609157, + "grad_norm": 1.4101769924163818, + "learning_rate": 1.0156250000000001e-05, + "loss": 0.7768, + "step": 2753 + }, + { + "epoch": 1.6065334694472801, + "grad_norm": 1.2769949436187744, + "learning_rate": 1.0141225961538461e-05, + "loss": 0.803, + "step": 2754 + }, + { + "epoch": 1.6071168149336446, + "grad_norm": 1.3187309503555298, + "learning_rate": 1.0126201923076923e-05, + "loss": 0.7019, + "step": 2755 + }, + { + "epoch": 1.6077001604200087, + "grad_norm": 1.083141565322876, + "learning_rate": 1.0111177884615385e-05, + "loss": 0.8864, + "step": 2756 + }, + { + "epoch": 1.608283505906373, + "grad_norm": 1.8760899305343628, + "learning_rate": 1.0096153846153847e-05, + "loss": 0.8891, + "step": 2757 + }, + { + "epoch": 1.6088668513927373, + "grad_norm": 1.2959517240524292, + "learning_rate": 1.0081129807692307e-05, + "loss": 0.8422, + "step": 2758 + }, + { + "epoch": 1.6094501968791017, + "grad_norm": 1.4058853387832642, + "learning_rate": 1.0066105769230771e-05, + "loss": 0.6244, + "step": 2759 + }, + { + "epoch": 1.610033542365466, + "grad_norm": 1.154642939567566, + "learning_rate": 1.0051081730769231e-05, + "loss": 0.859, + "step": 2760 + }, + { + "epoch": 1.61061688785183, + "grad_norm": 1.209807276725769, + "learning_rate": 1.0036057692307693e-05, + "loss": 0.6576, + "step": 2761 + }, + { + "epoch": 1.6112002333381945, + "grad_norm": 1.3753244876861572, + "learning_rate": 1.0021033653846155e-05, + "loss": 0.6883, + "step": 2762 + }, + { + "epoch": 1.611783578824559, + "grad_norm": 1.2860386371612549, + "learning_rate": 1.0006009615384617e-05, + "loss": 0.8279, + "step": 2763 + }, + { + "epoch": 1.612366924310923, + "grad_norm": 1.2552486658096313, + "learning_rate": 9.990985576923077e-06, + "loss": 0.8707, + "step": 2764 + }, + { + "epoch": 1.6129502697972873, + "grad_norm": 1.2394871711730957, + "learning_rate": 9.975961538461539e-06, + "loss": 0.9137, + "step": 2765 + }, + { + "epoch": 1.6135336152836517, + "grad_norm": 1.0615895986557007, + "learning_rate": 9.9609375e-06, + "loss": 0.9192, + "step": 2766 + }, + { + "epoch": 1.614116960770016, + "grad_norm": 1.2719849348068237, + "learning_rate": 9.945913461538461e-06, + "loss": 0.8623, + "step": 2767 + }, + { + "epoch": 1.6147003062563803, + "grad_norm": 1.2009553909301758, + "learning_rate": 9.930889423076924e-06, + "loss": 0.7557, + "step": 2768 + }, + { + "epoch": 1.6152836517427447, + "grad_norm": 1.2548385858535767, + "learning_rate": 9.915865384615385e-06, + "loss": 0.8356, + "step": 2769 + }, + { + "epoch": 1.615866997229109, + "grad_norm": 1.4359416961669922, + "learning_rate": 9.900841346153847e-06, + "loss": 0.954, + "step": 2770 + }, + { + "epoch": 1.6164503427154733, + "grad_norm": 1.3595879077911377, + "learning_rate": 9.885817307692308e-06, + "loss": 0.771, + "step": 2771 + }, + { + "epoch": 1.6170336882018375, + "grad_norm": 1.4051216840744019, + "learning_rate": 9.87079326923077e-06, + "loss": 0.7762, + "step": 2772 + }, + { + "epoch": 1.6176170336882019, + "grad_norm": 1.0524299144744873, + "learning_rate": 9.85576923076923e-06, + "loss": 0.7751, + "step": 2773 + }, + { + "epoch": 1.6182003791745663, + "grad_norm": 1.2120068073272705, + "learning_rate": 9.840745192307692e-06, + "loss": 0.7958, + "step": 2774 + }, + { + "epoch": 1.6187837246609305, + "grad_norm": 1.271024227142334, + "learning_rate": 9.825721153846154e-06, + "loss": 0.8882, + "step": 2775 + }, + { + "epoch": 1.6193670701472946, + "grad_norm": 1.1276352405548096, + "learning_rate": 9.810697115384616e-06, + "loss": 0.8336, + "step": 2776 + }, + { + "epoch": 1.619950415633659, + "grad_norm": 1.3857134580612183, + "learning_rate": 9.795673076923078e-06, + "loss": 0.766, + "step": 2777 + }, + { + "epoch": 1.6205337611200235, + "grad_norm": 1.1864246129989624, + "learning_rate": 9.78064903846154e-06, + "loss": 0.6705, + "step": 2778 + }, + { + "epoch": 1.6211171066063876, + "grad_norm": 1.193347692489624, + "learning_rate": 9.765625e-06, + "loss": 0.865, + "step": 2779 + }, + { + "epoch": 1.6217004520927518, + "grad_norm": 1.2671443223953247, + "learning_rate": 9.750600961538462e-06, + "loss": 0.7479, + "step": 2780 + }, + { + "epoch": 1.6222837975791162, + "grad_norm": 1.2168381214141846, + "learning_rate": 9.735576923076924e-06, + "loss": 0.8945, + "step": 2781 + }, + { + "epoch": 1.6228671430654806, + "grad_norm": 1.0259840488433838, + "learning_rate": 9.720552884615384e-06, + "loss": 1.031, + "step": 2782 + }, + { + "epoch": 1.6234504885518448, + "grad_norm": 1.3780772686004639, + "learning_rate": 9.705528846153846e-06, + "loss": 0.9144, + "step": 2783 + }, + { + "epoch": 1.624033834038209, + "grad_norm": 1.1934678554534912, + "learning_rate": 9.690504807692308e-06, + "loss": 0.8131, + "step": 2784 + }, + { + "epoch": 1.6246171795245734, + "grad_norm": 1.2687208652496338, + "learning_rate": 9.67548076923077e-06, + "loss": 0.7369, + "step": 2785 + }, + { + "epoch": 1.6252005250109378, + "grad_norm": 1.2904934883117676, + "learning_rate": 9.660456730769232e-06, + "loss": 0.6605, + "step": 2786 + }, + { + "epoch": 1.625783870497302, + "grad_norm": 1.6874489784240723, + "learning_rate": 9.645432692307694e-06, + "loss": 0.8216, + "step": 2787 + }, + { + "epoch": 1.6263672159836662, + "grad_norm": 1.156012773513794, + "learning_rate": 9.630408653846154e-06, + "loss": 0.7344, + "step": 2788 + }, + { + "epoch": 1.6269505614700306, + "grad_norm": 1.2649706602096558, + "learning_rate": 9.615384615384616e-06, + "loss": 0.7568, + "step": 2789 + }, + { + "epoch": 1.627533906956395, + "grad_norm": 1.2135905027389526, + "learning_rate": 9.600360576923078e-06, + "loss": 0.7164, + "step": 2790 + }, + { + "epoch": 1.6281172524427592, + "grad_norm": 1.3913686275482178, + "learning_rate": 9.58533653846154e-06, + "loss": 0.7437, + "step": 2791 + }, + { + "epoch": 1.6287005979291234, + "grad_norm": 1.119268536567688, + "learning_rate": 9.5703125e-06, + "loss": 0.8903, + "step": 2792 + }, + { + "epoch": 1.6292839434154878, + "grad_norm": 1.0997390747070312, + "learning_rate": 9.555288461538462e-06, + "loss": 0.8839, + "step": 2793 + }, + { + "epoch": 1.6298672889018522, + "grad_norm": 1.7211315631866455, + "learning_rate": 9.540264423076923e-06, + "loss": 0.7962, + "step": 2794 + }, + { + "epoch": 1.6304506343882164, + "grad_norm": 1.1959589719772339, + "learning_rate": 9.525240384615385e-06, + "loss": 0.7228, + "step": 2795 + }, + { + "epoch": 1.6310339798745808, + "grad_norm": 1.3861689567565918, + "learning_rate": 9.510216346153847e-06, + "loss": 0.8062, + "step": 2796 + }, + { + "epoch": 1.6316173253609452, + "grad_norm": 1.251761794090271, + "learning_rate": 9.495192307692307e-06, + "loss": 0.8465, + "step": 2797 + }, + { + "epoch": 1.6322006708473094, + "grad_norm": 1.3125969171524048, + "learning_rate": 9.48016826923077e-06, + "loss": 0.9648, + "step": 2798 + }, + { + "epoch": 1.6327840163336735, + "grad_norm": 1.0540010929107666, + "learning_rate": 9.465144230769231e-06, + "loss": 0.8933, + "step": 2799 + }, + { + "epoch": 1.633367361820038, + "grad_norm": 1.2731456756591797, + "learning_rate": 9.450120192307693e-06, + "loss": 0.6307, + "step": 2800 + }, + { + "epoch": 1.633367361820038, + "eval_loss_squad": 0.8574257939518429, + "eval_perplexity": 8.239154888947215, + "eval_perplexity_reconstruct": 1.9011622713541043, + "step": 2800 + }, + { + "epoch": 1.6339507073064024, + "grad_norm": 1.4683425426483154, + "learning_rate": 9.435096153846153e-06, + "loss": 0.8113, + "step": 2801 + }, + { + "epoch": 1.6345340527927665, + "grad_norm": 1.1839910745620728, + "learning_rate": 9.420072115384617e-06, + "loss": 0.6765, + "step": 2802 + }, + { + "epoch": 1.6351173982791307, + "grad_norm": 1.1331663131713867, + "learning_rate": 9.405048076923077e-06, + "loss": 0.6725, + "step": 2803 + }, + { + "epoch": 1.6357007437654951, + "grad_norm": 1.2307205200195312, + "learning_rate": 9.390024038461539e-06, + "loss": 0.728, + "step": 2804 + }, + { + "epoch": 1.6362840892518595, + "grad_norm": 1.178563117980957, + "learning_rate": 9.375000000000001e-06, + "loss": 0.8463, + "step": 2805 + }, + { + "epoch": 1.6368674347382237, + "grad_norm": 1.1501257419586182, + "learning_rate": 9.359975961538461e-06, + "loss": 0.9248, + "step": 2806 + }, + { + "epoch": 1.637450780224588, + "grad_norm": 1.2466018199920654, + "learning_rate": 9.344951923076923e-06, + "loss": 0.8406, + "step": 2807 + }, + { + "epoch": 1.6380341257109523, + "grad_norm": 1.2315740585327148, + "learning_rate": 9.329927884615385e-06, + "loss": 0.8917, + "step": 2808 + }, + { + "epoch": 1.6386174711973167, + "grad_norm": 1.3349418640136719, + "learning_rate": 9.314903846153847e-06, + "loss": 0.9475, + "step": 2809 + }, + { + "epoch": 1.639200816683681, + "grad_norm": 1.3950275182724, + "learning_rate": 9.299879807692307e-06, + "loss": 0.6757, + "step": 2810 + }, + { + "epoch": 1.639784162170045, + "grad_norm": 1.2387348413467407, + "learning_rate": 9.28485576923077e-06, + "loss": 0.7381, + "step": 2811 + }, + { + "epoch": 1.6403675076564095, + "grad_norm": 1.3129287958145142, + "learning_rate": 9.26983173076923e-06, + "loss": 0.6148, + "step": 2812 + }, + { + "epoch": 1.640950853142774, + "grad_norm": 1.1648530960083008, + "learning_rate": 9.254807692307693e-06, + "loss": 0.7276, + "step": 2813 + }, + { + "epoch": 1.641534198629138, + "grad_norm": 1.53678560256958, + "learning_rate": 9.239783653846154e-06, + "loss": 0.9696, + "step": 2814 + }, + { + "epoch": 1.6421175441155023, + "grad_norm": 1.1140925884246826, + "learning_rate": 9.224759615384616e-06, + "loss": 0.6841, + "step": 2815 + }, + { + "epoch": 1.6427008896018667, + "grad_norm": 1.36318838596344, + "learning_rate": 9.209735576923077e-06, + "loss": 0.8003, + "step": 2816 + }, + { + "epoch": 1.643284235088231, + "grad_norm": 1.1023435592651367, + "learning_rate": 9.19471153846154e-06, + "loss": 0.7749, + "step": 2817 + }, + { + "epoch": 1.6438675805745953, + "grad_norm": 1.7993230819702148, + "learning_rate": 9.1796875e-06, + "loss": 0.6514, + "step": 2818 + }, + { + "epoch": 1.6444509260609594, + "grad_norm": 1.1305932998657227, + "learning_rate": 9.16466346153846e-06, + "loss": 0.802, + "step": 2819 + }, + { + "epoch": 1.645034271547324, + "grad_norm": 1.2455143928527832, + "learning_rate": 9.149639423076924e-06, + "loss": 0.6783, + "step": 2820 + }, + { + "epoch": 1.6456176170336883, + "grad_norm": 1.2018240690231323, + "learning_rate": 9.134615384615384e-06, + "loss": 0.6678, + "step": 2821 + }, + { + "epoch": 1.6462009625200524, + "grad_norm": 1.5132325887680054, + "learning_rate": 9.119591346153846e-06, + "loss": 0.8185, + "step": 2822 + }, + { + "epoch": 1.6467843080064168, + "grad_norm": 1.1789501905441284, + "learning_rate": 9.104567307692308e-06, + "loss": 0.7072, + "step": 2823 + }, + { + "epoch": 1.6473676534927812, + "grad_norm": 1.2296137809753418, + "learning_rate": 9.08954326923077e-06, + "loss": 0.8957, + "step": 2824 + }, + { + "epoch": 1.6479509989791454, + "grad_norm": 1.2670032978057861, + "learning_rate": 9.07451923076923e-06, + "loss": 1.0086, + "step": 2825 + }, + { + "epoch": 1.6485343444655096, + "grad_norm": 1.2225745916366577, + "learning_rate": 9.059495192307694e-06, + "loss": 0.7689, + "step": 2826 + }, + { + "epoch": 1.649117689951874, + "grad_norm": 1.3422561883926392, + "learning_rate": 9.044471153846154e-06, + "loss": 0.7682, + "step": 2827 + }, + { + "epoch": 1.6497010354382384, + "grad_norm": 1.7664170265197754, + "learning_rate": 9.029447115384616e-06, + "loss": 0.9315, + "step": 2828 + }, + { + "epoch": 1.6502843809246026, + "grad_norm": 1.141822338104248, + "learning_rate": 9.014423076923078e-06, + "loss": 0.7782, + "step": 2829 + }, + { + "epoch": 1.6508677264109668, + "grad_norm": 1.266010046005249, + "learning_rate": 8.99939903846154e-06, + "loss": 0.7129, + "step": 2830 + }, + { + "epoch": 1.6514510718973312, + "grad_norm": 1.1951196193695068, + "learning_rate": 8.984375e-06, + "loss": 0.8082, + "step": 2831 + }, + { + "epoch": 1.6520344173836956, + "grad_norm": 1.1922125816345215, + "learning_rate": 8.969350961538462e-06, + "loss": 0.8675, + "step": 2832 + }, + { + "epoch": 1.6526177628700598, + "grad_norm": 1.4151067733764648, + "learning_rate": 8.954326923076924e-06, + "loss": 0.7596, + "step": 2833 + }, + { + "epoch": 1.653201108356424, + "grad_norm": 1.496769666671753, + "learning_rate": 8.939302884615384e-06, + "loss": 0.7633, + "step": 2834 + }, + { + "epoch": 1.6537844538427884, + "grad_norm": 1.1096853017807007, + "learning_rate": 8.924278846153847e-06, + "loss": 0.782, + "step": 2835 + }, + { + "epoch": 1.6543677993291528, + "grad_norm": 1.4348713159561157, + "learning_rate": 8.909254807692308e-06, + "loss": 0.7652, + "step": 2836 + }, + { + "epoch": 1.654951144815517, + "grad_norm": 1.3088988065719604, + "learning_rate": 8.89423076923077e-06, + "loss": 0.7302, + "step": 2837 + }, + { + "epoch": 1.6555344903018812, + "grad_norm": 1.1694996356964111, + "learning_rate": 8.879206730769231e-06, + "loss": 0.9367, + "step": 2838 + }, + { + "epoch": 1.6561178357882456, + "grad_norm": 1.274279236793518, + "learning_rate": 8.864182692307693e-06, + "loss": 0.7277, + "step": 2839 + }, + { + "epoch": 1.65670118127461, + "grad_norm": 1.4117878675460815, + "learning_rate": 8.849158653846153e-06, + "loss": 0.6493, + "step": 2840 + }, + { + "epoch": 1.6572845267609742, + "grad_norm": 1.1404685974121094, + "learning_rate": 8.834134615384617e-06, + "loss": 0.8478, + "step": 2841 + }, + { + "epoch": 1.6578678722473383, + "grad_norm": 1.1155657768249512, + "learning_rate": 8.819110576923077e-06, + "loss": 0.8945, + "step": 2842 + }, + { + "epoch": 1.6584512177337027, + "grad_norm": 1.2752269506454468, + "learning_rate": 8.804086538461539e-06, + "loss": 0.7728, + "step": 2843 + }, + { + "epoch": 1.6590345632200671, + "grad_norm": 1.6460522413253784, + "learning_rate": 8.789062500000001e-06, + "loss": 0.7511, + "step": 2844 + }, + { + "epoch": 1.6596179087064313, + "grad_norm": 1.4639447927474976, + "learning_rate": 8.774038461538461e-06, + "loss": 0.6837, + "step": 2845 + }, + { + "epoch": 1.6602012541927955, + "grad_norm": 1.4805471897125244, + "learning_rate": 8.759014423076923e-06, + "loss": 0.832, + "step": 2846 + }, + { + "epoch": 1.6607845996791601, + "grad_norm": 1.1659775972366333, + "learning_rate": 8.743990384615385e-06, + "loss": 0.7268, + "step": 2847 + }, + { + "epoch": 1.6613679451655243, + "grad_norm": 1.190755009651184, + "learning_rate": 8.728966346153847e-06, + "loss": 0.6614, + "step": 2848 + }, + { + "epoch": 1.6619512906518885, + "grad_norm": 1.1789089441299438, + "learning_rate": 8.713942307692307e-06, + "loss": 0.8071, + "step": 2849 + }, + { + "epoch": 1.662534636138253, + "grad_norm": 1.2296158075332642, + "learning_rate": 8.69891826923077e-06, + "loss": 0.6635, + "step": 2850 + }, + { + "epoch": 1.6631179816246173, + "grad_norm": 1.1168346405029297, + "learning_rate": 8.683894230769231e-06, + "loss": 0.714, + "step": 2851 + }, + { + "epoch": 1.6637013271109815, + "grad_norm": 1.1426185369491577, + "learning_rate": 8.668870192307693e-06, + "loss": 0.7065, + "step": 2852 + }, + { + "epoch": 1.6642846725973457, + "grad_norm": 1.2550532817840576, + "learning_rate": 8.653846153846155e-06, + "loss": 0.6332, + "step": 2853 + }, + { + "epoch": 1.66486801808371, + "grad_norm": 1.2261812686920166, + "learning_rate": 8.638822115384617e-06, + "loss": 0.7257, + "step": 2854 + }, + { + "epoch": 1.6654513635700745, + "grad_norm": 1.1258302927017212, + "learning_rate": 8.623798076923077e-06, + "loss": 0.9171, + "step": 2855 + }, + { + "epoch": 1.6660347090564387, + "grad_norm": 1.2768467664718628, + "learning_rate": 8.60877403846154e-06, + "loss": 0.6836, + "step": 2856 + }, + { + "epoch": 1.6666180545428029, + "grad_norm": 0.9513541460037231, + "learning_rate": 8.59375e-06, + "loss": 0.7402, + "step": 2857 + }, + { + "epoch": 1.6672014000291673, + "grad_norm": 1.2110565900802612, + "learning_rate": 8.57872596153846e-06, + "loss": 0.6598, + "step": 2858 + }, + { + "epoch": 1.6677847455155317, + "grad_norm": 1.2651914358139038, + "learning_rate": 8.563701923076924e-06, + "loss": 0.8355, + "step": 2859 + }, + { + "epoch": 1.6683680910018959, + "grad_norm": 1.2185043096542358, + "learning_rate": 8.548677884615384e-06, + "loss": 0.6839, + "step": 2860 + }, + { + "epoch": 1.66895143648826, + "grad_norm": 1.2617533206939697, + "learning_rate": 8.533653846153846e-06, + "loss": 0.8926, + "step": 2861 + }, + { + "epoch": 1.6695347819746245, + "grad_norm": 1.3251715898513794, + "learning_rate": 8.518629807692308e-06, + "loss": 0.956, + "step": 2862 + }, + { + "epoch": 1.6701181274609889, + "grad_norm": 1.2607977390289307, + "learning_rate": 8.50360576923077e-06, + "loss": 0.81, + "step": 2863 + }, + { + "epoch": 1.670701472947353, + "grad_norm": 1.0856292247772217, + "learning_rate": 8.48858173076923e-06, + "loss": 0.8838, + "step": 2864 + }, + { + "epoch": 1.6712848184337172, + "grad_norm": 1.4323976039886475, + "learning_rate": 8.473557692307694e-06, + "loss": 0.908, + "step": 2865 + }, + { + "epoch": 1.6718681639200816, + "grad_norm": 1.1892898082733154, + "learning_rate": 8.458533653846154e-06, + "loss": 0.8645, + "step": 2866 + }, + { + "epoch": 1.672451509406446, + "grad_norm": 1.328147530555725, + "learning_rate": 8.443509615384616e-06, + "loss": 0.7748, + "step": 2867 + }, + { + "epoch": 1.6730348548928102, + "grad_norm": 1.4214242696762085, + "learning_rate": 8.428485576923078e-06, + "loss": 0.8796, + "step": 2868 + }, + { + "epoch": 1.6736182003791744, + "grad_norm": 0.941706120967865, + "learning_rate": 8.41346153846154e-06, + "loss": 0.7776, + "step": 2869 + }, + { + "epoch": 1.6742015458655388, + "grad_norm": 1.255053997039795, + "learning_rate": 8.3984375e-06, + "loss": 0.6698, + "step": 2870 + }, + { + "epoch": 1.6747848913519032, + "grad_norm": 1.1494503021240234, + "learning_rate": 8.383413461538462e-06, + "loss": 0.581, + "step": 2871 + }, + { + "epoch": 1.6753682368382674, + "grad_norm": 1.3444379568099976, + "learning_rate": 8.368389423076924e-06, + "loss": 0.7983, + "step": 2872 + }, + { + "epoch": 1.6759515823246318, + "grad_norm": 1.1211199760437012, + "learning_rate": 8.353365384615384e-06, + "loss": 0.6528, + "step": 2873 + }, + { + "epoch": 1.6765349278109962, + "grad_norm": 1.30573570728302, + "learning_rate": 8.338341346153848e-06, + "loss": 0.7975, + "step": 2874 + }, + { + "epoch": 1.6771182732973604, + "grad_norm": 1.0336012840270996, + "learning_rate": 8.323317307692308e-06, + "loss": 0.5304, + "step": 2875 + }, + { + "epoch": 1.6777016187837246, + "grad_norm": 1.2422670125961304, + "learning_rate": 8.30829326923077e-06, + "loss": 0.6592, + "step": 2876 + }, + { + "epoch": 1.678284964270089, + "grad_norm": 1.3174580335617065, + "learning_rate": 8.293269230769232e-06, + "loss": 0.7867, + "step": 2877 + }, + { + "epoch": 1.6788683097564534, + "grad_norm": 1.1110163927078247, + "learning_rate": 8.278245192307693e-06, + "loss": 0.7349, + "step": 2878 + }, + { + "epoch": 1.6794516552428176, + "grad_norm": 1.2657557725906372, + "learning_rate": 8.263221153846154e-06, + "loss": 0.9434, + "step": 2879 + }, + { + "epoch": 1.6800350007291818, + "grad_norm": 1.080836296081543, + "learning_rate": 8.248197115384616e-06, + "loss": 0.7363, + "step": 2880 + }, + { + "epoch": 1.6806183462155462, + "grad_norm": 1.3697772026062012, + "learning_rate": 8.233173076923077e-06, + "loss": 0.7499, + "step": 2881 + }, + { + "epoch": 1.6812016917019106, + "grad_norm": 1.0886021852493286, + "learning_rate": 8.21814903846154e-06, + "loss": 0.9581, + "step": 2882 + }, + { + "epoch": 1.6817850371882748, + "grad_norm": 1.3300132751464844, + "learning_rate": 8.203125000000001e-06, + "loss": 0.7848, + "step": 2883 + }, + { + "epoch": 1.682368382674639, + "grad_norm": 1.4838621616363525, + "learning_rate": 8.188100961538461e-06, + "loss": 0.9271, + "step": 2884 + }, + { + "epoch": 1.6829517281610034, + "grad_norm": 1.3038266897201538, + "learning_rate": 8.173076923076923e-06, + "loss": 0.7117, + "step": 2885 + }, + { + "epoch": 1.6835350736473678, + "grad_norm": 1.1892757415771484, + "learning_rate": 8.158052884615385e-06, + "loss": 0.5654, + "step": 2886 + }, + { + "epoch": 1.684118419133732, + "grad_norm": 1.1837880611419678, + "learning_rate": 8.143028846153847e-06, + "loss": 0.7745, + "step": 2887 + }, + { + "epoch": 1.6847017646200961, + "grad_norm": 1.2857214212417603, + "learning_rate": 8.128004807692307e-06, + "loss": 0.7224, + "step": 2888 + }, + { + "epoch": 1.6852851101064605, + "grad_norm": 1.1604810953140259, + "learning_rate": 8.112980769230769e-06, + "loss": 0.8615, + "step": 2889 + }, + { + "epoch": 1.685868455592825, + "grad_norm": 1.1492291688919067, + "learning_rate": 8.097956730769231e-06, + "loss": 0.8235, + "step": 2890 + }, + { + "epoch": 1.6864518010791891, + "grad_norm": 1.4180827140808105, + "learning_rate": 8.082932692307693e-06, + "loss": 0.7844, + "step": 2891 + }, + { + "epoch": 1.6870351465655533, + "grad_norm": 1.3198400735855103, + "learning_rate": 8.067908653846155e-06, + "loss": 0.9021, + "step": 2892 + }, + { + "epoch": 1.6876184920519177, + "grad_norm": 1.2082802057266235, + "learning_rate": 8.052884615384617e-06, + "loss": 0.8417, + "step": 2893 + }, + { + "epoch": 1.6882018375382821, + "grad_norm": 1.4099825620651245, + "learning_rate": 8.037860576923077e-06, + "loss": 0.9223, + "step": 2894 + }, + { + "epoch": 1.6887851830246463, + "grad_norm": 1.4837796688079834, + "learning_rate": 8.022836538461539e-06, + "loss": 1.0981, + "step": 2895 + }, + { + "epoch": 1.6893685285110105, + "grad_norm": 1.2653106451034546, + "learning_rate": 8.0078125e-06, + "loss": 0.8805, + "step": 2896 + }, + { + "epoch": 1.689951873997375, + "grad_norm": 1.1850988864898682, + "learning_rate": 7.992788461538461e-06, + "loss": 0.6671, + "step": 2897 + }, + { + "epoch": 1.6905352194837393, + "grad_norm": 1.2096939086914062, + "learning_rate": 7.977764423076923e-06, + "loss": 0.615, + "step": 2898 + }, + { + "epoch": 1.6911185649701035, + "grad_norm": 1.1689413785934448, + "learning_rate": 7.962740384615385e-06, + "loss": 0.7358, + "step": 2899 + }, + { + "epoch": 1.691701910456468, + "grad_norm": 1.2683144807815552, + "learning_rate": 7.947716346153847e-06, + "loss": 0.8217, + "step": 2900 + }, + { + "epoch": 1.6922852559428323, + "grad_norm": 1.4966411590576172, + "learning_rate": 7.932692307692308e-06, + "loss": 0.8058, + "step": 2901 + }, + { + "epoch": 1.6928686014291965, + "grad_norm": 1.3604378700256348, + "learning_rate": 7.91766826923077e-06, + "loss": 0.8571, + "step": 2902 + }, + { + "epoch": 1.6934519469155607, + "grad_norm": 1.2549089193344116, + "learning_rate": 7.90264423076923e-06, + "loss": 0.8095, + "step": 2903 + }, + { + "epoch": 1.694035292401925, + "grad_norm": 1.293096899986267, + "learning_rate": 7.887620192307692e-06, + "loss": 0.888, + "step": 2904 + }, + { + "epoch": 1.6946186378882895, + "grad_norm": 1.2265640497207642, + "learning_rate": 7.872596153846154e-06, + "loss": 0.6778, + "step": 2905 + }, + { + "epoch": 1.6952019833746537, + "grad_norm": 1.3105190992355347, + "learning_rate": 7.857572115384616e-06, + "loss": 0.6253, + "step": 2906 + }, + { + "epoch": 1.6957853288610178, + "grad_norm": 1.0808024406433105, + "learning_rate": 7.842548076923076e-06, + "loss": 0.8923, + "step": 2907 + }, + { + "epoch": 1.6963686743473823, + "grad_norm": 1.20291268825531, + "learning_rate": 7.82752403846154e-06, + "loss": 0.5709, + "step": 2908 + }, + { + "epoch": 1.6969520198337467, + "grad_norm": 1.1902101039886475, + "learning_rate": 7.8125e-06, + "loss": 0.7631, + "step": 2909 + }, + { + "epoch": 1.6975353653201108, + "grad_norm": 1.1930304765701294, + "learning_rate": 7.797475961538462e-06, + "loss": 0.8958, + "step": 2910 + }, + { + "epoch": 1.698118710806475, + "grad_norm": 1.1427156925201416, + "learning_rate": 7.782451923076924e-06, + "loss": 0.8889, + "step": 2911 + }, + { + "epoch": 1.6987020562928394, + "grad_norm": 1.172147512435913, + "learning_rate": 7.767427884615384e-06, + "loss": 0.733, + "step": 2912 + }, + { + "epoch": 1.6992854017792038, + "grad_norm": 1.1946724653244019, + "learning_rate": 7.752403846153846e-06, + "loss": 1.0244, + "step": 2913 + }, + { + "epoch": 1.699868747265568, + "grad_norm": 1.224003791809082, + "learning_rate": 7.737379807692308e-06, + "loss": 0.8716, + "step": 2914 + }, + { + "epoch": 1.7004520927519322, + "grad_norm": 1.2379382848739624, + "learning_rate": 7.72235576923077e-06, + "loss": 0.9171, + "step": 2915 + }, + { + "epoch": 1.7010354382382966, + "grad_norm": 1.3675721883773804, + "learning_rate": 7.70733173076923e-06, + "loss": 0.7572, + "step": 2916 + }, + { + "epoch": 1.701618783724661, + "grad_norm": 1.1662015914916992, + "learning_rate": 7.692307692307694e-06, + "loss": 0.726, + "step": 2917 + }, + { + "epoch": 1.7022021292110252, + "grad_norm": 0.9387063980102539, + "learning_rate": 7.677283653846154e-06, + "loss": 0.8387, + "step": 2918 + }, + { + "epoch": 1.7027854746973894, + "grad_norm": 1.1246999502182007, + "learning_rate": 7.662259615384616e-06, + "loss": 0.7998, + "step": 2919 + }, + { + "epoch": 1.7033688201837538, + "grad_norm": 1.011136770248413, + "learning_rate": 7.647235576923078e-06, + "loss": 0.6432, + "step": 2920 + }, + { + "epoch": 1.7039521656701182, + "grad_norm": 1.1951652765274048, + "learning_rate": 7.63221153846154e-06, + "loss": 0.9636, + "step": 2921 + }, + { + "epoch": 1.7045355111564824, + "grad_norm": 1.4355970621109009, + "learning_rate": 7.6171875000000005e-06, + "loss": 0.6864, + "step": 2922 + }, + { + "epoch": 1.7051188566428466, + "grad_norm": 1.2137242555618286, + "learning_rate": 7.6021634615384615e-06, + "loss": 0.9197, + "step": 2923 + }, + { + "epoch": 1.705702202129211, + "grad_norm": 1.3542622327804565, + "learning_rate": 7.5871394230769234e-06, + "loss": 0.5522, + "step": 2924 + }, + { + "epoch": 1.7062855476155754, + "grad_norm": 1.1288126707077026, + "learning_rate": 7.5721153846153845e-06, + "loss": 0.7477, + "step": 2925 + }, + { + "epoch": 1.7068688931019396, + "grad_norm": 1.3344682455062866, + "learning_rate": 7.557091346153846e-06, + "loss": 0.873, + "step": 2926 + }, + { + "epoch": 1.707452238588304, + "grad_norm": 1.3780889511108398, + "learning_rate": 7.542067307692307e-06, + "loss": 0.7851, + "step": 2927 + }, + { + "epoch": 1.7080355840746684, + "grad_norm": 1.1783818006515503, + "learning_rate": 7.52704326923077e-06, + "loss": 0.8652, + "step": 2928 + }, + { + "epoch": 1.7086189295610326, + "grad_norm": 1.1674573421478271, + "learning_rate": 7.512019230769231e-06, + "loss": 0.8358, + "step": 2929 + }, + { + "epoch": 1.7092022750473967, + "grad_norm": 1.1287046670913696, + "learning_rate": 7.496995192307693e-06, + "loss": 0.7869, + "step": 2930 + }, + { + "epoch": 1.7097856205337612, + "grad_norm": 1.240212082862854, + "learning_rate": 7.481971153846154e-06, + "loss": 0.9021, + "step": 2931 + }, + { + "epoch": 1.7103689660201256, + "grad_norm": 1.4593373537063599, + "learning_rate": 7.466947115384616e-06, + "loss": 0.8768, + "step": 2932 + }, + { + "epoch": 1.7109523115064897, + "grad_norm": 1.5059845447540283, + "learning_rate": 7.451923076923077e-06, + "loss": 0.7657, + "step": 2933 + }, + { + "epoch": 1.711535656992854, + "grad_norm": 1.0853300094604492, + "learning_rate": 7.43689903846154e-06, + "loss": 0.935, + "step": 2934 + }, + { + "epoch": 1.7121190024792183, + "grad_norm": 1.382460117340088, + "learning_rate": 7.421875e-06, + "loss": 0.644, + "step": 2935 + }, + { + "epoch": 1.7127023479655827, + "grad_norm": 1.4362154006958008, + "learning_rate": 7.406850961538461e-06, + "loss": 0.5482, + "step": 2936 + }, + { + "epoch": 1.713285693451947, + "grad_norm": 1.2484700679779053, + "learning_rate": 7.391826923076924e-06, + "loss": 0.799, + "step": 2937 + }, + { + "epoch": 1.713869038938311, + "grad_norm": 1.0450493097305298, + "learning_rate": 7.376802884615385e-06, + "loss": 0.7939, + "step": 2938 + }, + { + "epoch": 1.7144523844246755, + "grad_norm": 1.2090156078338623, + "learning_rate": 7.361778846153847e-06, + "loss": 0.7845, + "step": 2939 + }, + { + "epoch": 1.71503572991104, + "grad_norm": 1.0005606412887573, + "learning_rate": 7.346754807692308e-06, + "loss": 0.8881, + "step": 2940 + }, + { + "epoch": 1.715619075397404, + "grad_norm": 1.2529767751693726, + "learning_rate": 7.33173076923077e-06, + "loss": 0.9444, + "step": 2941 + }, + { + "epoch": 1.7162024208837683, + "grad_norm": 1.3556801080703735, + "learning_rate": 7.316706730769231e-06, + "loss": 0.8145, + "step": 2942 + }, + { + "epoch": 1.7167857663701327, + "grad_norm": 1.385504126548767, + "learning_rate": 7.301682692307693e-06, + "loss": 0.8717, + "step": 2943 + }, + { + "epoch": 1.717369111856497, + "grad_norm": 1.2711490392684937, + "learning_rate": 7.286658653846154e-06, + "loss": 0.6642, + "step": 2944 + }, + { + "epoch": 1.7179524573428613, + "grad_norm": 1.199277639389038, + "learning_rate": 7.271634615384616e-06, + "loss": 0.908, + "step": 2945 + }, + { + "epoch": 1.7185358028292255, + "grad_norm": 1.2609366178512573, + "learning_rate": 7.256610576923077e-06, + "loss": 0.9037, + "step": 2946 + }, + { + "epoch": 1.7191191483155899, + "grad_norm": 1.1617000102996826, + "learning_rate": 7.241586538461539e-06, + "loss": 0.8382, + "step": 2947 + }, + { + "epoch": 1.7197024938019543, + "grad_norm": 1.1657629013061523, + "learning_rate": 7.2265625e-06, + "loss": 1.1246, + "step": 2948 + }, + { + "epoch": 1.7202858392883185, + "grad_norm": 1.197189211845398, + "learning_rate": 7.211538461538461e-06, + "loss": 0.8029, + "step": 2949 + }, + { + "epoch": 1.7208691847746826, + "grad_norm": 1.611620306968689, + "learning_rate": 7.196514423076923e-06, + "loss": 0.9494, + "step": 2950 + }, + { + "epoch": 1.721452530261047, + "grad_norm": 1.2203035354614258, + "learning_rate": 7.181490384615384e-06, + "loss": 1.0667, + "step": 2951 + }, + { + "epoch": 1.7220358757474115, + "grad_norm": 1.2028281688690186, + "learning_rate": 7.166466346153847e-06, + "loss": 0.6483, + "step": 2952 + }, + { + "epoch": 1.7226192212337756, + "grad_norm": 1.251402735710144, + "learning_rate": 7.151442307692307e-06, + "loss": 0.8199, + "step": 2953 + }, + { + "epoch": 1.72320256672014, + "grad_norm": 1.2342396974563599, + "learning_rate": 7.13641826923077e-06, + "loss": 0.7685, + "step": 2954 + }, + { + "epoch": 1.7237859122065045, + "grad_norm": 1.1816489696502686, + "learning_rate": 7.121394230769231e-06, + "loss": 0.7799, + "step": 2955 + }, + { + "epoch": 1.7243692576928686, + "grad_norm": 1.373840093612671, + "learning_rate": 7.106370192307693e-06, + "loss": 0.7768, + "step": 2956 + }, + { + "epoch": 1.7249526031792328, + "grad_norm": 1.6701674461364746, + "learning_rate": 7.091346153846154e-06, + "loss": 0.953, + "step": 2957 + }, + { + "epoch": 1.7255359486655972, + "grad_norm": 1.3481673002243042, + "learning_rate": 7.076322115384617e-06, + "loss": 0.8016, + "step": 2958 + }, + { + "epoch": 1.7261192941519616, + "grad_norm": 1.2999407052993774, + "learning_rate": 7.061298076923077e-06, + "loss": 0.5779, + "step": 2959 + }, + { + "epoch": 1.7267026396383258, + "grad_norm": 1.1688950061798096, + "learning_rate": 7.04627403846154e-06, + "loss": 0.937, + "step": 2960 + }, + { + "epoch": 1.72728598512469, + "grad_norm": 1.5214632749557495, + "learning_rate": 7.031250000000001e-06, + "loss": 0.8384, + "step": 2961 + }, + { + "epoch": 1.7278693306110544, + "grad_norm": 1.1693167686462402, + "learning_rate": 7.016225961538461e-06, + "loss": 0.8144, + "step": 2962 + }, + { + "epoch": 1.7284526760974188, + "grad_norm": 1.2496626377105713, + "learning_rate": 7.001201923076924e-06, + "loss": 0.7753, + "step": 2963 + }, + { + "epoch": 1.729036021583783, + "grad_norm": 1.1495461463928223, + "learning_rate": 6.986177884615385e-06, + "loss": 0.7307, + "step": 2964 + }, + { + "epoch": 1.7296193670701472, + "grad_norm": 1.15946626663208, + "learning_rate": 6.9711538461538465e-06, + "loss": 0.8847, + "step": 2965 + }, + { + "epoch": 1.7302027125565116, + "grad_norm": 1.1948374509811401, + "learning_rate": 6.9561298076923076e-06, + "loss": 0.7088, + "step": 2966 + }, + { + "epoch": 1.730786058042876, + "grad_norm": 1.298287272453308, + "learning_rate": 6.94110576923077e-06, + "loss": 0.6594, + "step": 2967 + }, + { + "epoch": 1.7313694035292402, + "grad_norm": 1.1406173706054688, + "learning_rate": 6.9260817307692305e-06, + "loss": 0.9859, + "step": 2968 + }, + { + "epoch": 1.7319527490156044, + "grad_norm": 1.2161551713943481, + "learning_rate": 6.911057692307693e-06, + "loss": 0.7891, + "step": 2969 + }, + { + "epoch": 1.7325360945019688, + "grad_norm": 1.0298594236373901, + "learning_rate": 6.896033653846154e-06, + "loss": 0.7256, + "step": 2970 + }, + { + "epoch": 1.7331194399883332, + "grad_norm": 1.2798943519592285, + "learning_rate": 6.881009615384616e-06, + "loss": 0.8829, + "step": 2971 + }, + { + "epoch": 1.7337027854746974, + "grad_norm": 1.514907717704773, + "learning_rate": 6.865985576923077e-06, + "loss": 0.7678, + "step": 2972 + }, + { + "epoch": 1.7342861309610615, + "grad_norm": 1.2793967723846436, + "learning_rate": 6.850961538461539e-06, + "loss": 0.8854, + "step": 2973 + }, + { + "epoch": 1.734869476447426, + "grad_norm": 1.2280986309051514, + "learning_rate": 6.8359375e-06, + "loss": 0.9037, + "step": 2974 + }, + { + "epoch": 1.7354528219337904, + "grad_norm": 1.116003155708313, + "learning_rate": 6.820913461538461e-06, + "loss": 0.7507, + "step": 2975 + }, + { + "epoch": 1.7360361674201545, + "grad_norm": 1.0788702964782715, + "learning_rate": 6.805889423076924e-06, + "loss": 0.9569, + "step": 2976 + }, + { + "epoch": 1.7366195129065187, + "grad_norm": 1.329489827156067, + "learning_rate": 6.790865384615384e-06, + "loss": 0.7846, + "step": 2977 + }, + { + "epoch": 1.7372028583928831, + "grad_norm": 1.1528865098953247, + "learning_rate": 6.775841346153847e-06, + "loss": 0.7985, + "step": 2978 + }, + { + "epoch": 1.7377862038792475, + "grad_norm": 0.9944823980331421, + "learning_rate": 6.760817307692308e-06, + "loss": 0.7359, + "step": 2979 + }, + { + "epoch": 1.7383695493656117, + "grad_norm": 1.269169569015503, + "learning_rate": 6.74579326923077e-06, + "loss": 0.7273, + "step": 2980 + }, + { + "epoch": 1.7389528948519761, + "grad_norm": 1.0448265075683594, + "learning_rate": 6.730769230769231e-06, + "loss": 0.8073, + "step": 2981 + }, + { + "epoch": 1.7395362403383405, + "grad_norm": 1.3037874698638916, + "learning_rate": 6.715745192307693e-06, + "loss": 0.6997, + "step": 2982 + }, + { + "epoch": 1.7401195858247047, + "grad_norm": 1.2340432405471802, + "learning_rate": 6.700721153846154e-06, + "loss": 0.8573, + "step": 2983 + }, + { + "epoch": 1.740702931311069, + "grad_norm": 1.128672480583191, + "learning_rate": 6.6856971153846165e-06, + "loss": 0.8413, + "step": 2984 + }, + { + "epoch": 1.7412862767974333, + "grad_norm": 1.1064525842666626, + "learning_rate": 6.6706730769230775e-06, + "loss": 0.9186, + "step": 2985 + }, + { + "epoch": 1.7418696222837977, + "grad_norm": 1.9784815311431885, + "learning_rate": 6.6556490384615394e-06, + "loss": 0.8194, + "step": 2986 + }, + { + "epoch": 1.742452967770162, + "grad_norm": 1.0543274879455566, + "learning_rate": 6.6406250000000005e-06, + "loss": 0.6185, + "step": 2987 + }, + { + "epoch": 1.743036313256526, + "grad_norm": 1.2074421644210815, + "learning_rate": 6.6256009615384615e-06, + "loss": 0.5638, + "step": 2988 + }, + { + "epoch": 1.7436196587428905, + "grad_norm": 0.9913759231567383, + "learning_rate": 6.610576923076923e-06, + "loss": 0.7278, + "step": 2989 + }, + { + "epoch": 1.744203004229255, + "grad_norm": 1.276524543762207, + "learning_rate": 6.5955528846153845e-06, + "loss": 0.7934, + "step": 2990 + }, + { + "epoch": 1.744786349715619, + "grad_norm": 1.247419834136963, + "learning_rate": 6.580528846153846e-06, + "loss": 0.8914, + "step": 2991 + }, + { + "epoch": 1.7453696952019833, + "grad_norm": 1.3992359638214111, + "learning_rate": 6.565504807692307e-06, + "loss": 0.7806, + "step": 2992 + }, + { + "epoch": 1.7459530406883477, + "grad_norm": 1.1906291246414185, + "learning_rate": 6.55048076923077e-06, + "loss": 0.8607, + "step": 2993 + }, + { + "epoch": 1.746536386174712, + "grad_norm": 1.327694296836853, + "learning_rate": 6.535456730769231e-06, + "loss": 0.606, + "step": 2994 + }, + { + "epoch": 1.7471197316610763, + "grad_norm": 1.3064364194869995, + "learning_rate": 6.520432692307693e-06, + "loss": 0.8269, + "step": 2995 + }, + { + "epoch": 1.7477030771474404, + "grad_norm": 1.3761636018753052, + "learning_rate": 6.505408653846154e-06, + "loss": 0.8794, + "step": 2996 + }, + { + "epoch": 1.7482864226338048, + "grad_norm": 1.2299649715423584, + "learning_rate": 6.490384615384616e-06, + "loss": 0.9386, + "step": 2997 + }, + { + "epoch": 1.7488697681201693, + "grad_norm": 1.324949026107788, + "learning_rate": 6.475360576923077e-06, + "loss": 0.8732, + "step": 2998 + }, + { + "epoch": 1.7494531136065334, + "grad_norm": 1.2557692527770996, + "learning_rate": 6.46033653846154e-06, + "loss": 0.6888, + "step": 2999 + }, + { + "epoch": 1.7500364590928976, + "grad_norm": 1.2142000198364258, + "learning_rate": 6.4453125e-06, + "loss": 0.828, + "step": 3000 + }, + { + "epoch": 1.7500364590928976, + "eval_loss_squad": 0.852206126167439, + "eval_perplexity": 8.171628834408976, + "eval_perplexity_reconstruct": 1.8986877809494127, + "step": 3000 + }, + { + "epoch": 1.750619804579262, + "grad_norm": 1.2730783224105835, + "learning_rate": 6.430288461538461e-06, + "loss": 0.8912, + "step": 3001 + }, + { + "epoch": 1.7512031500656264, + "grad_norm": 1.2231379747390747, + "learning_rate": 6.415264423076924e-06, + "loss": 0.8677, + "step": 3002 + }, + { + "epoch": 1.7517864955519906, + "grad_norm": 1.3539924621582031, + "learning_rate": 6.400240384615385e-06, + "loss": 0.8291, + "step": 3003 + }, + { + "epoch": 1.7523698410383548, + "grad_norm": 1.3997881412506104, + "learning_rate": 6.385216346153847e-06, + "loss": 0.9718, + "step": 3004 + }, + { + "epoch": 1.7529531865247194, + "grad_norm": 1.2460448741912842, + "learning_rate": 6.370192307692308e-06, + "loss": 0.9308, + "step": 3005 + }, + { + "epoch": 1.7535365320110836, + "grad_norm": 1.063209056854248, + "learning_rate": 6.35516826923077e-06, + "loss": 0.637, + "step": 3006 + }, + { + "epoch": 1.7541198774974478, + "grad_norm": 1.0733965635299683, + "learning_rate": 6.340144230769231e-06, + "loss": 0.7837, + "step": 3007 + }, + { + "epoch": 1.7547032229838122, + "grad_norm": 1.167663335800171, + "learning_rate": 6.325120192307693e-06, + "loss": 0.8921, + "step": 3008 + }, + { + "epoch": 1.7552865684701766, + "grad_norm": 1.3398858308792114, + "learning_rate": 6.310096153846154e-06, + "loss": 0.7743, + "step": 3009 + }, + { + "epoch": 1.7558699139565408, + "grad_norm": 1.180702805519104, + "learning_rate": 6.295072115384616e-06, + "loss": 0.6499, + "step": 3010 + }, + { + "epoch": 1.756453259442905, + "grad_norm": 1.4334017038345337, + "learning_rate": 6.280048076923077e-06, + "loss": 0.6341, + "step": 3011 + }, + { + "epoch": 1.7570366049292694, + "grad_norm": 1.1912078857421875, + "learning_rate": 6.265024038461539e-06, + "loss": 0.6193, + "step": 3012 + }, + { + "epoch": 1.7576199504156338, + "grad_norm": 1.267592430114746, + "learning_rate": 6.25e-06, + "loss": 0.8952, + "step": 3013 + }, + { + "epoch": 1.758203295901998, + "grad_norm": 1.6982357501983643, + "learning_rate": 6.234975961538462e-06, + "loss": 0.8871, + "step": 3014 + }, + { + "epoch": 1.7587866413883622, + "grad_norm": 1.41012704372406, + "learning_rate": 6.219951923076923e-06, + "loss": 0.7593, + "step": 3015 + }, + { + "epoch": 1.7593699868747266, + "grad_norm": 1.222355842590332, + "learning_rate": 6.204927884615385e-06, + "loss": 0.8029, + "step": 3016 + }, + { + "epoch": 1.759953332361091, + "grad_norm": 1.1510112285614014, + "learning_rate": 6.189903846153847e-06, + "loss": 0.7361, + "step": 3017 + }, + { + "epoch": 1.7605366778474552, + "grad_norm": 1.1464331150054932, + "learning_rate": 6.174879807692308e-06, + "loss": 0.6445, + "step": 3018 + }, + { + "epoch": 1.7611200233338193, + "grad_norm": 1.6973276138305664, + "learning_rate": 6.15985576923077e-06, + "loss": 0.7339, + "step": 3019 + }, + { + "epoch": 1.7617033688201837, + "grad_norm": 1.2533224821090698, + "learning_rate": 6.144831730769231e-06, + "loss": 0.735, + "step": 3020 + }, + { + "epoch": 1.7622867143065482, + "grad_norm": 0.871324360370636, + "learning_rate": 6.129807692307692e-06, + "loss": 0.7549, + "step": 3021 + }, + { + "epoch": 1.7628700597929123, + "grad_norm": 1.1804686784744263, + "learning_rate": 6.114783653846154e-06, + "loss": 0.9045, + "step": 3022 + }, + { + "epoch": 1.7634534052792765, + "grad_norm": 1.0688856840133667, + "learning_rate": 6.099759615384616e-06, + "loss": 0.965, + "step": 3023 + }, + { + "epoch": 1.764036750765641, + "grad_norm": 1.29346764087677, + "learning_rate": 6.084735576923077e-06, + "loss": 0.8251, + "step": 3024 + }, + { + "epoch": 1.7646200962520053, + "grad_norm": 1.247200846672058, + "learning_rate": 6.069711538461539e-06, + "loss": 0.8057, + "step": 3025 + }, + { + "epoch": 1.7652034417383695, + "grad_norm": 1.3537094593048096, + "learning_rate": 6.054687500000001e-06, + "loss": 0.7938, + "step": 3026 + }, + { + "epoch": 1.7657867872247337, + "grad_norm": 1.2823100090026855, + "learning_rate": 6.039663461538462e-06, + "loss": 0.9466, + "step": 3027 + }, + { + "epoch": 1.766370132711098, + "grad_norm": 1.4946316480636597, + "learning_rate": 6.0246394230769236e-06, + "loss": 0.7986, + "step": 3028 + }, + { + "epoch": 1.7669534781974625, + "grad_norm": 1.1807023286819458, + "learning_rate": 6.0096153846153855e-06, + "loss": 0.7605, + "step": 3029 + }, + { + "epoch": 1.7675368236838267, + "grad_norm": 1.2204418182373047, + "learning_rate": 5.9945913461538465e-06, + "loss": 0.8484, + "step": 3030 + }, + { + "epoch": 1.7681201691701909, + "grad_norm": 0.9801056981086731, + "learning_rate": 5.979567307692308e-06, + "loss": 0.8015, + "step": 3031 + }, + { + "epoch": 1.7687035146565555, + "grad_norm": 1.2062498331069946, + "learning_rate": 5.9645432692307694e-06, + "loss": 0.7702, + "step": 3032 + }, + { + "epoch": 1.7692868601429197, + "grad_norm": 1.5518649816513062, + "learning_rate": 5.9495192307692305e-06, + "loss": 0.9168, + "step": 3033 + }, + { + "epoch": 1.7698702056292839, + "grad_norm": 1.196702480316162, + "learning_rate": 5.934495192307692e-06, + "loss": 0.7405, + "step": 3034 + }, + { + "epoch": 1.7704535511156483, + "grad_norm": 1.3550660610198975, + "learning_rate": 5.919471153846154e-06, + "loss": 0.8246, + "step": 3035 + }, + { + "epoch": 1.7710368966020127, + "grad_norm": 1.0379894971847534, + "learning_rate": 5.904447115384615e-06, + "loss": 0.7812, + "step": 3036 + }, + { + "epoch": 1.7716202420883769, + "grad_norm": 1.0726555585861206, + "learning_rate": 5.889423076923077e-06, + "loss": 1.0311, + "step": 3037 + }, + { + "epoch": 1.772203587574741, + "grad_norm": 1.0926289558410645, + "learning_rate": 5.874399038461539e-06, + "loss": 0.8649, + "step": 3038 + }, + { + "epoch": 1.7727869330611055, + "grad_norm": 1.2088254690170288, + "learning_rate": 5.859375e-06, + "loss": 0.6787, + "step": 3039 + }, + { + "epoch": 1.7733702785474699, + "grad_norm": 1.2711060047149658, + "learning_rate": 5.844350961538462e-06, + "loss": 0.6789, + "step": 3040 + }, + { + "epoch": 1.773953624033834, + "grad_norm": 1.2558777332305908, + "learning_rate": 5.829326923076923e-06, + "loss": 0.8561, + "step": 3041 + }, + { + "epoch": 1.7745369695201982, + "grad_norm": 1.1498215198516846, + "learning_rate": 5.814302884615385e-06, + "loss": 0.9378, + "step": 3042 + }, + { + "epoch": 1.7751203150065626, + "grad_norm": 1.2702573537826538, + "learning_rate": 5.799278846153847e-06, + "loss": 0.7744, + "step": 3043 + }, + { + "epoch": 1.775703660492927, + "grad_norm": 1.2708110809326172, + "learning_rate": 5.784254807692308e-06, + "loss": 0.8994, + "step": 3044 + }, + { + "epoch": 1.7762870059792912, + "grad_norm": 1.1213219165802002, + "learning_rate": 5.76923076923077e-06, + "loss": 0.5506, + "step": 3045 + }, + { + "epoch": 1.7768703514656554, + "grad_norm": 1.5344654321670532, + "learning_rate": 5.754206730769231e-06, + "loss": 0.8867, + "step": 3046 + }, + { + "epoch": 1.7774536969520198, + "grad_norm": 1.2419880628585815, + "learning_rate": 5.739182692307693e-06, + "loss": 0.7802, + "step": 3047 + }, + { + "epoch": 1.7780370424383842, + "grad_norm": 1.1512519121170044, + "learning_rate": 5.724158653846154e-06, + "loss": 0.6938, + "step": 3048 + }, + { + "epoch": 1.7786203879247484, + "grad_norm": 1.4187743663787842, + "learning_rate": 5.709134615384616e-06, + "loss": 0.7221, + "step": 3049 + }, + { + "epoch": 1.7792037334111126, + "grad_norm": 1.180336594581604, + "learning_rate": 5.694110576923077e-06, + "loss": 0.9465, + "step": 3050 + }, + { + "epoch": 1.779787078897477, + "grad_norm": 1.244598627090454, + "learning_rate": 5.6790865384615386e-06, + "loss": 0.7442, + "step": 3051 + }, + { + "epoch": 1.7803704243838414, + "grad_norm": 1.2341411113739014, + "learning_rate": 5.6640625000000005e-06, + "loss": 0.748, + "step": 3052 + }, + { + "epoch": 1.7809537698702056, + "grad_norm": 1.1170315742492676, + "learning_rate": 5.6490384615384615e-06, + "loss": 0.7429, + "step": 3053 + }, + { + "epoch": 1.7815371153565698, + "grad_norm": 1.3362447023391724, + "learning_rate": 5.634014423076923e-06, + "loss": 0.7197, + "step": 3054 + }, + { + "epoch": 1.7821204608429342, + "grad_norm": 1.4332809448242188, + "learning_rate": 5.618990384615385e-06, + "loss": 0.9911, + "step": 3055 + }, + { + "epoch": 1.7827038063292986, + "grad_norm": 1.2477706670761108, + "learning_rate": 5.603966346153846e-06, + "loss": 1.1378, + "step": 3056 + }, + { + "epoch": 1.7832871518156628, + "grad_norm": 1.2143217325210571, + "learning_rate": 5.588942307692308e-06, + "loss": 0.6478, + "step": 3057 + }, + { + "epoch": 1.7838704973020272, + "grad_norm": 1.2848975658416748, + "learning_rate": 5.57391826923077e-06, + "loss": 0.8657, + "step": 3058 + }, + { + "epoch": 1.7844538427883916, + "grad_norm": 1.3641600608825684, + "learning_rate": 5.55889423076923e-06, + "loss": 0.9904, + "step": 3059 + }, + { + "epoch": 1.7850371882747558, + "grad_norm": 1.3162271976470947, + "learning_rate": 5.543870192307692e-06, + "loss": 0.6504, + "step": 3060 + }, + { + "epoch": 1.78562053376112, + "grad_norm": 1.1452566385269165, + "learning_rate": 5.528846153846154e-06, + "loss": 0.8217, + "step": 3061 + }, + { + "epoch": 1.7862038792474844, + "grad_norm": 1.2015888690948486, + "learning_rate": 5.513822115384615e-06, + "loss": 0.9543, + "step": 3062 + }, + { + "epoch": 1.7867872247338488, + "grad_norm": 1.3449658155441284, + "learning_rate": 5.498798076923077e-06, + "loss": 0.8232, + "step": 3063 + }, + { + "epoch": 1.787370570220213, + "grad_norm": 1.3045237064361572, + "learning_rate": 5.483774038461539e-06, + "loss": 0.8037, + "step": 3064 + }, + { + "epoch": 1.7879539157065771, + "grad_norm": 1.1564418077468872, + "learning_rate": 5.46875e-06, + "loss": 0.7541, + "step": 3065 + }, + { + "epoch": 1.7885372611929415, + "grad_norm": 1.2231559753417969, + "learning_rate": 5.453725961538462e-06, + "loss": 0.7623, + "step": 3066 + }, + { + "epoch": 1.789120606679306, + "grad_norm": 1.238562822341919, + "learning_rate": 5.438701923076924e-06, + "loss": 0.8263, + "step": 3067 + }, + { + "epoch": 1.7897039521656701, + "grad_norm": 1.3414685726165771, + "learning_rate": 5.423677884615385e-06, + "loss": 0.8375, + "step": 3068 + }, + { + "epoch": 1.7902872976520343, + "grad_norm": 1.1576594114303589, + "learning_rate": 5.408653846153847e-06, + "loss": 0.8616, + "step": 3069 + }, + { + "epoch": 1.7908706431383987, + "grad_norm": 1.2848812341690063, + "learning_rate": 5.3936298076923085e-06, + "loss": 0.8936, + "step": 3070 + }, + { + "epoch": 1.7914539886247631, + "grad_norm": 1.2974978685379028, + "learning_rate": 5.37860576923077e-06, + "loss": 0.7183, + "step": 3071 + }, + { + "epoch": 1.7920373341111273, + "grad_norm": 1.0673803091049194, + "learning_rate": 5.363581730769231e-06, + "loss": 0.9361, + "step": 3072 + }, + { + "epoch": 1.7926206795974915, + "grad_norm": 1.156964659690857, + "learning_rate": 5.3485576923076925e-06, + "loss": 0.665, + "step": 3073 + }, + { + "epoch": 1.793204025083856, + "grad_norm": 1.2498539686203003, + "learning_rate": 5.3335336538461536e-06, + "loss": 0.683, + "step": 3074 + }, + { + "epoch": 1.7937873705702203, + "grad_norm": 1.0622308254241943, + "learning_rate": 5.3185096153846155e-06, + "loss": 0.7311, + "step": 3075 + }, + { + "epoch": 1.7943707160565845, + "grad_norm": 1.147762417793274, + "learning_rate": 5.303485576923077e-06, + "loss": 0.9415, + "step": 3076 + }, + { + "epoch": 1.7949540615429487, + "grad_norm": 1.6700465679168701, + "learning_rate": 5.288461538461538e-06, + "loss": 0.7959, + "step": 3077 + }, + { + "epoch": 1.795537407029313, + "grad_norm": 1.326983094215393, + "learning_rate": 5.2734375e-06, + "loss": 0.6907, + "step": 3078 + }, + { + "epoch": 1.7961207525156775, + "grad_norm": 1.2017778158187866, + "learning_rate": 5.258413461538462e-06, + "loss": 0.7195, + "step": 3079 + }, + { + "epoch": 1.7967040980020417, + "grad_norm": 1.3539180755615234, + "learning_rate": 5.243389423076923e-06, + "loss": 0.7981, + "step": 3080 + }, + { + "epoch": 1.7972874434884059, + "grad_norm": 1.202142357826233, + "learning_rate": 5.228365384615385e-06, + "loss": 0.8899, + "step": 3081 + }, + { + "epoch": 1.7978707889747703, + "grad_norm": 1.4579527378082275, + "learning_rate": 5.213341346153847e-06, + "loss": 0.7488, + "step": 3082 + }, + { + "epoch": 1.7984541344611347, + "grad_norm": 1.3240573406219482, + "learning_rate": 5.198317307692308e-06, + "loss": 0.6901, + "step": 3083 + }, + { + "epoch": 1.7990374799474989, + "grad_norm": 0.9925790429115295, + "learning_rate": 5.18329326923077e-06, + "loss": 0.7306, + "step": 3084 + }, + { + "epoch": 1.7996208254338633, + "grad_norm": 1.2076115608215332, + "learning_rate": 5.168269230769231e-06, + "loss": 0.7139, + "step": 3085 + }, + { + "epoch": 1.8002041709202277, + "grad_norm": 1.3439483642578125, + "learning_rate": 5.153245192307692e-06, + "loss": 0.7188, + "step": 3086 + }, + { + "epoch": 1.8007875164065918, + "grad_norm": 1.2917673587799072, + "learning_rate": 5.138221153846154e-06, + "loss": 0.8389, + "step": 3087 + }, + { + "epoch": 1.801370861892956, + "grad_norm": 1.1600747108459473, + "learning_rate": 5.123197115384616e-06, + "loss": 0.8402, + "step": 3088 + }, + { + "epoch": 1.8019542073793204, + "grad_norm": 1.4509084224700928, + "learning_rate": 5.108173076923077e-06, + "loss": 0.8959, + "step": 3089 + }, + { + "epoch": 1.8025375528656848, + "grad_norm": 1.1624780893325806, + "learning_rate": 5.093149038461539e-06, + "loss": 0.7454, + "step": 3090 + }, + { + "epoch": 1.803120898352049, + "grad_norm": 1.2352399826049805, + "learning_rate": 5.078125000000001e-06, + "loss": 0.8392, + "step": 3091 + }, + { + "epoch": 1.8037042438384132, + "grad_norm": 1.2602814435958862, + "learning_rate": 5.063100961538462e-06, + "loss": 0.6345, + "step": 3092 + }, + { + "epoch": 1.8042875893247776, + "grad_norm": 1.5061650276184082, + "learning_rate": 5.0480769230769235e-06, + "loss": 0.6893, + "step": 3093 + }, + { + "epoch": 1.804870934811142, + "grad_norm": 1.175838828086853, + "learning_rate": 5.0330528846153854e-06, + "loss": 0.7926, + "step": 3094 + }, + { + "epoch": 1.8054542802975062, + "grad_norm": 1.4317418336868286, + "learning_rate": 5.0180288461538465e-06, + "loss": 0.8662, + "step": 3095 + }, + { + "epoch": 1.8060376257838704, + "grad_norm": 1.0850125551223755, + "learning_rate": 5.003004807692308e-06, + "loss": 0.67, + "step": 3096 + }, + { + "epoch": 1.8066209712702348, + "grad_norm": 1.199704885482788, + "learning_rate": 4.987980769230769e-06, + "loss": 1.0953, + "step": 3097 + }, + { + "epoch": 1.8072043167565992, + "grad_norm": 1.1284786462783813, + "learning_rate": 4.9729567307692305e-06, + "loss": 0.8975, + "step": 3098 + }, + { + "epoch": 1.8077876622429634, + "grad_norm": 1.3013429641723633, + "learning_rate": 4.957932692307692e-06, + "loss": 0.7548, + "step": 3099 + }, + { + "epoch": 1.8083710077293276, + "grad_norm": 1.3402200937271118, + "learning_rate": 4.942908653846154e-06, + "loss": 0.7116, + "step": 3100 + }, + { + "epoch": 1.808954353215692, + "grad_norm": 1.1357074975967407, + "learning_rate": 4.927884615384615e-06, + "loss": 0.747, + "step": 3101 + }, + { + "epoch": 1.8095376987020564, + "grad_norm": 1.097558856010437, + "learning_rate": 4.912860576923077e-06, + "loss": 0.5583, + "step": 3102 + }, + { + "epoch": 1.8101210441884206, + "grad_norm": 1.726578712463379, + "learning_rate": 4.897836538461539e-06, + "loss": 0.9862, + "step": 3103 + }, + { + "epoch": 1.8107043896747848, + "grad_norm": 1.3363982439041138, + "learning_rate": 4.8828125e-06, + "loss": 0.7787, + "step": 3104 + }, + { + "epoch": 1.8112877351611492, + "grad_norm": 1.1213432550430298, + "learning_rate": 4.867788461538462e-06, + "loss": 0.9938, + "step": 3105 + }, + { + "epoch": 1.8118710806475136, + "grad_norm": 1.5810742378234863, + "learning_rate": 4.852764423076923e-06, + "loss": 0.8159, + "step": 3106 + }, + { + "epoch": 1.8124544261338777, + "grad_norm": 1.1917210817337036, + "learning_rate": 4.837740384615385e-06, + "loss": 0.8894, + "step": 3107 + }, + { + "epoch": 1.813037771620242, + "grad_norm": 1.112142562866211, + "learning_rate": 4.822716346153847e-06, + "loss": 0.7393, + "step": 3108 + }, + { + "epoch": 1.8136211171066063, + "grad_norm": 1.3024792671203613, + "learning_rate": 4.807692307692308e-06, + "loss": 0.7867, + "step": 3109 + }, + { + "epoch": 1.8142044625929707, + "grad_norm": 1.378227949142456, + "learning_rate": 4.79266826923077e-06, + "loss": 0.6839, + "step": 3110 + }, + { + "epoch": 1.814787808079335, + "grad_norm": 1.1621403694152832, + "learning_rate": 4.777644230769231e-06, + "loss": 0.7255, + "step": 3111 + }, + { + "epoch": 1.8153711535656993, + "grad_norm": 1.3222122192382812, + "learning_rate": 4.762620192307693e-06, + "loss": 0.7682, + "step": 3112 + }, + { + "epoch": 1.8159544990520637, + "grad_norm": 1.3286000490188599, + "learning_rate": 4.747596153846154e-06, + "loss": 0.7242, + "step": 3113 + }, + { + "epoch": 1.816537844538428, + "grad_norm": 1.1924617290496826, + "learning_rate": 4.732572115384616e-06, + "loss": 0.9124, + "step": 3114 + }, + { + "epoch": 1.817121190024792, + "grad_norm": 1.5701050758361816, + "learning_rate": 4.717548076923077e-06, + "loss": 0.9612, + "step": 3115 + }, + { + "epoch": 1.8177045355111565, + "grad_norm": 0.9868125319480896, + "learning_rate": 4.7025240384615385e-06, + "loss": 0.672, + "step": 3116 + }, + { + "epoch": 1.818287880997521, + "grad_norm": 1.411771535873413, + "learning_rate": 4.6875000000000004e-06, + "loss": 0.7319, + "step": 3117 + }, + { + "epoch": 1.818871226483885, + "grad_norm": 1.2565867900848389, + "learning_rate": 4.6724759615384615e-06, + "loss": 0.6905, + "step": 3118 + }, + { + "epoch": 1.8194545719702493, + "grad_norm": 1.527400016784668, + "learning_rate": 4.657451923076923e-06, + "loss": 0.7905, + "step": 3119 + }, + { + "epoch": 1.8200379174566137, + "grad_norm": 1.2910518646240234, + "learning_rate": 4.642427884615385e-06, + "loss": 0.7373, + "step": 3120 + }, + { + "epoch": 1.820621262942978, + "grad_norm": 1.4282567501068115, + "learning_rate": 4.627403846153846e-06, + "loss": 0.8244, + "step": 3121 + }, + { + "epoch": 1.8212046084293423, + "grad_norm": 1.2436426877975464, + "learning_rate": 4.612379807692308e-06, + "loss": 0.8656, + "step": 3122 + }, + { + "epoch": 1.8217879539157065, + "grad_norm": 1.2247263193130493, + "learning_rate": 4.59735576923077e-06, + "loss": 0.8739, + "step": 3123 + }, + { + "epoch": 1.8223712994020709, + "grad_norm": 1.359684705734253, + "learning_rate": 4.58233173076923e-06, + "loss": 0.6765, + "step": 3124 + }, + { + "epoch": 1.8229546448884353, + "grad_norm": 1.2492992877960205, + "learning_rate": 4.567307692307692e-06, + "loss": 0.6232, + "step": 3125 + }, + { + "epoch": 1.8235379903747995, + "grad_norm": 1.314278244972229, + "learning_rate": 4.552283653846154e-06, + "loss": 0.7352, + "step": 3126 + }, + { + "epoch": 1.8241213358611637, + "grad_norm": 1.5234570503234863, + "learning_rate": 4.537259615384615e-06, + "loss": 0.9334, + "step": 3127 + }, + { + "epoch": 1.824704681347528, + "grad_norm": 1.2035748958587646, + "learning_rate": 4.522235576923077e-06, + "loss": 0.8481, + "step": 3128 + }, + { + "epoch": 1.8252880268338925, + "grad_norm": 1.1306325197219849, + "learning_rate": 4.507211538461539e-06, + "loss": 0.7214, + "step": 3129 + }, + { + "epoch": 1.8258713723202566, + "grad_norm": 1.265943169593811, + "learning_rate": 4.4921875e-06, + "loss": 0.7674, + "step": 3130 + }, + { + "epoch": 1.8264547178066208, + "grad_norm": 1.22818124294281, + "learning_rate": 4.477163461538462e-06, + "loss": 0.9115, + "step": 3131 + }, + { + "epoch": 1.8270380632929852, + "grad_norm": 1.2925962209701538, + "learning_rate": 4.462139423076924e-06, + "loss": 1.0034, + "step": 3132 + }, + { + "epoch": 1.8276214087793496, + "grad_norm": 1.3289649486541748, + "learning_rate": 4.447115384615385e-06, + "loss": 0.8526, + "step": 3133 + }, + { + "epoch": 1.8282047542657138, + "grad_norm": 1.2596584558486938, + "learning_rate": 4.432091346153847e-06, + "loss": 0.8599, + "step": 3134 + }, + { + "epoch": 1.828788099752078, + "grad_norm": 1.5771028995513916, + "learning_rate": 4.4170673076923085e-06, + "loss": 0.7632, + "step": 3135 + }, + { + "epoch": 1.8293714452384424, + "grad_norm": 1.3541091680526733, + "learning_rate": 4.4020432692307696e-06, + "loss": 0.7655, + "step": 3136 + }, + { + "epoch": 1.8299547907248068, + "grad_norm": 1.2546583414077759, + "learning_rate": 4.387019230769231e-06, + "loss": 0.7144, + "step": 3137 + }, + { + "epoch": 1.830538136211171, + "grad_norm": 1.216031789779663, + "learning_rate": 4.3719951923076925e-06, + "loss": 0.8208, + "step": 3138 + }, + { + "epoch": 1.8311214816975354, + "grad_norm": 1.219329595565796, + "learning_rate": 4.3569711538461535e-06, + "loss": 0.5568, + "step": 3139 + }, + { + "epoch": 1.8317048271838998, + "grad_norm": 1.1087514162063599, + "learning_rate": 4.3419471153846154e-06, + "loss": 0.9815, + "step": 3140 + }, + { + "epoch": 1.832288172670264, + "grad_norm": 1.2218693494796753, + "learning_rate": 4.326923076923077e-06, + "loss": 0.7445, + "step": 3141 + }, + { + "epoch": 1.8328715181566282, + "grad_norm": 1.487666130065918, + "learning_rate": 4.311899038461538e-06, + "loss": 0.8892, + "step": 3142 + }, + { + "epoch": 1.8334548636429926, + "grad_norm": 1.117554783821106, + "learning_rate": 4.296875e-06, + "loss": 0.6522, + "step": 3143 + }, + { + "epoch": 1.834038209129357, + "grad_norm": 1.2432799339294434, + "learning_rate": 4.281850961538462e-06, + "loss": 0.7858, + "step": 3144 + }, + { + "epoch": 1.8346215546157212, + "grad_norm": 1.2413891553878784, + "learning_rate": 4.266826923076923e-06, + "loss": 0.6435, + "step": 3145 + }, + { + "epoch": 1.8352049001020854, + "grad_norm": 1.193045735359192, + "learning_rate": 4.251802884615385e-06, + "loss": 0.7756, + "step": 3146 + }, + { + "epoch": 1.8357882455884498, + "grad_norm": 1.2209383249282837, + "learning_rate": 4.236778846153847e-06, + "loss": 0.7637, + "step": 3147 + }, + { + "epoch": 1.8363715910748142, + "grad_norm": 1.1595642566680908, + "learning_rate": 4.221754807692308e-06, + "loss": 0.7416, + "step": 3148 + }, + { + "epoch": 1.8369549365611784, + "grad_norm": 1.1345254182815552, + "learning_rate": 4.20673076923077e-06, + "loss": 0.4896, + "step": 3149 + }, + { + "epoch": 1.8375382820475425, + "grad_norm": 1.1374866962432861, + "learning_rate": 4.191706730769231e-06, + "loss": 0.8506, + "step": 3150 + }, + { + "epoch": 1.838121627533907, + "grad_norm": 1.126652717590332, + "learning_rate": 4.176682692307692e-06, + "loss": 0.7785, + "step": 3151 + }, + { + "epoch": 1.8387049730202714, + "grad_norm": 1.1542102098464966, + "learning_rate": 4.161658653846154e-06, + "loss": 0.6363, + "step": 3152 + }, + { + "epoch": 1.8392883185066355, + "grad_norm": 1.0646016597747803, + "learning_rate": 4.146634615384616e-06, + "loss": 0.6941, + "step": 3153 + }, + { + "epoch": 1.8398716639929997, + "grad_norm": 1.2552902698516846, + "learning_rate": 4.131610576923077e-06, + "loss": 0.7249, + "step": 3154 + }, + { + "epoch": 1.8404550094793641, + "grad_norm": 1.0623902082443237, + "learning_rate": 4.116586538461539e-06, + "loss": 0.7205, + "step": 3155 + }, + { + "epoch": 1.8410383549657285, + "grad_norm": 1.0725420713424683, + "learning_rate": 4.101562500000001e-06, + "loss": 0.7775, + "step": 3156 + }, + { + "epoch": 1.8416217004520927, + "grad_norm": 1.095056176185608, + "learning_rate": 4.086538461538462e-06, + "loss": 0.6987, + "step": 3157 + }, + { + "epoch": 1.842205045938457, + "grad_norm": 1.199759840965271, + "learning_rate": 4.0715144230769235e-06, + "loss": 0.7286, + "step": 3158 + }, + { + "epoch": 1.8427883914248213, + "grad_norm": 0.9884356260299683, + "learning_rate": 4.0564903846153846e-06, + "loss": 0.7607, + "step": 3159 + }, + { + "epoch": 1.8433717369111857, + "grad_norm": 1.2776890993118286, + "learning_rate": 4.0414663461538465e-06, + "loss": 0.6219, + "step": 3160 + }, + { + "epoch": 1.84395508239755, + "grad_norm": 1.3608481884002686, + "learning_rate": 4.026442307692308e-06, + "loss": 0.9211, + "step": 3161 + }, + { + "epoch": 1.844538427883914, + "grad_norm": 1.205276608467102, + "learning_rate": 4.011418269230769e-06, + "loss": 0.6703, + "step": 3162 + }, + { + "epoch": 1.8451217733702785, + "grad_norm": 1.4303468465805054, + "learning_rate": 3.9963942307692304e-06, + "loss": 0.7663, + "step": 3163 + }, + { + "epoch": 1.845705118856643, + "grad_norm": 1.1754591464996338, + "learning_rate": 3.981370192307692e-06, + "loss": 0.7512, + "step": 3164 + }, + { + "epoch": 1.846288464343007, + "grad_norm": 1.1947484016418457, + "learning_rate": 3.966346153846154e-06, + "loss": 0.715, + "step": 3165 + }, + { + "epoch": 1.8468718098293715, + "grad_norm": 1.4016227722167969, + "learning_rate": 3.951322115384615e-06, + "loss": 0.9545, + "step": 3166 + }, + { + "epoch": 1.847455155315736, + "grad_norm": 1.25657057762146, + "learning_rate": 3.936298076923077e-06, + "loss": 0.8419, + "step": 3167 + }, + { + "epoch": 1.8480385008021, + "grad_norm": 1.1799569129943848, + "learning_rate": 3.921274038461538e-06, + "loss": 0.9247, + "step": 3168 + }, + { + "epoch": 1.8486218462884643, + "grad_norm": 1.4153069257736206, + "learning_rate": 3.90625e-06, + "loss": 0.8692, + "step": 3169 + }, + { + "epoch": 1.8492051917748287, + "grad_norm": 1.103832483291626, + "learning_rate": 3.891225961538462e-06, + "loss": 0.7473, + "step": 3170 + }, + { + "epoch": 1.849788537261193, + "grad_norm": 1.3749737739562988, + "learning_rate": 3.876201923076923e-06, + "loss": 0.669, + "step": 3171 + }, + { + "epoch": 1.8503718827475573, + "grad_norm": 1.2396540641784668, + "learning_rate": 3.861177884615385e-06, + "loss": 0.7441, + "step": 3172 + }, + { + "epoch": 1.8509552282339214, + "grad_norm": 1.2196747064590454, + "learning_rate": 3.846153846153847e-06, + "loss": 0.7092, + "step": 3173 + }, + { + "epoch": 1.8515385737202859, + "grad_norm": 1.1058050394058228, + "learning_rate": 3.831129807692308e-06, + "loss": 0.6491, + "step": 3174 + }, + { + "epoch": 1.8521219192066503, + "grad_norm": 1.1973541975021362, + "learning_rate": 3.81610576923077e-06, + "loss": 0.8188, + "step": 3175 + }, + { + "epoch": 1.8527052646930144, + "grad_norm": 1.4277015924453735, + "learning_rate": 3.8010817307692308e-06, + "loss": 0.87, + "step": 3176 + }, + { + "epoch": 1.8532886101793786, + "grad_norm": 1.5026133060455322, + "learning_rate": 3.7860576923076922e-06, + "loss": 0.864, + "step": 3177 + }, + { + "epoch": 1.853871955665743, + "grad_norm": 1.1631428003311157, + "learning_rate": 3.7710336538461537e-06, + "loss": 0.8488, + "step": 3178 + }, + { + "epoch": 1.8544553011521074, + "grad_norm": 1.1146588325500488, + "learning_rate": 3.7560096153846156e-06, + "loss": 0.7239, + "step": 3179 + }, + { + "epoch": 1.8550386466384716, + "grad_norm": 1.4882702827453613, + "learning_rate": 3.740985576923077e-06, + "loss": 0.723, + "step": 3180 + }, + { + "epoch": 1.8556219921248358, + "grad_norm": 1.207821011543274, + "learning_rate": 3.7259615384615385e-06, + "loss": 0.6462, + "step": 3181 + }, + { + "epoch": 1.8562053376112002, + "grad_norm": 1.1613391637802124, + "learning_rate": 3.7109375e-06, + "loss": 0.9761, + "step": 3182 + }, + { + "epoch": 1.8567886830975646, + "grad_norm": 1.2488850355148315, + "learning_rate": 3.695913461538462e-06, + "loss": 0.9367, + "step": 3183 + }, + { + "epoch": 1.8573720285839288, + "grad_norm": 1.1730568408966064, + "learning_rate": 3.6808894230769233e-06, + "loss": 0.7909, + "step": 3184 + }, + { + "epoch": 1.857955374070293, + "grad_norm": 1.4373273849487305, + "learning_rate": 3.665865384615385e-06, + "loss": 0.824, + "step": 3185 + }, + { + "epoch": 1.8585387195566574, + "grad_norm": 1.7913013696670532, + "learning_rate": 3.6508413461538467e-06, + "loss": 0.8291, + "step": 3186 + }, + { + "epoch": 1.8591220650430218, + "grad_norm": 1.2750526666641235, + "learning_rate": 3.635817307692308e-06, + "loss": 0.9025, + "step": 3187 + }, + { + "epoch": 1.859705410529386, + "grad_norm": 1.5024226903915405, + "learning_rate": 3.6207932692307696e-06, + "loss": 0.643, + "step": 3188 + }, + { + "epoch": 1.8602887560157502, + "grad_norm": 1.1984363794326782, + "learning_rate": 3.6057692307692307e-06, + "loss": 0.8266, + "step": 3189 + }, + { + "epoch": 1.8608721015021146, + "grad_norm": 1.3658881187438965, + "learning_rate": 3.590745192307692e-06, + "loss": 0.8933, + "step": 3190 + }, + { + "epoch": 1.861455446988479, + "grad_norm": 1.5781782865524292, + "learning_rate": 3.5757211538461536e-06, + "loss": 0.9418, + "step": 3191 + }, + { + "epoch": 1.8620387924748432, + "grad_norm": 1.1617451906204224, + "learning_rate": 3.5606971153846155e-06, + "loss": 0.5586, + "step": 3192 + }, + { + "epoch": 1.8626221379612076, + "grad_norm": 1.3859959840774536, + "learning_rate": 3.545673076923077e-06, + "loss": 0.971, + "step": 3193 + }, + { + "epoch": 1.863205483447572, + "grad_norm": 1.154800534248352, + "learning_rate": 3.5306490384615384e-06, + "loss": 0.9459, + "step": 3194 + }, + { + "epoch": 1.8637888289339362, + "grad_norm": 1.2615742683410645, + "learning_rate": 3.5156250000000003e-06, + "loss": 0.8425, + "step": 3195 + }, + { + "epoch": 1.8643721744203003, + "grad_norm": 1.4125624895095825, + "learning_rate": 3.500600961538462e-06, + "loss": 0.8873, + "step": 3196 + }, + { + "epoch": 1.8649555199066647, + "grad_norm": 1.2564436197280884, + "learning_rate": 3.4855769230769233e-06, + "loss": 0.8137, + "step": 3197 + }, + { + "epoch": 1.8655388653930292, + "grad_norm": 1.1753637790679932, + "learning_rate": 3.470552884615385e-06, + "loss": 0.8264, + "step": 3198 + }, + { + "epoch": 1.8661222108793933, + "grad_norm": 1.0851452350616455, + "learning_rate": 3.4555288461538466e-06, + "loss": 0.7682, + "step": 3199 + }, + { + "epoch": 1.8667055563657575, + "grad_norm": 1.3882149457931519, + "learning_rate": 3.440504807692308e-06, + "loss": 0.723, + "step": 3200 + }, + { + "epoch": 1.8667055563657575, + "eval_loss_squad": 0.8567332937405445, + "eval_perplexity": 8.266156372908618, + "eval_perplexity_reconstruct": 1.8969144391866655, + "step": 3200 + }, + { + "epoch": 1.867288901852122, + "grad_norm": 1.3030450344085693, + "learning_rate": 3.4254807692307695e-06, + "loss": 0.8131, + "step": 3201 + }, + { + "epoch": 1.8678722473384863, + "grad_norm": 1.364791989326477, + "learning_rate": 3.4104567307692306e-06, + "loss": 0.8387, + "step": 3202 + }, + { + "epoch": 1.8684555928248505, + "grad_norm": 1.1007466316223145, + "learning_rate": 3.395432692307692e-06, + "loss": 0.745, + "step": 3203 + }, + { + "epoch": 1.8690389383112147, + "grad_norm": 1.2074437141418457, + "learning_rate": 3.380408653846154e-06, + "loss": 0.7523, + "step": 3204 + }, + { + "epoch": 1.869622283797579, + "grad_norm": 1.4134937524795532, + "learning_rate": 3.3653846153846154e-06, + "loss": 0.7514, + "step": 3205 + }, + { + "epoch": 1.8702056292839435, + "grad_norm": 1.2706308364868164, + "learning_rate": 3.350360576923077e-06, + "loss": 0.9497, + "step": 3206 + }, + { + "epoch": 1.8707889747703077, + "grad_norm": 1.3989231586456299, + "learning_rate": 3.3353365384615388e-06, + "loss": 0.8504, + "step": 3207 + }, + { + "epoch": 1.8713723202566719, + "grad_norm": 1.2219780683517456, + "learning_rate": 3.3203125000000002e-06, + "loss": 0.7592, + "step": 3208 + }, + { + "epoch": 1.8719556657430363, + "grad_norm": 1.3430885076522827, + "learning_rate": 3.3052884615384617e-06, + "loss": 0.8517, + "step": 3209 + }, + { + "epoch": 1.8725390112294007, + "grad_norm": 1.190152883529663, + "learning_rate": 3.290264423076923e-06, + "loss": 0.6381, + "step": 3210 + }, + { + "epoch": 1.8731223567157649, + "grad_norm": 1.4218403100967407, + "learning_rate": 3.275240384615385e-06, + "loss": 0.7845, + "step": 3211 + }, + { + "epoch": 1.873705702202129, + "grad_norm": 1.5346728563308716, + "learning_rate": 3.2602163461538465e-06, + "loss": 0.6876, + "step": 3212 + }, + { + "epoch": 1.8742890476884935, + "grad_norm": 1.3545806407928467, + "learning_rate": 3.245192307692308e-06, + "loss": 0.7361, + "step": 3213 + }, + { + "epoch": 1.8748723931748579, + "grad_norm": 1.127886176109314, + "learning_rate": 3.23016826923077e-06, + "loss": 1.1837, + "step": 3214 + }, + { + "epoch": 1.875455738661222, + "grad_norm": 1.2765008211135864, + "learning_rate": 3.2151442307692305e-06, + "loss": 0.7988, + "step": 3215 + }, + { + "epoch": 1.8760390841475862, + "grad_norm": 1.3014349937438965, + "learning_rate": 3.2001201923076924e-06, + "loss": 0.8047, + "step": 3216 + }, + { + "epoch": 1.8766224296339509, + "grad_norm": 1.284070372581482, + "learning_rate": 3.185096153846154e-06, + "loss": 0.9213, + "step": 3217 + }, + { + "epoch": 1.877205775120315, + "grad_norm": 1.1256341934204102, + "learning_rate": 3.1700721153846153e-06, + "loss": 0.9295, + "step": 3218 + }, + { + "epoch": 1.8777891206066792, + "grad_norm": 1.0452816486358643, + "learning_rate": 3.155048076923077e-06, + "loss": 0.7096, + "step": 3219 + }, + { + "epoch": 1.8783724660930436, + "grad_norm": 1.4434984922409058, + "learning_rate": 3.1400240384615387e-06, + "loss": 0.906, + "step": 3220 + }, + { + "epoch": 1.878955811579408, + "grad_norm": 1.309296727180481, + "learning_rate": 3.125e-06, + "loss": 0.8231, + "step": 3221 + }, + { + "epoch": 1.8795391570657722, + "grad_norm": 1.0300049781799316, + "learning_rate": 3.1099759615384616e-06, + "loss": 0.8218, + "step": 3222 + }, + { + "epoch": 1.8801225025521364, + "grad_norm": 1.3848848342895508, + "learning_rate": 3.0949519230769235e-06, + "loss": 0.6832, + "step": 3223 + }, + { + "epoch": 1.8807058480385008, + "grad_norm": 1.3134764432907104, + "learning_rate": 3.079927884615385e-06, + "loss": 0.7673, + "step": 3224 + }, + { + "epoch": 1.8812891935248652, + "grad_norm": 1.3645648956298828, + "learning_rate": 3.064903846153846e-06, + "loss": 0.8301, + "step": 3225 + }, + { + "epoch": 1.8818725390112294, + "grad_norm": 1.2801153659820557, + "learning_rate": 3.049879807692308e-06, + "loss": 0.8582, + "step": 3226 + }, + { + "epoch": 1.8824558844975936, + "grad_norm": 1.3583933115005493, + "learning_rate": 3.0348557692307694e-06, + "loss": 0.8457, + "step": 3227 + }, + { + "epoch": 1.883039229983958, + "grad_norm": 1.4235001802444458, + "learning_rate": 3.019831730769231e-06, + "loss": 0.8377, + "step": 3228 + }, + { + "epoch": 1.8836225754703224, + "grad_norm": 1.4059243202209473, + "learning_rate": 3.0048076923076927e-06, + "loss": 0.7966, + "step": 3229 + }, + { + "epoch": 1.8842059209566866, + "grad_norm": 1.2851868867874146, + "learning_rate": 2.989783653846154e-06, + "loss": 0.655, + "step": 3230 + }, + { + "epoch": 1.8847892664430508, + "grad_norm": 1.42172110080719, + "learning_rate": 2.9747596153846152e-06, + "loss": 0.774, + "step": 3231 + }, + { + "epoch": 1.8853726119294152, + "grad_norm": 1.1469957828521729, + "learning_rate": 2.959735576923077e-06, + "loss": 0.6521, + "step": 3232 + }, + { + "epoch": 1.8859559574157796, + "grad_norm": 1.7705121040344238, + "learning_rate": 2.9447115384615386e-06, + "loss": 0.9157, + "step": 3233 + }, + { + "epoch": 1.8865393029021438, + "grad_norm": 1.3892390727996826, + "learning_rate": 2.9296875e-06, + "loss": 0.846, + "step": 3234 + }, + { + "epoch": 1.887122648388508, + "grad_norm": 1.1506258249282837, + "learning_rate": 2.9146634615384615e-06, + "loss": 0.8809, + "step": 3235 + }, + { + "epoch": 1.8877059938748724, + "grad_norm": 1.0249956846237183, + "learning_rate": 2.8996394230769234e-06, + "loss": 0.7122, + "step": 3236 + }, + { + "epoch": 1.8882893393612368, + "grad_norm": 1.3840585947036743, + "learning_rate": 2.884615384615385e-06, + "loss": 0.8086, + "step": 3237 + }, + { + "epoch": 1.888872684847601, + "grad_norm": 1.3755419254302979, + "learning_rate": 2.8695913461538464e-06, + "loss": 0.9811, + "step": 3238 + }, + { + "epoch": 1.8894560303339651, + "grad_norm": 1.3516442775726318, + "learning_rate": 2.854567307692308e-06, + "loss": 0.7445, + "step": 3239 + }, + { + "epoch": 1.8900393758203295, + "grad_norm": 1.0328466892242432, + "learning_rate": 2.8395432692307693e-06, + "loss": 0.6279, + "step": 3240 + }, + { + "epoch": 1.890622721306694, + "grad_norm": 1.1680728197097778, + "learning_rate": 2.8245192307692307e-06, + "loss": 0.7424, + "step": 3241 + }, + { + "epoch": 1.8912060667930581, + "grad_norm": 1.1502512693405151, + "learning_rate": 2.8094951923076926e-06, + "loss": 0.8519, + "step": 3242 + }, + { + "epoch": 1.8917894122794223, + "grad_norm": 1.0622332096099854, + "learning_rate": 2.794471153846154e-06, + "loss": 0.6469, + "step": 3243 + }, + { + "epoch": 1.892372757765787, + "grad_norm": 1.0960564613342285, + "learning_rate": 2.779447115384615e-06, + "loss": 0.8083, + "step": 3244 + }, + { + "epoch": 1.8929561032521511, + "grad_norm": 1.226771593093872, + "learning_rate": 2.764423076923077e-06, + "loss": 0.6511, + "step": 3245 + }, + { + "epoch": 1.8935394487385153, + "grad_norm": 1.045430302619934, + "learning_rate": 2.7493990384615385e-06, + "loss": 0.9077, + "step": 3246 + }, + { + "epoch": 1.8941227942248797, + "grad_norm": 1.3713380098342896, + "learning_rate": 2.734375e-06, + "loss": 0.9468, + "step": 3247 + }, + { + "epoch": 1.8947061397112441, + "grad_norm": 1.352195143699646, + "learning_rate": 2.719350961538462e-06, + "loss": 1.0285, + "step": 3248 + }, + { + "epoch": 1.8952894851976083, + "grad_norm": 1.1400947570800781, + "learning_rate": 2.7043269230769233e-06, + "loss": 0.8711, + "step": 3249 + }, + { + "epoch": 1.8958728306839725, + "grad_norm": 1.3555407524108887, + "learning_rate": 2.689302884615385e-06, + "loss": 0.9186, + "step": 3250 + }, + { + "epoch": 1.896456176170337, + "grad_norm": 1.185897946357727, + "learning_rate": 2.6742788461538463e-06, + "loss": 0.7208, + "step": 3251 + }, + { + "epoch": 1.8970395216567013, + "grad_norm": 1.1496164798736572, + "learning_rate": 2.6592548076923077e-06, + "loss": 0.8495, + "step": 3252 + }, + { + "epoch": 1.8976228671430655, + "grad_norm": 1.4104948043823242, + "learning_rate": 2.644230769230769e-06, + "loss": 0.9402, + "step": 3253 + }, + { + "epoch": 1.8982062126294297, + "grad_norm": 1.2339621782302856, + "learning_rate": 2.629206730769231e-06, + "loss": 0.8283, + "step": 3254 + }, + { + "epoch": 1.898789558115794, + "grad_norm": 1.3827406167984009, + "learning_rate": 2.6141826923076926e-06, + "loss": 0.8392, + "step": 3255 + }, + { + "epoch": 1.8993729036021585, + "grad_norm": 1.5171111822128296, + "learning_rate": 2.599158653846154e-06, + "loss": 0.7742, + "step": 3256 + }, + { + "epoch": 1.8999562490885227, + "grad_norm": 1.340565800666809, + "learning_rate": 2.5841346153846155e-06, + "loss": 0.7106, + "step": 3257 + }, + { + "epoch": 1.9005395945748869, + "grad_norm": 1.0620558261871338, + "learning_rate": 2.569110576923077e-06, + "loss": 0.6781, + "step": 3258 + }, + { + "epoch": 1.9011229400612513, + "grad_norm": 1.4068678617477417, + "learning_rate": 2.5540865384615384e-06, + "loss": 0.6035, + "step": 3259 + }, + { + "epoch": 1.9017062855476157, + "grad_norm": 1.356840968132019, + "learning_rate": 2.5390625000000003e-06, + "loss": 0.8417, + "step": 3260 + }, + { + "epoch": 1.9022896310339799, + "grad_norm": 1.0584261417388916, + "learning_rate": 2.5240384615384618e-06, + "loss": 0.844, + "step": 3261 + }, + { + "epoch": 1.902872976520344, + "grad_norm": 1.1918290853500366, + "learning_rate": 2.5090144230769232e-06, + "loss": 0.8733, + "step": 3262 + }, + { + "epoch": 1.9034563220067084, + "grad_norm": 1.2692493200302124, + "learning_rate": 2.4939903846153847e-06, + "loss": 0.9443, + "step": 3263 + }, + { + "epoch": 1.9040396674930729, + "grad_norm": 1.1847847700119019, + "learning_rate": 2.478966346153846e-06, + "loss": 0.7985, + "step": 3264 + }, + { + "epoch": 1.904623012979437, + "grad_norm": 1.3458268642425537, + "learning_rate": 2.4639423076923076e-06, + "loss": 0.7175, + "step": 3265 + }, + { + "epoch": 1.9052063584658012, + "grad_norm": 1.0749139785766602, + "learning_rate": 2.4489182692307695e-06, + "loss": 0.7089, + "step": 3266 + }, + { + "epoch": 1.9057897039521656, + "grad_norm": 1.1422741413116455, + "learning_rate": 2.433894230769231e-06, + "loss": 0.7655, + "step": 3267 + }, + { + "epoch": 1.90637304943853, + "grad_norm": 1.0878541469573975, + "learning_rate": 2.4188701923076925e-06, + "loss": 0.7034, + "step": 3268 + }, + { + "epoch": 1.9069563949248942, + "grad_norm": 1.0475382804870605, + "learning_rate": 2.403846153846154e-06, + "loss": 0.8075, + "step": 3269 + }, + { + "epoch": 1.9075397404112586, + "grad_norm": 1.3869905471801758, + "learning_rate": 2.3888221153846154e-06, + "loss": 0.8407, + "step": 3270 + }, + { + "epoch": 1.908123085897623, + "grad_norm": 1.221571445465088, + "learning_rate": 2.373798076923077e-06, + "loss": 0.9829, + "step": 3271 + }, + { + "epoch": 1.9087064313839872, + "grad_norm": 1.1875793933868408, + "learning_rate": 2.3587740384615383e-06, + "loss": 0.7063, + "step": 3272 + }, + { + "epoch": 1.9092897768703514, + "grad_norm": 1.3426343202590942, + "learning_rate": 2.3437500000000002e-06, + "loss": 0.8446, + "step": 3273 + }, + { + "epoch": 1.9098731223567158, + "grad_norm": 1.2949109077453613, + "learning_rate": 2.3287259615384617e-06, + "loss": 0.7361, + "step": 3274 + }, + { + "epoch": 1.9104564678430802, + "grad_norm": 1.5180912017822266, + "learning_rate": 2.313701923076923e-06, + "loss": 0.8648, + "step": 3275 + }, + { + "epoch": 1.9110398133294444, + "grad_norm": 1.1568496227264404, + "learning_rate": 2.298677884615385e-06, + "loss": 0.7235, + "step": 3276 + }, + { + "epoch": 1.9116231588158086, + "grad_norm": 1.4971147775650024, + "learning_rate": 2.283653846153846e-06, + "loss": 0.5042, + "step": 3277 + }, + { + "epoch": 1.912206504302173, + "grad_norm": 1.248701572418213, + "learning_rate": 2.2686298076923076e-06, + "loss": 0.6988, + "step": 3278 + }, + { + "epoch": 1.9127898497885374, + "grad_norm": 1.498721718788147, + "learning_rate": 2.2536057692307694e-06, + "loss": 0.7654, + "step": 3279 + }, + { + "epoch": 1.9133731952749016, + "grad_norm": 1.3017774820327759, + "learning_rate": 2.238581730769231e-06, + "loss": 0.8718, + "step": 3280 + }, + { + "epoch": 1.9139565407612658, + "grad_norm": 1.3298211097717285, + "learning_rate": 2.2235576923076924e-06, + "loss": 0.8205, + "step": 3281 + }, + { + "epoch": 1.9145398862476302, + "grad_norm": 1.2720110416412354, + "learning_rate": 2.2085336538461543e-06, + "loss": 0.7552, + "step": 3282 + }, + { + "epoch": 1.9151232317339946, + "grad_norm": 1.2517110109329224, + "learning_rate": 2.1935096153846153e-06, + "loss": 0.8019, + "step": 3283 + }, + { + "epoch": 1.9157065772203588, + "grad_norm": 1.2890255451202393, + "learning_rate": 2.1784855769230768e-06, + "loss": 0.7499, + "step": 3284 + }, + { + "epoch": 1.916289922706723, + "grad_norm": 1.0850515365600586, + "learning_rate": 2.1634615384615387e-06, + "loss": 0.8125, + "step": 3285 + }, + { + "epoch": 1.9168732681930873, + "grad_norm": 1.2969878911972046, + "learning_rate": 2.1484375e-06, + "loss": 0.7481, + "step": 3286 + }, + { + "epoch": 1.9174566136794517, + "grad_norm": 1.0845204591751099, + "learning_rate": 2.1334134615384616e-06, + "loss": 0.8015, + "step": 3287 + }, + { + "epoch": 1.918039959165816, + "grad_norm": 1.371109962463379, + "learning_rate": 2.1183894230769235e-06, + "loss": 0.7493, + "step": 3288 + }, + { + "epoch": 1.9186233046521801, + "grad_norm": 1.1624324321746826, + "learning_rate": 2.103365384615385e-06, + "loss": 0.7489, + "step": 3289 + }, + { + "epoch": 1.9192066501385445, + "grad_norm": 1.3240567445755005, + "learning_rate": 2.088341346153846e-06, + "loss": 0.847, + "step": 3290 + }, + { + "epoch": 1.919789995624909, + "grad_norm": 1.2875831127166748, + "learning_rate": 2.073317307692308e-06, + "loss": 1.0138, + "step": 3291 + }, + { + "epoch": 1.9203733411112731, + "grad_norm": 1.2602109909057617, + "learning_rate": 2.0582932692307694e-06, + "loss": 0.9535, + "step": 3292 + }, + { + "epoch": 1.9209566865976373, + "grad_norm": 1.1849132776260376, + "learning_rate": 2.043269230769231e-06, + "loss": 0.7271, + "step": 3293 + }, + { + "epoch": 1.9215400320840017, + "grad_norm": 1.2771323919296265, + "learning_rate": 2.0282451923076923e-06, + "loss": 0.9094, + "step": 3294 + }, + { + "epoch": 1.922123377570366, + "grad_norm": 1.2452751398086548, + "learning_rate": 2.013221153846154e-06, + "loss": 0.8227, + "step": 3295 + }, + { + "epoch": 1.9227067230567303, + "grad_norm": 1.2949745655059814, + "learning_rate": 1.9981971153846152e-06, + "loss": 0.776, + "step": 3296 + }, + { + "epoch": 1.9232900685430947, + "grad_norm": 1.4040955305099487, + "learning_rate": 1.983173076923077e-06, + "loss": 0.7911, + "step": 3297 + }, + { + "epoch": 1.923873414029459, + "grad_norm": 1.2795517444610596, + "learning_rate": 1.9681490384615386e-06, + "loss": 0.7292, + "step": 3298 + }, + { + "epoch": 1.9244567595158233, + "grad_norm": 1.1882306337356567, + "learning_rate": 1.953125e-06, + "loss": 0.8261, + "step": 3299 + }, + { + "epoch": 1.9250401050021875, + "grad_norm": 1.011581540107727, + "learning_rate": 1.9381009615384615e-06, + "loss": 0.5729, + "step": 3300 + }, + { + "epoch": 1.9256234504885519, + "grad_norm": 1.0311501026153564, + "learning_rate": 1.9230769230769234e-06, + "loss": 0.8459, + "step": 3301 + }, + { + "epoch": 1.9262067959749163, + "grad_norm": 1.1920017004013062, + "learning_rate": 1.908052884615385e-06, + "loss": 0.9399, + "step": 3302 + }, + { + "epoch": 1.9267901414612805, + "grad_norm": 1.0624799728393555, + "learning_rate": 1.8930288461538461e-06, + "loss": 0.6593, + "step": 3303 + }, + { + "epoch": 1.9273734869476447, + "grad_norm": 1.4519715309143066, + "learning_rate": 1.8780048076923078e-06, + "loss": 0.7391, + "step": 3304 + }, + { + "epoch": 1.927956832434009, + "grad_norm": 1.2700550556182861, + "learning_rate": 1.8629807692307693e-06, + "loss": 0.9545, + "step": 3305 + }, + { + "epoch": 1.9285401779203735, + "grad_norm": 1.4977349042892456, + "learning_rate": 1.847956730769231e-06, + "loss": 0.7466, + "step": 3306 + }, + { + "epoch": 1.9291235234067377, + "grad_norm": 1.3640085458755493, + "learning_rate": 1.8329326923076924e-06, + "loss": 0.6418, + "step": 3307 + }, + { + "epoch": 1.9297068688931018, + "grad_norm": 1.5322325229644775, + "learning_rate": 1.817908653846154e-06, + "loss": 0.8667, + "step": 3308 + }, + { + "epoch": 1.9302902143794662, + "grad_norm": 1.1606767177581787, + "learning_rate": 1.8028846153846153e-06, + "loss": 0.618, + "step": 3309 + }, + { + "epoch": 1.9308735598658306, + "grad_norm": 1.2893586158752441, + "learning_rate": 1.7878605769230768e-06, + "loss": 0.6867, + "step": 3310 + }, + { + "epoch": 1.9314569053521948, + "grad_norm": 1.2085678577423096, + "learning_rate": 1.7728365384615385e-06, + "loss": 0.6753, + "step": 3311 + }, + { + "epoch": 1.932040250838559, + "grad_norm": 1.0935242176055908, + "learning_rate": 1.7578125000000002e-06, + "loss": 0.6407, + "step": 3312 + }, + { + "epoch": 1.9326235963249234, + "grad_norm": 1.3033195734024048, + "learning_rate": 1.7427884615384616e-06, + "loss": 0.7061, + "step": 3313 + }, + { + "epoch": 1.9332069418112878, + "grad_norm": 1.1596806049346924, + "learning_rate": 1.7277644230769233e-06, + "loss": 0.6813, + "step": 3314 + }, + { + "epoch": 1.933790287297652, + "grad_norm": 1.1415847539901733, + "learning_rate": 1.7127403846153848e-06, + "loss": 0.8781, + "step": 3315 + }, + { + "epoch": 1.9343736327840162, + "grad_norm": 1.3523577451705933, + "learning_rate": 1.697716346153846e-06, + "loss": 0.854, + "step": 3316 + }, + { + "epoch": 1.9349569782703806, + "grad_norm": 1.3228445053100586, + "learning_rate": 1.6826923076923077e-06, + "loss": 0.7102, + "step": 3317 + }, + { + "epoch": 1.935540323756745, + "grad_norm": 1.152653455734253, + "learning_rate": 1.6676682692307694e-06, + "loss": 0.8008, + "step": 3318 + }, + { + "epoch": 1.9361236692431092, + "grad_norm": 1.4329392910003662, + "learning_rate": 1.6526442307692309e-06, + "loss": 0.9733, + "step": 3319 + }, + { + "epoch": 1.9367070147294734, + "grad_norm": 1.1904407739639282, + "learning_rate": 1.6376201923076925e-06, + "loss": 0.7539, + "step": 3320 + }, + { + "epoch": 1.9372903602158378, + "grad_norm": 1.1539748907089233, + "learning_rate": 1.622596153846154e-06, + "loss": 0.8373, + "step": 3321 + }, + { + "epoch": 1.9378737057022022, + "grad_norm": 1.211431860923767, + "learning_rate": 1.6075721153846153e-06, + "loss": 0.7206, + "step": 3322 + }, + { + "epoch": 1.9384570511885664, + "grad_norm": 1.0680336952209473, + "learning_rate": 1.592548076923077e-06, + "loss": 0.8083, + "step": 3323 + }, + { + "epoch": 1.9390403966749308, + "grad_norm": 1.3347814083099365, + "learning_rate": 1.5775240384615384e-06, + "loss": 0.7336, + "step": 3324 + }, + { + "epoch": 1.9396237421612952, + "grad_norm": 1.1900486946105957, + "learning_rate": 1.5625e-06, + "loss": 0.8278, + "step": 3325 + }, + { + "epoch": 1.9402070876476594, + "grad_norm": 1.2089030742645264, + "learning_rate": 1.5474759615384618e-06, + "loss": 0.6548, + "step": 3326 + }, + { + "epoch": 1.9407904331340236, + "grad_norm": 1.1675139665603638, + "learning_rate": 1.532451923076923e-06, + "loss": 0.8378, + "step": 3327 + }, + { + "epoch": 1.941373778620388, + "grad_norm": 1.2819353342056274, + "learning_rate": 1.5174278846153847e-06, + "loss": 0.8911, + "step": 3328 + }, + { + "epoch": 1.9419571241067524, + "grad_norm": 1.1306663751602173, + "learning_rate": 1.5024038461538464e-06, + "loss": 0.8226, + "step": 3329 + }, + { + "epoch": 1.9425404695931165, + "grad_norm": 1.330399990081787, + "learning_rate": 1.4873798076923076e-06, + "loss": 0.7923, + "step": 3330 + }, + { + "epoch": 1.9431238150794807, + "grad_norm": 1.2952775955200195, + "learning_rate": 1.4723557692307693e-06, + "loss": 0.8788, + "step": 3331 + }, + { + "epoch": 1.9437071605658451, + "grad_norm": 1.1201375722885132, + "learning_rate": 1.4573317307692308e-06, + "loss": 0.5674, + "step": 3332 + }, + { + "epoch": 1.9442905060522095, + "grad_norm": 1.1273674964904785, + "learning_rate": 1.4423076923076924e-06, + "loss": 0.8559, + "step": 3333 + }, + { + "epoch": 1.9448738515385737, + "grad_norm": 1.462166428565979, + "learning_rate": 1.427283653846154e-06, + "loss": 0.774, + "step": 3334 + }, + { + "epoch": 1.945457197024938, + "grad_norm": 1.3099924325942993, + "learning_rate": 1.4122596153846154e-06, + "loss": 0.6735, + "step": 3335 + }, + { + "epoch": 1.9460405425113023, + "grad_norm": 1.3009566068649292, + "learning_rate": 1.397235576923077e-06, + "loss": 0.813, + "step": 3336 + }, + { + "epoch": 1.9466238879976667, + "grad_norm": 1.3090664148330688, + "learning_rate": 1.3822115384615385e-06, + "loss": 0.8125, + "step": 3337 + }, + { + "epoch": 1.947207233484031, + "grad_norm": 1.452247977256775, + "learning_rate": 1.3671875e-06, + "loss": 0.8553, + "step": 3338 + }, + { + "epoch": 1.947790578970395, + "grad_norm": 1.308634877204895, + "learning_rate": 1.3521634615384617e-06, + "loss": 0.7305, + "step": 3339 + }, + { + "epoch": 1.9483739244567595, + "grad_norm": 1.4182379245758057, + "learning_rate": 1.3371394230769231e-06, + "loss": 0.6694, + "step": 3340 + }, + { + "epoch": 1.948957269943124, + "grad_norm": 1.5073226690292358, + "learning_rate": 1.3221153846153846e-06, + "loss": 0.7478, + "step": 3341 + }, + { + "epoch": 1.949540615429488, + "grad_norm": 1.1056112051010132, + "learning_rate": 1.3070913461538463e-06, + "loss": 0.9675, + "step": 3342 + }, + { + "epoch": 1.9501239609158523, + "grad_norm": 1.3071699142456055, + "learning_rate": 1.2920673076923077e-06, + "loss": 0.9027, + "step": 3343 + }, + { + "epoch": 1.9507073064022167, + "grad_norm": 1.2924457788467407, + "learning_rate": 1.2770432692307692e-06, + "loss": 0.9082, + "step": 3344 + }, + { + "epoch": 1.951290651888581, + "grad_norm": 1.27849280834198, + "learning_rate": 1.2620192307692309e-06, + "loss": 0.8798, + "step": 3345 + }, + { + "epoch": 1.9518739973749453, + "grad_norm": 1.2095881700515747, + "learning_rate": 1.2469951923076924e-06, + "loss": 0.9073, + "step": 3346 + }, + { + "epoch": 1.9524573428613095, + "grad_norm": 1.2283076047897339, + "learning_rate": 1.2319711538461538e-06, + "loss": 0.6455, + "step": 3347 + }, + { + "epoch": 1.9530406883476739, + "grad_norm": 1.241592526435852, + "learning_rate": 1.2169471153846155e-06, + "loss": 0.7726, + "step": 3348 + }, + { + "epoch": 1.9536240338340383, + "grad_norm": 1.2830148935317993, + "learning_rate": 1.201923076923077e-06, + "loss": 0.7911, + "step": 3349 + }, + { + "epoch": 1.9542073793204024, + "grad_norm": 1.2363495826721191, + "learning_rate": 1.1868990384615384e-06, + "loss": 0.7075, + "step": 3350 + }, + { + "epoch": 1.9547907248067669, + "grad_norm": 1.2319027185440063, + "learning_rate": 1.1718750000000001e-06, + "loss": 0.8239, + "step": 3351 + }, + { + "epoch": 1.9553740702931313, + "grad_norm": 1.2770158052444458, + "learning_rate": 1.1568509615384616e-06, + "loss": 0.6991, + "step": 3352 + }, + { + "epoch": 1.9559574157794954, + "grad_norm": 1.1271159648895264, + "learning_rate": 1.141826923076923e-06, + "loss": 0.8684, + "step": 3353 + }, + { + "epoch": 1.9565407612658596, + "grad_norm": 1.2182567119598389, + "learning_rate": 1.1268028846153847e-06, + "loss": 0.8585, + "step": 3354 + }, + { + "epoch": 1.957124106752224, + "grad_norm": 1.2966188192367554, + "learning_rate": 1.1117788461538462e-06, + "loss": 0.7743, + "step": 3355 + }, + { + "epoch": 1.9577074522385884, + "grad_norm": 1.308058261871338, + "learning_rate": 1.0967548076923077e-06, + "loss": 0.7612, + "step": 3356 + }, + { + "epoch": 1.9582907977249526, + "grad_norm": 1.1872565746307373, + "learning_rate": 1.0817307692307693e-06, + "loss": 0.9376, + "step": 3357 + }, + { + "epoch": 1.9588741432113168, + "grad_norm": 1.1720060110092163, + "learning_rate": 1.0667067307692308e-06, + "loss": 0.689, + "step": 3358 + }, + { + "epoch": 1.9594574886976812, + "grad_norm": 1.202436923980713, + "learning_rate": 1.0516826923076925e-06, + "loss": 0.8309, + "step": 3359 + }, + { + "epoch": 1.9600408341840456, + "grad_norm": 1.271864414215088, + "learning_rate": 1.036658653846154e-06, + "loss": 0.8216, + "step": 3360 + }, + { + "epoch": 1.9606241796704098, + "grad_norm": 1.501631259918213, + "learning_rate": 1.0216346153846154e-06, + "loss": 0.9438, + "step": 3361 + }, + { + "epoch": 1.961207525156774, + "grad_norm": 1.3683483600616455, + "learning_rate": 1.006610576923077e-06, + "loss": 0.8276, + "step": 3362 + }, + { + "epoch": 1.9617908706431384, + "grad_norm": 1.202343225479126, + "learning_rate": 9.915865384615386e-07, + "loss": 0.682, + "step": 3363 + }, + { + "epoch": 1.9623742161295028, + "grad_norm": 1.0876096487045288, + "learning_rate": 9.765625e-07, + "loss": 0.7981, + "step": 3364 + }, + { + "epoch": 1.962957561615867, + "grad_norm": 1.5290825366973877, + "learning_rate": 9.615384615384617e-07, + "loss": 0.9378, + "step": 3365 + }, + { + "epoch": 1.9635409071022312, + "grad_norm": 1.267568588256836, + "learning_rate": 9.465144230769231e-07, + "loss": 0.7676, + "step": 3366 + }, + { + "epoch": 1.9641242525885956, + "grad_norm": 1.1771818399429321, + "learning_rate": 9.314903846153846e-07, + "loss": 0.6243, + "step": 3367 + }, + { + "epoch": 1.96470759807496, + "grad_norm": 1.4345792531967163, + "learning_rate": 9.164663461538462e-07, + "loss": 0.7595, + "step": 3368 + }, + { + "epoch": 1.9652909435613242, + "grad_norm": 2.9055094718933105, + "learning_rate": 9.014423076923077e-07, + "loss": 0.8694, + "step": 3369 + }, + { + "epoch": 1.9658742890476884, + "grad_norm": 1.2087029218673706, + "learning_rate": 8.864182692307692e-07, + "loss": 0.8892, + "step": 3370 + }, + { + "epoch": 1.9664576345340528, + "grad_norm": 1.1844966411590576, + "learning_rate": 8.713942307692308e-07, + "loss": 0.9044, + "step": 3371 + }, + { + "epoch": 1.9670409800204172, + "grad_norm": 1.4420499801635742, + "learning_rate": 8.563701923076924e-07, + "loss": 0.7416, + "step": 3372 + }, + { + "epoch": 1.9676243255067813, + "grad_norm": 1.2633183002471924, + "learning_rate": 8.413461538461539e-07, + "loss": 0.8739, + "step": 3373 + }, + { + "epoch": 1.9682076709931455, + "grad_norm": 1.2410845756530762, + "learning_rate": 8.263221153846154e-07, + "loss": 0.9586, + "step": 3374 + }, + { + "epoch": 1.96879101647951, + "grad_norm": 1.3219760656356812, + "learning_rate": 8.11298076923077e-07, + "loss": 0.8433, + "step": 3375 + }, + { + "epoch": 1.9693743619658743, + "grad_norm": 1.2453432083129883, + "learning_rate": 7.962740384615385e-07, + "loss": 0.6931, + "step": 3376 + }, + { + "epoch": 1.9699577074522385, + "grad_norm": 1.3527343273162842, + "learning_rate": 7.8125e-07, + "loss": 0.8611, + "step": 3377 + }, + { + "epoch": 1.970541052938603, + "grad_norm": 1.1058260202407837, + "learning_rate": 7.662259615384615e-07, + "loss": 0.9306, + "step": 3378 + }, + { + "epoch": 1.9711243984249673, + "grad_norm": 1.070041537284851, + "learning_rate": 7.512019230769232e-07, + "loss": 0.8113, + "step": 3379 + }, + { + "epoch": 1.9717077439113315, + "grad_norm": 2.1356186866760254, + "learning_rate": 7.361778846153846e-07, + "loss": 0.7269, + "step": 3380 + }, + { + "epoch": 1.9722910893976957, + "grad_norm": 1.2367362976074219, + "learning_rate": 7.211538461538462e-07, + "loss": 0.8238, + "step": 3381 + }, + { + "epoch": 1.9728744348840601, + "grad_norm": 1.401431679725647, + "learning_rate": 7.061298076923077e-07, + "loss": 0.6869, + "step": 3382 + }, + { + "epoch": 1.9734577803704245, + "grad_norm": 1.330978512763977, + "learning_rate": 6.911057692307693e-07, + "loss": 0.7662, + "step": 3383 + }, + { + "epoch": 1.9740411258567887, + "grad_norm": 1.0215189456939697, + "learning_rate": 6.760817307692308e-07, + "loss": 0.7518, + "step": 3384 + }, + { + "epoch": 1.9746244713431529, + "grad_norm": 1.2366408109664917, + "learning_rate": 6.610576923076923e-07, + "loss": 0.9705, + "step": 3385 + }, + { + "epoch": 1.9752078168295173, + "grad_norm": 1.478739857673645, + "learning_rate": 6.460336538461539e-07, + "loss": 0.691, + "step": 3386 + }, + { + "epoch": 1.9757911623158817, + "grad_norm": 1.1755001544952393, + "learning_rate": 6.310096153846154e-07, + "loss": 0.589, + "step": 3387 + }, + { + "epoch": 1.9763745078022459, + "grad_norm": 1.2783706188201904, + "learning_rate": 6.159855769230769e-07, + "loss": 0.7802, + "step": 3388 + }, + { + "epoch": 1.97695785328861, + "grad_norm": 1.1876357793807983, + "learning_rate": 6.009615384615385e-07, + "loss": 0.7924, + "step": 3389 + }, + { + "epoch": 1.9775411987749745, + "grad_norm": 1.159026026725769, + "learning_rate": 5.859375000000001e-07, + "loss": 0.7157, + "step": 3390 + }, + { + "epoch": 1.9781245442613389, + "grad_norm": 1.2553104162216187, + "learning_rate": 5.709134615384615e-07, + "loss": 0.7791, + "step": 3391 + }, + { + "epoch": 1.978707889747703, + "grad_norm": 1.2597992420196533, + "learning_rate": 5.558894230769231e-07, + "loss": 0.6958, + "step": 3392 + }, + { + "epoch": 1.9792912352340672, + "grad_norm": 1.3527934551239014, + "learning_rate": 5.408653846153847e-07, + "loss": 0.6869, + "step": 3393 + }, + { + "epoch": 1.9798745807204317, + "grad_norm": 1.4447914361953735, + "learning_rate": 5.258413461538462e-07, + "loss": 0.6232, + "step": 3394 + }, + { + "epoch": 1.980457926206796, + "grad_norm": 1.3879870176315308, + "learning_rate": 5.108173076923077e-07, + "loss": 0.7473, + "step": 3395 + }, + { + "epoch": 1.9810412716931602, + "grad_norm": 1.2205619812011719, + "learning_rate": 4.957932692307693e-07, + "loss": 0.8624, + "step": 3396 + }, + { + "epoch": 1.9816246171795244, + "grad_norm": 1.3446601629257202, + "learning_rate": 4.807692307692308e-07, + "loss": 0.6787, + "step": 3397 + }, + { + "epoch": 1.9822079626658888, + "grad_norm": 1.372126579284668, + "learning_rate": 4.657451923076923e-07, + "loss": 0.658, + "step": 3398 + }, + { + "epoch": 1.9827913081522532, + "grad_norm": 1.195761799812317, + "learning_rate": 4.5072115384615384e-07, + "loss": 0.8341, + "step": 3399 + }, + { + "epoch": 1.9833746536386174, + "grad_norm": 1.2088286876678467, + "learning_rate": 4.356971153846154e-07, + "loss": 0.6969, + "step": 3400 + }, + { + "epoch": 1.9833746536386174, + "eval_loss_squad": 0.8548139879200608, + "eval_perplexity": 8.228127154549277, + "eval_perplexity_reconstruct": 1.8982905574950977, + "step": 3400 + }, + { + "epoch": 1.9839579991249816, + "grad_norm": 1.5069819688796997, + "learning_rate": 4.2067307692307693e-07, + "loss": 0.9078, + "step": 3401 + }, + { + "epoch": 1.984541344611346, + "grad_norm": 1.4215034246444702, + "learning_rate": 4.056490384615385e-07, + "loss": 0.7893, + "step": 3402 + }, + { + "epoch": 1.9851246900977104, + "grad_norm": 1.417895793914795, + "learning_rate": 3.90625e-07, + "loss": 0.705, + "step": 3403 + }, + { + "epoch": 1.9857080355840746, + "grad_norm": 1.336230754852295, + "learning_rate": 3.756009615384616e-07, + "loss": 0.8535, + "step": 3404 + }, + { + "epoch": 1.986291381070439, + "grad_norm": 1.057672142982483, + "learning_rate": 3.605769230769231e-07, + "loss": 0.7968, + "step": 3405 + }, + { + "epoch": 1.9868747265568034, + "grad_norm": 1.924058198928833, + "learning_rate": 3.4555288461538463e-07, + "loss": 0.8941, + "step": 3406 + }, + { + "epoch": 1.9874580720431676, + "grad_norm": 1.2566702365875244, + "learning_rate": 3.3052884615384615e-07, + "loss": 0.9751, + "step": 3407 + }, + { + "epoch": 1.9880414175295318, + "grad_norm": 1.2347677946090698, + "learning_rate": 3.155048076923077e-07, + "loss": 0.9065, + "step": 3408 + }, + { + "epoch": 1.9886247630158962, + "grad_norm": 1.202021837234497, + "learning_rate": 3.0048076923076924e-07, + "loss": 1.108, + "step": 3409 + }, + { + "epoch": 1.9892081085022606, + "grad_norm": 1.1871834993362427, + "learning_rate": 2.8545673076923076e-07, + "loss": 0.7417, + "step": 3410 + }, + { + "epoch": 1.9897914539886248, + "grad_norm": 1.482557773590088, + "learning_rate": 2.7043269230769233e-07, + "loss": 0.8867, + "step": 3411 + }, + { + "epoch": 1.990374799474989, + "grad_norm": 1.1281570196151733, + "learning_rate": 2.5540865384615385e-07, + "loss": 0.7128, + "step": 3412 + }, + { + "epoch": 1.9909581449613534, + "grad_norm": 1.2373967170715332, + "learning_rate": 2.403846153846154e-07, + "loss": 0.9376, + "step": 3413 + }, + { + "epoch": 1.9915414904477178, + "grad_norm": 1.0291107892990112, + "learning_rate": 2.2536057692307692e-07, + "loss": 0.6016, + "step": 3414 + }, + { + "epoch": 1.992124835934082, + "grad_norm": 1.2457456588745117, + "learning_rate": 2.1033653846153846e-07, + "loss": 0.7507, + "step": 3415 + }, + { + "epoch": 1.9927081814204461, + "grad_norm": 1.7160253524780273, + "learning_rate": 1.953125e-07, + "loss": 0.6249, + "step": 3416 + }, + { + "epoch": 1.9932915269068106, + "grad_norm": 1.178257703781128, + "learning_rate": 1.8028846153846156e-07, + "loss": 0.9351, + "step": 3417 + }, + { + "epoch": 1.993874872393175, + "grad_norm": 1.1656147241592407, + "learning_rate": 1.6526442307692307e-07, + "loss": 0.7854, + "step": 3418 + }, + { + "epoch": 1.9944582178795391, + "grad_norm": 1.1849141120910645, + "learning_rate": 1.5024038461538462e-07, + "loss": 0.7907, + "step": 3419 + }, + { + "epoch": 1.9950415633659033, + "grad_norm": 1.764297366142273, + "learning_rate": 1.3521634615384617e-07, + "loss": 0.7752, + "step": 3420 + }, + { + "epoch": 1.9956249088522677, + "grad_norm": 1.2948668003082275, + "learning_rate": 1.201923076923077e-07, + "loss": 0.7394, + "step": 3421 + }, + { + "epoch": 1.9962082543386321, + "grad_norm": 1.251250982284546, + "learning_rate": 1.0516826923076923e-07, + "loss": 0.7703, + "step": 3422 + }, + { + "epoch": 1.9967915998249963, + "grad_norm": 1.1011062860488892, + "learning_rate": 9.014423076923078e-08, + "loss": 0.7541, + "step": 3423 + }, + { + "epoch": 1.9973749453113605, + "grad_norm": 1.1622389554977417, + "learning_rate": 7.512019230769231e-08, + "loss": 0.8255, + "step": 3424 + }, + { + "epoch": 1.997958290797725, + "grad_norm": 1.1566705703735352, + "learning_rate": 6.009615384615386e-08, + "loss": 0.8762, + "step": 3425 + }, + { + "epoch": 1.9985416362840893, + "grad_norm": 1.2555816173553467, + "learning_rate": 4.507211538461539e-08, + "loss": 0.7081, + "step": 3426 + }, + { + "epoch": 1.9991249817704535, + "grad_norm": 1.1134846210479736, + "learning_rate": 3.004807692307693e-08, + "loss": 0.7877, + "step": 3427 + }, + { + "epoch": 1.9997083272568177, + "grad_norm": 1.113712191581726, + "learning_rate": 1.5024038461538464e-08, + "loss": 0.9928, + "step": 3428 + } + ], + "logging_steps": 1.0, + "max_steps": 3428, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 652, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6980351498544218e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}